diff options
-rw-r--r-- | Documentation/cgroups/blkio-controller.txt | 24 | ||||
-rw-r--r-- | Documentation/cgroups/unified-hierarchy.txt | 61 | ||||
-rw-r--r-- | block/bio.c | 2 | ||||
-rw-r--r-- | block/blk-cgroup.c | 524 | ||||
-rw-r--r-- | block/blk-core.c | 4 | ||||
-rw-r--r-- | block/blk-throttle.c | 505 | ||||
-rw-r--r-- | block/blk.h | 5 | ||||
-rw-r--r-- | block/cfq-iosched.c | 651 | ||||
-rw-r--r-- | fs/fs-writeback.c | 139 | ||||
-rw-r--r-- | fs/kernfs/dir.c | 23 | ||||
-rw-r--r-- | include/linux/backing-dev.h | 26 | ||||
-rw-r--r-- | include/linux/blk-cgroup.h | 340 | ||||
-rw-r--r-- | include/linux/cgroup_subsys.h | 2 | ||||
-rw-r--r-- | include/linux/kernfs.h | 4 | ||||
-rw-r--r-- | include/trace/events/writeback.h | 180 | ||||
-rw-r--r-- | mm/backing-dev.c | 4 | ||||
-rw-r--r-- | mm/page-writeback.c | 6 |
17 files changed, 1422 insertions, 1078 deletions
diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index 68b6a6a470b0..12686bec37b9 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt @@ -201,7 +201,7 @@ Proportional weight policy files specifies the number of bytes. - blkio.io_serviced - - Number of IOs completed to/from the disk by the group. These + - Number of IOs (bio) issued to the disk by the group. These are further divided by the type of operation - read or write, sync or async. First two fields specify the major and minor number of the device, third field specifies the operation type and the fourth field @@ -327,18 +327,11 @@ Note: If both BW and IOPS rules are specified for a device, then IO is subjected to both the constraints. - blkio.throttle.io_serviced - - Number of IOs (bio) completed to/from the disk by the group (as - seen by throttling policy). These are further divided by the type - of operation - read or write, sync or async. First two fields specify - the major and minor number of the device, third field specifies the - operation type and the fourth field specifies the number of IOs. - - blkio.io_serviced does accounting as seen by CFQ and counts are in - number of requests (struct request). On the other hand, - blkio.throttle.io_serviced counts number of IO in terms of number - of bios as seen by throttling policy. These bios can later be - merged by elevator and total number of requests completed can be - lesser. + - Number of IOs (bio) issued to the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of IOs. - blkio.throttle.io_service_bytes - Number of bytes transferred to/from the disk by the group. 
These @@ -347,11 +340,6 @@ Note: If both BW and IOPS rules are specified for a device, then IO is device, third field specifies the operation type and the fourth field specifies the number of bytes. - These numbers should roughly be same as blkio.io_service_bytes as - updated by CFQ. The difference between two is that - blkio.io_service_bytes will not be updated if CFQ is not operating - on request queue. - Common files among various policies ----------------------------------- - blkio.reset_stats diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt index 1ee9caf29e57..e0975c2cf03d 100644 --- a/Documentation/cgroups/unified-hierarchy.txt +++ b/Documentation/cgroups/unified-hierarchy.txt @@ -27,7 +27,7 @@ CONTENTS 5-3-1. Format 5-3-2. Control Knobs 5-4. Per-Controller Changes - 5-4-1. blkio + 5-4-1. io 5-4-2. cpuset 5-4-3. memory 6. Planned Changes @@ -203,7 +203,7 @@ other issues. The mapping from nice level to weight isn't obvious or universal, and there are various other knobs which simply aren't available for tasks. -The blkio controller implicitly creates a hidden leaf node for each +The io controller implicitly creates a hidden leaf node for each cgroup to host the tasks. The hidden leaf has its own copies of all the knobs with "leaf_" prefixed. While this allows equivalent control over internal tasks, it's with serious drawbacks. It always adds an @@ -438,9 +438,62 @@ may be specified in any order and not all pairs have to be specified. 5-4. Per-Controller Changes -5-4-1. blkio +5-4-1. io -- blk-throttle becomes properly hierarchical. +- blkio is renamed to io. The interface is overhauled anyway. The + new name is more in line with the other two major controllers, cpu + and memory, and better suited given that it may be used for cgroup + writeback without involving block layer. 
+ +- Everything including stat is always hierarchical making separate + recursive stat files pointless and, as no internal node can have + tasks, leaf weights are meaningless. The operation model is + simplified and the interface is overhauled accordingly. + + io.stat + + The stat file. The reported stats are from the point where + bio's are issued to request_queue. The stats are counted + independent of which policies are enabled. Each line in the + file follows the following format. More fields may later be + added at the end. + + $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wios=$WIOS + + io.weight + + The weight setting, currently only available and effective if + cfq-iosched is in use for the target device. The weight is + between 1 and 10000 and defaults to 100. The first line + always contains the default weight in the following format to + use when per-device setting is missing. + + default $WEIGHT + + Subsequent lines list per-device weights of the following + format. + + $MAJ:$MIN $WEIGHT + + Writing "$WEIGHT" or "default $WEIGHT" changes the default + setting. Writing "$MAJ:$MIN $WEIGHT" sets per-device weight + while "$MAJ:$MIN default" clears it. + + This file is available only on non-root cgroups. + + io.max + + The maximum bandwidth and/or iops setting, only available if + blk-throttle is enabled. The file is of the following format. + + $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS + + ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are + read/write IOs per second. "max" indicates no limit. Writing + to the file follows the same format but the individual + settings may be omitted or specified in any order. + + This file is available only on non-root cgroups. 5-4-2.
cpuset diff --git a/block/bio.c b/block/bio.c index 515b5434fe2d..ad3f276d74bc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1990,7 +1990,7 @@ int bio_associate_current(struct bio *bio) get_io_context_active(ioc); bio->bi_ioc = ioc; - bio->bi_css = task_get_css(current, blkio_cgrp_id); + bio->bi_css = task_get_css(current, io_cgrp_id); return 0; } EXPORT_SYMBOL_GPL(bio_associate_current); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d6283b3f5db5..ac8370cb2515 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -24,6 +24,7 @@ #include <linux/genhd.h> #include <linux/delay.h> #include <linux/atomic.h> +#include <linux/ctype.h> #include <linux/blk-cgroup.h> #include "blk.h" @@ -68,9 +69,14 @@ static void blkg_free(struct blkcg_gq *blkg) return; for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkg->pd[i]); + if (blkg->pd[i]) + blkcg_policy[i]->pd_free_fn(blkg->pd[i]); - blk_exit_rl(&blkg->rl); + if (blkg->blkcg != &blkcg_root) + blk_exit_rl(&blkg->rl); + + blkg_rwstat_exit(&blkg->stat_ios); + blkg_rwstat_exit(&blkg->stat_bytes); kfree(blkg); } @@ -93,6 +99,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (!blkg) return NULL; + if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || + blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) + goto err_free; + blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; @@ -113,7 +123,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, continue; /* alloc per-policy data and attach it to blkg */ - pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); + pd = pol->pd_alloc_fn(gfp_mask, q->node); if (!pd) goto err_free; @@ -129,26 +139,11 @@ err_free: return NULL; } -/** - * __blkg_lookup - internal version of blkg_lookup() - * @blkcg: blkcg of interest - * @q: request_queue of interest - * @update_hint: whether to update lookup hint with the result or not - * - * This is internal version and shouldn't be used by policy - * implementations. 
Looks up blkgs for the @blkcg - @q pair regardless of - * @q's bypass state. If @update_hint is %true, the caller should be - * holding @q->queue_lock and lookup hint is updated on success. - */ -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint) +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint) { struct blkcg_gq *blkg; - blkg = rcu_dereference(blkcg->blkg_hint); - if (blkg && blkg->q == q) - return blkg; - /* * Hint didn't match. Look up from the radix tree. Note that the * hint can only be updated under queue_lock as otherwise @blkg @@ -166,29 +161,11 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, return NULL; } - -/** - * blkg_lookup - lookup blkg for the specified blkcg - q pair - * @blkcg: blkcg of interest - * @q: request_queue of interest - * - * Lookup blkg for the @blkcg - @q pair. This function should be called - * under RCU read lock and is guaranteed to return %NULL if @q is bypassing - * - see blk_queue_bypass_start() for details. - */ -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - if (unlikely(blk_queue_bypass(q))) - return NULL; - return __blkg_lookup(blkcg, q, false); -} -EXPORT_SYMBOL_GPL(blkg_lookup); +EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); /* * If @new_blkg is %NULL, this function tries to allocate a new one as - * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. + * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. 
*/ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct request_queue *q, @@ -203,12 +180,12 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* blkg holds a reference to blkcg */ if (!css_tryget_online(&blkcg->css)) { - ret = -EINVAL; + ret = -ENODEV; goto err_free_blkg; } wb_congested = wb_congested_get_create(&q->backing_dev_info, - blkcg->css.id, GFP_ATOMIC); + blkcg->css.id, GFP_NOWAIT); if (!wb_congested) { ret = -ENOMEM; goto err_put_css; @@ -216,7 +193,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); + new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_congested; @@ -229,7 +206,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { - ret = -EINVAL; + ret = -ENODEV; goto err_put_congested; } blkg_get(blkg->parent); @@ -240,7 +217,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_init_fn) - pol->pd_init_fn(blkg); + pol->pd_init_fn(blkg->pd[i]); } /* insert */ @@ -254,7 +231,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_online_fn) - pol->pd_online_fn(blkg); + pol->pd_online_fn(blkg->pd[i]); } } blkg->online = true; @@ -303,7 +280,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, * we shouldn't allow anything to go through for a bypassing queue. */ if (unlikely(blk_queue_bypass(q))) - return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); + return ERR_PTR(blk_queue_dying(q) ? 
-ENODEV : -EBUSY); blkg = __blkg_lookup(blkcg, q, true); if (blkg) @@ -327,11 +304,11 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, return blkg; } } -EXPORT_SYMBOL_GPL(blkg_lookup_create); static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; + struct blkcg_gq *parent = blkg->parent; int i; lockdep_assert_held(blkg->q->queue_lock); @@ -345,8 +322,14 @@ static void blkg_destroy(struct blkcg_gq *blkg) struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_offline_fn) - pol->pd_offline_fn(blkg); + pol->pd_offline_fn(blkg->pd[i]); + } + + if (parent) { + blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes); + blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios); } + blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); @@ -400,15 +383,6 @@ static void blkg_destroy_all(struct request_queue *q) void __blkg_release_rcu(struct rcu_head *rcu_head) { struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); - int i; - - /* tell policies that this one is being freed */ - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - - if (blkg->pd[i] && pol->pd_exit_fn) - pol->pd_exit_fn(blkg); - } /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); @@ -472,12 +446,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, * anyway. If you get hit by a race, retry. 
*/ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + blkg_rwstat_reset(&blkg->stat_bytes); + blkg_rwstat_reset(&blkg->stat_ios); + for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - if (blkcg_policy_enabled(blkg->q, pol) && - pol->pd_reset_stats_fn) - pol->pd_reset_stats_fn(blkg); + if (blkg->pd[i] && pol->pd_reset_stats_fn) + pol->pd_reset_stats_fn(blkg->pd[i]); } } @@ -486,13 +462,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, return 0; } -static const char *blkg_dev_name(struct blkcg_gq *blkg) +const char *blkg_dev_name(struct blkcg_gq *blkg) { /* some drivers (floppy) instantiate a queue w/o disk registered */ if (blkg->q->backing_dev_info.dev) return dev_name(blkg->q->backing_dev_info.dev); return NULL; } +EXPORT_SYMBOL_GPL(blkg_dev_name); /** * blkcg_print_blkgs - helper for printing per-blkg data @@ -581,9 +558,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, for (i = 0; i < BLKG_RWSTAT_NR; i++) seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], - (unsigned long long)rwstat->cnt[i]); + (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); - v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; + v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]); seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); return v; } @@ -620,31 +598,122 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, } EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); +static u64 blkg_prfill_rwstat_field(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off); + + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_bytes. 
+ * cftype->private must be set to the blkcg_policy. + */ +int blkg_print_stat_bytes(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_bytes), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_bytes); + +/** + * blkg_print_stat_ios - seq_show callback for blkg->stat_ios + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private + * must be set to the blkcg_policy. + */ +int blkg_print_stat_ios(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_ios), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_ios); + +static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg, + NULL, off); + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes + * @sf: seq_file to print to + * @v: unused + */ +int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field_recursive, + (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_bytes), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive); + +/** + * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios + * @sf: seq_file to print to + * @v: unused + */ +int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field_recursive, + (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_ios), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive); + /** * blkg_stat_recursive_sum
- collect hierarchical blkg_stat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_stat + * @off: offset to the blkg_stat in blkg_policy_data or @blkg + * + * Collect the blkg_stat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. * - * Collect the blkg_stat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue - * lock for online tests. + * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is + * at @off bytes into @blkg's blkg_policy_data of the policy. */ -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) +u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; u64 sum = 0; - lockdep_assert_held(pd->blkg->q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_stat *stat = (void *)pos_pd + off; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_stat *stat; + + if (!pos_blkg->online) + continue; + + if (pol) + stat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + stat = (void *)blkg + off; - if (pos_blkg->online) - sum += blkg_stat_read(stat); + sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt); } rcu_read_unlock(); @@ -654,37 +723,43 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); /** * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_rwstat + * 
@off: offset to the blkg_rwstat in blkg_policy_data or @blkg + * + * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. * - * Collect the blkg_rwstat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue - * lock for online tests. + * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it + * is at @off bytes into @blkg's blkg_policy_data of the policy. */ -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off) +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; struct blkg_rwstat sum = { }; int i; - lockdep_assert_held(pd->blkg->q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_rwstat *rwstat = (void *)pos_pd + off; - struct blkg_rwstat tmp; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_rwstat *rwstat; if (!pos_blkg->online) continue; - tmp = blkg_rwstat_read(rwstat); + if (pol) + rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + rwstat = (void *)pos_blkg + off; for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum.cnt[i] += tmp.cnt[i]; + atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) + + percpu_counter_sum_positive(&rwstat->cpu_cnt[i]), + &sum.aux_cnt[i]); } rcu_read_unlock(); @@ -700,29 +775,34 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); * @ctx: blkg_conf_ctx to be filled * * Parse per-blkg config update from @input and initialize @ctx with the - * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new - * value. 
This function returns with RCU read lock and queue lock held and - * must be paired with blkg_conf_finish(). + * result. @ctx->blkg points to the blkg to be updated and @ctx->body the + * part of @input following MAJ:MIN. This function returns with RCU read + * lock and queue lock held and must be paired with blkg_conf_finish(). */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx) + char *input, struct blkg_conf_ctx *ctx) __acquires(rcu) __acquires(disk->queue->queue_lock) { struct gendisk *disk; struct blkcg_gq *blkg; unsigned int major, minor; - unsigned long long v; - int part, ret; + int key_len, part, ret; + char *body; - if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) + if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return -EINVAL; + body = input + key_len; + if (!isspace(*body)) + return -EINVAL; + body = skip_spaces(body); + disk = get_gendisk(MKDEV(major, minor), &part); if (!disk) - return -EINVAL; + return -ENODEV; if (part) { put_disk(disk); - return -EINVAL; + return -ENODEV; } rcu_read_lock(); @@ -731,7 +811,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (blkcg_policy_enabled(disk->queue, pol)) blkg = blkg_lookup_create(blkcg, disk->queue); else - blkg = ERR_PTR(-EINVAL); + blkg = ERR_PTR(-EOPNOTSUPP); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); @@ -753,7 +833,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, ctx->disk = disk; ctx->blkg = blkg; - ctx->v = v; + ctx->body = body; return 0; } EXPORT_SYMBOL_GPL(blkg_conf_prep); @@ -774,8 +854,55 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_finish); +static int blkcg_print_stat(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + const char *dname; + struct blkg_rwstat rwstat; 
+ u64 rbytes, wbytes, rios, wios; + + dname = blkg_dev_name(blkg); + if (!dname) + continue; + + spin_lock_irq(blkg->q->queue_lock); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_ios)); + rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + spin_unlock_irq(blkg->q->queue_lock); + + if (rbytes || wbytes || rios || wios) + seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n", + dname, rbytes, wbytes, rios, wios); + } + + rcu_read_unlock(); + return 0; +} + struct cftype blkcg_files[] = { { + .name = "stat", + .seq_show = blkcg_print_stat, + }, + { } /* terminate */ +}; + +struct cftype blkcg_legacy_files[] = { + { .name = "reset_stats", .write_u64 = blkcg_reset_stats, }, @@ -822,18 +949,19 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); + int i; mutex_lock(&blkcg_pol_mutex); + list_del(&blkcg->all_blkcgs_node); - mutex_unlock(&blkcg_pol_mutex); - if (blkcg != &blkcg_root) { - int i; + for (i = 0; i < BLKCG_MAX_POLS; i++) + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); - for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkcg->pd[i]); - kfree(blkcg); - } + mutex_unlock(&blkcg_pol_mutex); + + kfree(blkcg); } static struct cgroup_subsys_state * @@ -847,13 +975,12 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) if (!parent_css) { blkcg = &blkcg_root; - goto done; - } - - blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); - if (!blkcg) { - ret = ERR_PTR(-ENOMEM); - goto free_blkcg; + } else { + blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); + if (!blkcg) { + ret = ERR_PTR(-ENOMEM); + goto free_blkcg; + } } 
for (i = 0; i < BLKCG_MAX_POLS ; i++) { @@ -866,23 +993,23 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) * check if the policy requires any specific per-cgroup * data: if it does, allocate and initialize it. */ - if (!pol || !pol->cpd_size) + if (!pol || !pol->cpd_alloc_fn) continue; - BUG_ON(blkcg->pd[i]); - cpd = kzalloc(pol->cpd_size, GFP_KERNEL); + cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) { ret = ERR_PTR(-ENOMEM); goto free_pd_blkcg; } - blkcg->pd[i] = cpd; + blkcg->cpd[i] = cpd; + cpd->blkcg = blkcg; cpd->plid = i; - pol->cpd_init_fn(blkcg); + if (pol->cpd_init_fn) + pol->cpd_init_fn(cpd); } -done: spin_lock_init(&blkcg->lock); - INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); @@ -894,7 +1021,8 @@ done: free_pd_blkcg: for (i--; i >= 0; i--) - kfree(blkcg->pd[i]); + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); free_blkcg: kfree(blkcg); mutex_unlock(&blkcg_pol_mutex); @@ -938,7 +1066,7 @@ int blkcg_init_queue(struct request_queue *q) radix_tree_preload_end(); if (IS_ERR(blkg)) { - kfree(new_blkg); + blkg_free(new_blkg); return PTR_ERR(blkg); } @@ -1015,12 +1143,35 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, return ret; } -struct cgroup_subsys blkio_cgrp_subsys = { +static void blkcg_bind(struct cgroup_subsys_state *root_css) +{ + int i; + + mutex_lock(&blkcg_pol_mutex); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkcg *blkcg; + + if (!pol || !pol->cpd_bind_fn) + continue; + + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) + if (blkcg->cpd[pol->plid]) + pol->cpd_bind_fn(blkcg->cpd[pol->plid]); + } + mutex_unlock(&blkcg_pol_mutex); +} + +struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = 
blkcg_can_attach, - .legacy_cftypes = blkcg_files, + .bind = blkcg_bind, + .dfl_cftypes = blkcg_files, + .legacy_cftypes = blkcg_legacy_files, + .legacy_name = "blkio", #ifdef CONFIG_MEMCG /* * This ensures that, if available, memcg is automatically enabled @@ -1030,7 +1181,7 @@ struct cgroup_subsys blkio_cgrp_subsys = { .depends_on = 1 << memory_cgrp_id, #endif }; -EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); +EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a request_queue @@ -1051,65 +1202,54 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { - LIST_HEAD(pds); + struct blkg_policy_data *pd_prealloc = NULL; struct blkcg_gq *blkg; - struct blkg_policy_data *pd, *nd; - int cnt = 0, ret; + int ret; if (blkcg_policy_enabled(q, pol)) return 0; - /* count and allocate policy_data for all existing blkgs */ blk_queue_bypass_start(q); - spin_lock_irq(q->queue_lock); - list_for_each_entry(blkg, &q->blkg_list, q_node) - cnt++; - spin_unlock_irq(q->queue_lock); - - /* allocate per-blkg policy data for all existing blkgs */ - while (cnt--) { - pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); - if (!pd) { +pd_prealloc: + if (!pd_prealloc) { + pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); + if (!pd_prealloc) { ret = -ENOMEM; - goto out_free; + goto out_bypass_end; } - list_add_tail(&pd->alloc_node, &pds); } - /* - * Install the allocated pds and cpds. With @q bypassing, no new blkg - * should have been created while the queue lock was dropped. - */ spin_lock_irq(q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { - if (WARN_ON(list_empty(&pds))) { - /* umm... 
this shouldn't happen, just abort */ - ret = -ENOMEM; - goto out_unlock; - } - pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); - list_del_init(&pd->alloc_node); + struct blkg_policy_data *pd; - /* grab blkcg lock too while installing @pd on @blkg */ - spin_lock(&blkg->blkcg->lock); + if (blkg->pd[pol->plid]) + continue; + + pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node); + if (!pd) + swap(pd, pd_prealloc); + if (!pd) { + spin_unlock_irq(q->queue_lock); + goto pd_prealloc; + } blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; - pol->pd_init_fn(blkg); - - spin_unlock(&blkg->blkcg->lock); + if (pol->pd_init_fn) + pol->pd_init_fn(pd); } __set_bit(pol->plid, q->blkcg_pols); ret = 0; -out_unlock: + spin_unlock_irq(q->queue_lock); -out_free: +out_bypass_end: blk_queue_bypass_end(q); - list_for_each_entry_safe(pd, nd, &pds, alloc_node) - kfree(pd); + if (pd_prealloc) + pol->pd_free_fn(pd_prealloc); return ret; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); @@ -1139,13 +1279,12 @@ void blkcg_deactivate_policy(struct request_queue *q, /* grab blkcg lock too while removing @pd from @blkg */ spin_lock(&blkg->blkcg->lock); - if (pol->pd_offline_fn) - pol->pd_offline_fn(blkg); - if (pol->pd_exit_fn) - pol->pd_exit_fn(blkg); - - kfree(blkg->pd[pol->plid]); - blkg->pd[pol->plid] = NULL; + if (blkg->pd[pol->plid]) { + if (pol->pd_offline_fn) + pol->pd_offline_fn(blkg->pd[pol->plid]); + pol->pd_free_fn(blkg->pd[pol->plid]); + blkg->pd[pol->plid] = NULL; + } spin_unlock(&blkg->blkcg->lock); } @@ -1167,9 +1306,6 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg *blkcg; int i, ret; - if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) - return -EINVAL; - mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); @@ -1186,36 +1322,42 @@ int blkcg_policy_register(struct blkcg_policy *pol) blkcg_policy[pol->plid] = pol; /* allocate and install cpd's */ - if (pol->cpd_size) { + if (pol->cpd_alloc_fn) { 
list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { struct blkcg_policy_data *cpd; - cpd = kzalloc(pol->cpd_size, GFP_KERNEL); + cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) { mutex_unlock(&blkcg_pol_mutex); goto err_free_cpds; } - blkcg->pd[pol->plid] = cpd; + blkcg->cpd[pol->plid] = cpd; + cpd->blkcg = blkcg; cpd->plid = pol->plid; - pol->cpd_init_fn(blkcg); + pol->cpd_init_fn(cpd); } } mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ - if (pol->cftypes) - WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys, - pol->cftypes)); + if (pol->dfl_cftypes) + WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, + pol->dfl_cftypes)); + if (pol->legacy_cftypes) + WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, + pol->legacy_cftypes)); mutex_unlock(&blkcg_pol_register_mutex); return 0; err_free_cpds: - if (pol->cpd_size) { + if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - kfree(blkcg->pd[pol->plid]); - blkcg->pd[pol->plid] = NULL; + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } } } blkcg_policy[pol->plid] = NULL; @@ -1242,16 +1384,20 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) goto out_unlock; /* kill the intf files first */ - if (pol->cftypes) - cgroup_rm_cftypes(pol->cftypes); + if (pol->dfl_cftypes) + cgroup_rm_cftypes(pol->dfl_cftypes); + if (pol->legacy_cftypes) + cgroup_rm_cftypes(pol->legacy_cftypes); /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); - if (pol->cpd_size) { + if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - kfree(blkcg->pd[pol->plid]); - blkcg->pd[pol->plid] = NULL; + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } } } blkcg_policy[pol->plid] = NULL; diff --git a/block/blk-core.c b/block/blk-core.c index 60912e983f16..2eb722d48773 100644 --- a/block/blk-core.c +++ 
b/block/blk-core.c @@ -1888,8 +1888,8 @@ generic_make_request_checks(struct bio *bio) */ create_io_context(GFP_ATOMIC, q->node); - if (blk_throtl_bio(q, bio)) - return false; /* throttled, will be resubmitted later */ + if (!blkcg_bio_issue_check(q, bio)) + return false; trace_block_bio_queue(q, bio); return true; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b23193518ac7..c75a2636dd40 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -83,14 +83,6 @@ enum tg_state_flags { #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) -/* Per-cpu group stats */ -struct tg_stats_cpu { - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; -}; - struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; @@ -141,12 +133,6 @@ struct throtl_grp { /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; - - /* Per cpu stats pointer */ - struct tg_stats_cpu __percpu *stats_cpu; - - /* List of tgs waiting for per cpu stats memory to be allocated */ - struct list_head stats_alloc_node; }; struct throtl_data @@ -168,13 +154,6 @@ struct throtl_data struct work_struct dispatch_work; }; -/* list and work item to allocate percpu group stats */ -static DEFINE_SPINLOCK(tg_stats_alloc_lock); -static LIST_HEAD(tg_stats_alloc_list); - -static void tg_stats_alloc_fn(struct work_struct *); -static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); - static void throtl_pending_timer_fn(unsigned long arg); static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) @@ -192,11 +171,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) return pd_to_blkg(&tg->pd); } -static inline struct throtl_grp *td_root_tg(struct throtl_data *td) -{ - return blkg_to_tg(td->queue->root_blkg); -} - /** * sq_to_tg - return the throl_grp the specified service queue belongs to * @sq: the 
throtl_service_queue of interest @@ -256,53 +230,6 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) } \ } while (0) -static void tg_stats_init(struct tg_stats_cpu *tg_stats) -{ - blkg_rwstat_init(&tg_stats->service_bytes); - blkg_rwstat_init(&tg_stats->serviced); -} - -/* - * Worker for allocating per cpu stat for tgs. This is scheduled on the - * system_wq once there are some groups on the alloc_list waiting for - * allocation. - */ -static void tg_stats_alloc_fn(struct work_struct *work) -{ - static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ - struct delayed_work *dwork = to_delayed_work(work); - bool empty = false; - -alloc_stats: - if (!stats_cpu) { - int cpu; - - stats_cpu = alloc_percpu(struct tg_stats_cpu); - if (!stats_cpu) { - /* allocation failed, try again after some time */ - schedule_delayed_work(dwork, msecs_to_jiffies(10)); - return; - } - for_each_possible_cpu(cpu) - tg_stats_init(per_cpu_ptr(stats_cpu, cpu)); - } - - spin_lock_irq(&tg_stats_alloc_lock); - - if (!list_empty(&tg_stats_alloc_list)) { - struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, - struct throtl_grp, - stats_alloc_node); - swap(tg->stats_cpu, stats_cpu); - list_del_init(&tg->stats_alloc_node); - } - - empty = list_empty(&tg_stats_alloc_list); - spin_unlock_irq(&tg_stats_alloc_lock); - if (!empty) - goto alloc_stats; -} - static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) { INIT_LIST_HEAD(&qn->node); @@ -387,29 +314,46 @@ static struct bio *throtl_pop_queued(struct list_head *queued, } /* init a service_queue, assumes the caller zeroed it */ -static void throtl_service_queue_init(struct throtl_service_queue *sq, - struct throtl_service_queue *parent_sq) +static void throtl_service_queue_init(struct throtl_service_queue *sq) { INIT_LIST_HEAD(&sq->queued[0]); INIT_LIST_HEAD(&sq->queued[1]); sq->pending_tree = RB_ROOT; - sq->parent_sq = parent_sq; setup_timer(&sq->pending_timer, 
throtl_pending_timer_fn, (unsigned long)sq); } -static void throtl_service_queue_exit(struct throtl_service_queue *sq) +static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) { - del_timer_sync(&sq->pending_timer); + struct throtl_grp *tg; + int rw; + + tg = kzalloc_node(sizeof(*tg), gfp, node); + if (!tg) + return NULL; + + throtl_service_queue_init(&tg->service_queue); + + for (rw = READ; rw <= WRITE; rw++) { + throtl_qnode_init(&tg->qnode_on_self[rw], tg); + throtl_qnode_init(&tg->qnode_on_parent[rw], tg); + } + + RB_CLEAR_NODE(&tg->rb_node); + tg->bps[READ] = -1; + tg->bps[WRITE] = -1; + tg->iops[READ] = -1; + tg->iops[WRITE] = -1; + + return &tg->pd; } -static void throtl_pd_init(struct blkcg_gq *blkg) +static void throtl_pd_init(struct blkg_policy_data *pd) { - struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_grp *tg = pd_to_tg(pd); + struct blkcg_gq *blkg = tg_to_blkg(tg); struct throtl_data *td = blkg->q->td; - struct throtl_service_queue *parent_sq; - unsigned long flags; - int rw; + struct throtl_service_queue *sq = &tg->service_queue; /* * If on the default hierarchy, we switch to properly hierarchical @@ -424,35 +368,10 @@ static void throtl_pd_init(struct blkcg_gq *blkg) * Limits of a group don't interact with limits of other groups * regardless of the position of the group in the hierarchy. */ - parent_sq = &td->service_queue; - + sq->parent_sq = &td->service_queue; if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent) - parent_sq = &blkg_to_tg(blkg->parent)->service_queue; - - throtl_service_queue_init(&tg->service_queue, parent_sq); - - for (rw = READ; rw <= WRITE; rw++) { - throtl_qnode_init(&tg->qnode_on_self[rw], tg); - throtl_qnode_init(&tg->qnode_on_parent[rw], tg); - } - - RB_CLEAR_NODE(&tg->rb_node); + sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; - - tg->bps[READ] = -1; - tg->bps[WRITE] = -1; - tg->iops[READ] = -1; - tg->iops[WRITE] = -1; - - /* - * Ugh... 
We need to perform per-cpu allocation for tg->stats_cpu - * but percpu allocator can't be called from IO path. Queue tg on - * tg_stats_alloc_list and allocate from work item. - */ - spin_lock_irqsave(&tg_stats_alloc_lock, flags); - list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); - schedule_delayed_work(&tg_stats_alloc_work, 0); - spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); } /* @@ -470,83 +389,21 @@ static void tg_update_has_rules(struct throtl_grp *tg) (tg->bps[rw] != -1 || tg->iops[rw] != -1); } -static void throtl_pd_online(struct blkcg_gq *blkg) +static void throtl_pd_online(struct blkg_policy_data *pd) { /* * We don't want new groups to escape the limits of its ancestors. * Update has_rules[] after a new group is brought online. */ - tg_update_has_rules(blkg_to_tg(blkg)); -} - -static void throtl_pd_exit(struct blkcg_gq *blkg) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - unsigned long flags; - - spin_lock_irqsave(&tg_stats_alloc_lock, flags); - list_del_init(&tg->stats_alloc_node); - spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); - - free_percpu(tg->stats_cpu); - - throtl_service_queue_exit(&tg->service_queue); -} - -static void throtl_pd_reset_stats(struct blkcg_gq *blkg) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - int cpu; - - if (tg->stats_cpu == NULL) - return; - - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - blkg_rwstat_reset(&sc->service_bytes); - blkg_rwstat_reset(&sc->serviced); - } -} - -static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, - struct blkcg *blkcg) -{ - /* - * This is the common case when there are no blkcgs. 
Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) - return td_root_tg(td); - - return blkg_to_tg(blkg_lookup(blkcg, td->queue)); + tg_update_has_rules(pd_to_tg(pd)); } -static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, - struct blkcg *blkcg) +static void throtl_pd_free(struct blkg_policy_data *pd) { - struct request_queue *q = td->queue; - struct throtl_grp *tg = NULL; - - /* - * This is the common case when there are no blkcgs. Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) { - tg = td_root_tg(td); - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - - /* if %NULL and @q is alive, fall back to root_tg */ - if (!IS_ERR(blkg)) - tg = blkg_to_tg(blkg); - else if (!blk_queue_dying(q)) - tg = td_root_tg(td); - } + struct throtl_grp *tg = pd_to_tg(pd); - return tg; + del_timer_sync(&tg->service_queue.pending_timer); + kfree(tg); } static struct throtl_grp * @@ -956,32 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, return 0; } -static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, - int rw) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - struct tg_stats_cpu *stats_cpu; - unsigned long flags; - - /* If per cpu stats are not allocated yet, don't do any accounting. */ - if (tg->stats_cpu == NULL) - return; - - /* - * Disabling interrupts to provide mutual exclusion between two - * writes on same cpu. It probably is not needed for 64bit. Not - * optimizing that case yet. 
- */ - local_irq_save(flags); - - stats_cpu = this_cpu_ptr(tg->stats_cpu); - - blkg_rwstat_add(&stats_cpu->serviced, rw, 1); - blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); - - local_irq_restore(flags); -} - static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); @@ -995,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) * more than once as a throttled bio will go through blk-throtl the * second time when it eventually gets issued. Set it when a bio * is being charged to a tg. - * - * Dispatch stats aren't recursive and each @bio should only be - * accounted by the @tg it was originally associated with. Let's - * update the stats when setting REQ_THROTTLED for the first time - * which is guaranteed to be for the @bio's original tg. */ - if (!(bio->bi_rw & REQ_THROTTLED)) { + if (!(bio->bi_rw & REQ_THROTTLED)) bio->bi_rw |= REQ_THROTTLED; - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_iter.bi_size, bio->bi_rw); - } } /** @@ -1285,34 +1108,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) } } -static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct throtl_grp *tg = pd_to_tg(pd); - struct blkg_rwstat rwstat = { }, tmp; - int i, cpu; - - if (tg->stats_cpu == NULL) - return 0; - - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - tmp = blkg_rwstat_read((void *)sc + off); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - rwstat.cnt[i] += tmp.cnt[i]; - } - - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} - -static int tg_print_cpu_rwstat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat, - &blkcg_policy_throtl, seq_cft(sf)->private, true); - return 0; -} - static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -1349,31 +1144,11 @@ static int tg_print_conf_uint(struct seq_file *sf, 
void *v) return 0; } -static ssize_t tg_set_conf(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off, bool is_u64) +static void tg_conf_updated(struct throtl_grp *tg) { - struct blkcg *blkcg = css_to_blkcg(of_css(of)); - struct blkg_conf_ctx ctx; - struct throtl_grp *tg; - struct throtl_service_queue *sq; - struct blkcg_gq *blkg; + struct throtl_service_queue *sq = &tg->service_queue; struct cgroup_subsys_state *pos_css; - int ret; - - ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); - if (ret) - return ret; - - tg = blkg_to_tg(ctx.blkg); - sq = &tg->service_queue; - - if (!ctx.v) - ctx.v = -1; - - if (is_u64) - *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v; - else - *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v; + struct blkcg_gq *blkg; throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", @@ -1387,7 +1162,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. 
*/ - blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) + blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg)) tg_update_has_rules(blkg_to_tg(blkg)); /* @@ -1405,9 +1180,39 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, tg_update_disptime(tg); throtl_schedule_next_dispatch(sq->parent_sq, true); } +} + +static ssize_t tg_set_conf(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, bool is_u64) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + int ret; + u64 v; + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) != 1) + goto out_finish; + if (!v) + v = -1; + + tg = blkg_to_tg(ctx.blkg); + + if (is_u64) + *(u64 *)((void *)tg + of_cft(of)->private) = v; + else + *(unsigned int *)((void *)tg + of_cft(of)->private) = v; + + tg_conf_updated(tg); + ret = 0; +out_finish: blkg_conf_finish(&ctx); - return nbytes; + return ret ?: nbytes; } static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, @@ -1422,7 +1227,7 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, return tg_set_conf(of, buf, nbytes, off, false); } -static struct cftype throtl_files[] = { +static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), @@ -1449,13 +1254,124 @@ static struct cftype throtl_files[] = { }, { .name = "throttle.io_service_bytes", - .private = offsetof(struct tg_stats_cpu, service_bytes), - .seq_show = tg_print_cpu_rwstat, + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_bytes, }, { .name = "throttle.io_serviced", - .private = offsetof(struct tg_stats_cpu, serviced), - .seq_show = tg_print_cpu_rwstat, + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_ios, + }, + { } /* terminate */ +}; + +static u64 tg_prfill_max(struct seq_file *sf, struct 
blkg_policy_data *pd, + int off) +{ + struct throtl_grp *tg = pd_to_tg(pd); + const char *dname = blkg_dev_name(pd->blkg); + char bufs[4][21] = { "max", "max", "max", "max" }; + + if (!dname) + return 0; + if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 && + tg->iops[READ] == -1 && tg->iops[WRITE] == -1) + return 0; + + if (tg->bps[READ] != -1) + snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]); + if (tg->bps[WRITE] != -1) + snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]); + if (tg->iops[READ] != -1) + snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]); + if (tg->iops[WRITE] != -1) + snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]); + + seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n", + dname, bufs[0], bufs[1], bufs[2], bufs[3]); + return 0; +} + +static int tg_print_max(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max, + &blkcg_policy_throtl, seq_cft(sf)->private, false); + return 0; +} + +static ssize_t tg_set_max(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + u64 v[4]; + int ret; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + tg = blkg_to_tg(ctx.blkg); + + v[0] = tg->bps[READ]; + v[1] = tg->bps[WRITE]; + v[2] = tg->iops[READ]; + v[3] = tg->iops[WRITE]; + + while (true) { + char tok[27]; /* wiops=18446744073709551616 */ + char *p; + u64 val = -1; + int len; + + if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) + break; + if (tok[0] == '\0') + break; + ctx.body += len; + + ret = -EINVAL; + p = tok; + strsep(&p, "="); + if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) + goto out_finish; + + ret = -ERANGE; + if (!val) + goto out_finish; + + ret = -EINVAL; + if (!strcmp(tok, "rbps")) + v[0] = val; + else if (!strcmp(tok, "wbps")) + v[1] = val; + else if (!strcmp(tok, "riops")) + v[2] = 
min_t(u64, val, UINT_MAX); + else if (!strcmp(tok, "wiops")) + v[3] = min_t(u64, val, UINT_MAX); + else + goto out_finish; + } + + tg->bps[READ] = v[0]; + tg->bps[WRITE] = v[1]; + tg->iops[READ] = v[2]; + tg->iops[WRITE] = v[3]; + + tg_conf_updated(tg); + ret = 0; +out_finish: + blkg_conf_finish(&ctx); + return ret ?: nbytes; +} + +static struct cftype throtl_files[] = { + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = tg_print_max, + .write = tg_set_max, }, { } /* terminate */ }; @@ -1468,52 +1384,33 @@ static void throtl_shutdown_wq(struct request_queue *q) } static struct blkcg_policy blkcg_policy_throtl = { - .pd_size = sizeof(struct throtl_grp), - .cftypes = throtl_files, + .dfl_cftypes = throtl_files, + .legacy_cftypes = throtl_legacy_files, + .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, - .pd_exit_fn = throtl_pd_exit, - .pd_reset_stats_fn = throtl_pd_reset_stats, + .pd_free_fn = throtl_pd_free, }; -bool blk_throtl_bio(struct request_queue *q, struct bio *bio) +bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio) { - struct throtl_data *td = q->td; struct throtl_qnode *qn = NULL; - struct throtl_grp *tg; + struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); - struct blkcg *blkcg; bool throttled = false; + WARN_ON_ONCE(!rcu_read_lock_held()); + /* see throtl_charge_bio() */ - if (bio->bi_rw & REQ_THROTTLED) + if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw]) goto out; - /* - * A throtl_grp pointer retrieved under rcu can be used to access - * basic fields like stats and io rates. If a group has no rules, - * just update the dispatch stats in lockless manner and return. 
- */ - rcu_read_lock(); - blkcg = bio_blkcg(bio); - tg = throtl_lookup_tg(td, blkcg); - if (tg) { - if (!tg->has_rules[rw]) { - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_iter.bi_size, bio->bi_rw); - goto out_unlock_rcu; - } - } - - /* - * Either group has not been allocated yet or it is not an unlimited - * IO group - */ spin_lock_irq(q->queue_lock); - tg = throtl_lookup_create_tg(td, blkcg); - if (unlikely(!tg)) + + if (unlikely(blk_queue_bypass(q))) goto out_unlock; sq = &tg->service_queue; @@ -1580,8 +1477,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) out_unlock: spin_unlock_irq(q->queue_lock); -out_unlock_rcu: - rcu_read_unlock(); out: /* * As multiple blk-throtls may stack in the same issue path, we @@ -1667,7 +1562,7 @@ int blk_throtl_init(struct request_queue *q) return -ENOMEM; INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); - throtl_service_queue_init(&td->service_queue, NULL); + throtl_service_queue_init(&td->service_queue); q->td = td; td->queue = q; diff --git a/block/blk.h b/block/blk.h index 838188b35a83..98614ad37c81 100644 --- a/block/blk.h +++ b/block/blk.h @@ -272,15 +272,10 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) * Internal throttling interface */ #ifdef CONFIG_BLK_DEV_THROTTLING -extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); extern void blk_throtl_drain(struct request_queue *q); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ -static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) -{ - return false; -} static inline void blk_throtl_drain(struct request_queue *q) { } static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c62bb2e650b8..04de88463a98 100644 --- a/block/cfq-iosched.c 
+++ b/block/cfq-iosched.c @@ -68,9 +68,9 @@ static struct kmem_cache *cfq_pool; #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) /* blkio-related constants */ -#define CFQ_WEIGHT_MIN 10 -#define CFQ_WEIGHT_MAX 1000 -#define CFQ_WEIGHT_DEFAULT 500 +#define CFQ_WEIGHT_LEGACY_MIN 10 +#define CFQ_WEIGHT_LEGACY_DFL 500 +#define CFQ_WEIGHT_LEGACY_MAX 1000 struct cfq_ttime { unsigned long last_end_request; @@ -177,10 +177,6 @@ enum wl_type_t { struct cfqg_stats { #ifdef CONFIG_CFQ_GROUP_IOSCHED - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -189,8 +185,6 @@ struct cfqg_stats { struct blkg_rwstat wait_time; /* number of IOs queued up */ struct blkg_rwstat queued; - /* total sectors transferred */ - struct blkg_stat sectors; /* total disk time and nr sectors dispatched by this group */ struct blkg_stat time; #ifdef CONFIG_DEBUG_BLK_CGROUP @@ -220,7 +214,7 @@ struct cfqg_stats { /* Per-cgroup data */ struct cfq_group_data { /* must be the first member */ - struct blkcg_policy_data pd; + struct blkcg_policy_data cpd; unsigned int weight; unsigned int leaf_weight; @@ -304,7 +298,11 @@ struct cfq_group { int dispatched; struct cfq_ttime ttime; struct cfqg_stats stats; /* stats for this cfqg */ - struct cfqg_stats dead_stats; /* stats pushed from dead children */ + + /* async queue for each priority case */ + struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; + struct cfq_queue *async_idle_cfqq; + }; struct cfq_io_cq { @@ -370,12 +368,6 @@ struct cfq_data { struct cfq_queue *active_queue; struct cfq_io_cq *active_cic; - /* - * async queue for each priority case - */ - struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; - struct cfq_queue *async_idle_cfqq; - sector_t last_position; /* @@ -401,6 +393,7 @@ struct cfq_data { }; static struct cfq_group 
*cfq_get_next_cfqg(struct cfq_data *cfqd); +static void cfq_put_queue(struct cfq_queue *cfqq); static struct cfq_rb_root *st_for(struct cfq_group *cfqg, enum wl_class_t class, @@ -612,7 +605,7 @@ static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) static struct cfq_group_data *cpd_to_cfqgd(struct blkcg_policy_data *cpd) { - return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL; + return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL; } static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) @@ -693,14 +686,6 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) blkg_rwstat_add(&cfqg->stats.merged, rw, 1); } -static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, - uint64_t bytes, int rw) -{ - blkg_stat_add(&cfqg->stats.sectors, bytes >> 9); - blkg_rwstat_add(&cfqg->stats.serviced, rw, 1); - blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes); -} - static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, uint64_t start_time, uint64_t io_start_time, int rw) { @@ -718,8 +703,6 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, static void cfqg_stats_reset(struct cfqg_stats *stats) { /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->service_bytes); - blkg_rwstat_reset(&stats->serviced); blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); blkg_rwstat_reset(&stats->wait_time); @@ -736,28 +719,26 @@ static void cfqg_stats_reset(struct cfqg_stats *stats) } /* @to += @from */ -static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from) +static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from) { /* queued stats shouldn't be cleared */ - blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); - blkg_rwstat_merge(&to->serviced, &from->serviced); - blkg_rwstat_merge(&to->merged, &from->merged); - blkg_rwstat_merge(&to->service_time, &from->service_time); - 
blkg_rwstat_merge(&to->wait_time, &from->wait_time); - blkg_stat_merge(&from->time, &from->time); + blkg_rwstat_add_aux(&to->merged, &from->merged); + blkg_rwstat_add_aux(&to->service_time, &from->service_time); + blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); + blkg_stat_add_aux(&from->time, &from->time); #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); - blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples); - blkg_stat_merge(&to->dequeue, &from->dequeue); - blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); - blkg_stat_merge(&to->idle_time, &from->idle_time); - blkg_stat_merge(&to->empty_time, &from->empty_time); + blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); + blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); + blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); + blkg_stat_add_aux(&to->dequeue, &from->dequeue); + blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); + blkg_stat_add_aux(&to->idle_time, &from->idle_time); + blkg_stat_add_aux(&to->empty_time, &from->empty_time); #endif } /* - * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors' + * Transfer @cfqg's stats to its parent's aux counts so that the ancestors' * recursive stats can still account for the amount used by this cfqg after * it's gone. 
*/ @@ -770,10 +751,8 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) if (unlikely(!parent)) return; - cfqg_stats_merge(&parent->dead_stats, &cfqg->stats); - cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats); + cfqg_stats_add_aux(&parent->stats, &cfqg->stats); cfqg_stats_reset(&cfqg->stats); - cfqg_stats_reset(&cfqg->dead_stats); } #else /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -795,8 +774,6 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, unsigned long time, unsigned long unaccounted_time) { } static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { } static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { } -static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, - uint64_t bytes, int rw) { } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, uint64_t start_time, uint64_t io_start_time, int rw) { } @@ -883,8 +860,7 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, - struct cfq_io_cq *cic, struct bio *bio, - gfp_t gfp_mask); + struct cfq_io_cq *cic, struct bio *bio); static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) { @@ -1546,130 +1522,171 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg) } #ifdef CONFIG_CFQ_GROUP_IOSCHED -static void cfqg_stats_init(struct cfqg_stats *stats) +static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, + bool on_dfl, bool reset_dev, bool is_leaf_weight); + +static void cfqg_stats_exit(struct cfqg_stats *stats) { - blkg_rwstat_init(&stats->service_bytes); - blkg_rwstat_init(&stats->serviced); - blkg_rwstat_init(&stats->merged); - blkg_rwstat_init(&stats->service_time); - blkg_rwstat_init(&stats->wait_time); - blkg_rwstat_init(&stats->queued); + blkg_rwstat_exit(&stats->merged); + 
blkg_rwstat_exit(&stats->service_time); + blkg_rwstat_exit(&stats->wait_time); + blkg_rwstat_exit(&stats->queued); + blkg_stat_exit(&stats->time); +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_exit(&stats->unaccounted_time); + blkg_stat_exit(&stats->avg_queue_size_sum); + blkg_stat_exit(&stats->avg_queue_size_samples); + blkg_stat_exit(&stats->dequeue); + blkg_stat_exit(&stats->group_wait_time); + blkg_stat_exit(&stats->idle_time); + blkg_stat_exit(&stats->empty_time); +#endif +} - blkg_stat_init(&stats->sectors); - blkg_stat_init(&stats->time); +static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) +{ + if (blkg_rwstat_init(&stats->merged, gfp) || + blkg_rwstat_init(&stats->service_time, gfp) || + blkg_rwstat_init(&stats->wait_time, gfp) || + blkg_rwstat_init(&stats->queued, gfp) || + blkg_stat_init(&stats->time, gfp)) + goto err; #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_init(&stats->unaccounted_time); - blkg_stat_init(&stats->avg_queue_size_sum); - blkg_stat_init(&stats->avg_queue_size_samples); - blkg_stat_init(&stats->dequeue); - blkg_stat_init(&stats->group_wait_time); - blkg_stat_init(&stats->idle_time); - blkg_stat_init(&stats->empty_time); + if (blkg_stat_init(&stats->unaccounted_time, gfp) || + blkg_stat_init(&stats->avg_queue_size_sum, gfp) || + blkg_stat_init(&stats->avg_queue_size_samples, gfp) || + blkg_stat_init(&stats->dequeue, gfp) || + blkg_stat_init(&stats->group_wait_time, gfp) || + blkg_stat_init(&stats->idle_time, gfp) || + blkg_stat_init(&stats->empty_time, gfp)) + goto err; #endif + return 0; +err: + cfqg_stats_exit(stats); + return -ENOMEM; } -static void cfq_cpd_init(const struct blkcg *blkcg) +static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) { - struct cfq_group_data *cgd = - cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]); + struct cfq_group_data *cgd; - if (blkcg == &blkcg_root) { - cgd->weight = 2 * CFQ_WEIGHT_DEFAULT; - cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; - } else { - cgd->weight = CFQ_WEIGHT_DEFAULT; - 
cgd->leaf_weight = CFQ_WEIGHT_DEFAULT; - } + cgd = kzalloc(sizeof(*cgd), GFP_KERNEL); + if (!cgd) + return NULL; + return &cgd->cpd; +} + +static void cfq_cpd_init(struct blkcg_policy_data *cpd) +{ + struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); + unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ? + CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; + + if (cpd_to_blkcg(cpd) == &blkcg_root) + weight *= 2; + + cgd->weight = weight; + cgd->leaf_weight = weight; } -static void cfq_pd_init(struct blkcg_gq *blkg) +static void cfq_cpd_free(struct blkcg_policy_data *cpd) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); - struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg); + kfree(cpd_to_cfqgd(cpd)); +} + +static void cfq_cpd_bind(struct blkcg_policy_data *cpd) +{ + struct blkcg *blkcg = cpd_to_blkcg(cpd); + bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup); + unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; + + if (blkcg == &blkcg_root) + weight *= 2; + + WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false)); + WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true)); +} + +static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) +{ + struct cfq_group *cfqg; + + cfqg = kzalloc_node(sizeof(*cfqg), gfp, node); + if (!cfqg) + return NULL; cfq_init_cfqg_base(cfqg); + if (cfqg_stats_init(&cfqg->stats, gfp)) { + kfree(cfqg); + return NULL; + } + + return &cfqg->pd; +} + +static void cfq_pd_init(struct blkg_policy_data *pd) +{ + struct cfq_group *cfqg = pd_to_cfqg(pd); + struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg); + cfqg->weight = cgd->weight; cfqg->leaf_weight = cgd->leaf_weight; - cfqg_stats_init(&cfqg->stats); - cfqg_stats_init(&cfqg->dead_stats); } -static void cfq_pd_offline(struct blkcg_gq *blkg) +static void cfq_pd_offline(struct blkg_policy_data *pd) { + struct cfq_group *cfqg = pd_to_cfqg(pd); + int i; + + for (i = 0; i < IOPRIO_BE_NR; i++) { + if (cfqg->async_cfqq[0][i]) + 
cfq_put_queue(cfqg->async_cfqq[0][i]); + if (cfqg->async_cfqq[1][i]) + cfq_put_queue(cfqg->async_cfqq[1][i]); + } + + if (cfqg->async_idle_cfqq) + cfq_put_queue(cfqg->async_idle_cfqq); + /* * @blkg is going offline and will be ignored by * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so * that they don't get lost. If IOs complete after this point, the * stats for them will be lost. Oh well... */ - cfqg_stats_xfer_dead(blkg_to_cfqg(blkg)); + cfqg_stats_xfer_dead(cfqg); } -/* offset delta from cfqg->stats to cfqg->dead_stats */ -static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) - - offsetof(struct cfq_group, stats); - -/* to be used by recursive prfill, sums live and dead stats recursively */ -static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) +static void cfq_pd_free(struct blkg_policy_data *pd) { - u64 sum = 0; - - sum += blkg_stat_recursive_sum(pd, off); - sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta); - return sum; -} - -/* to be used by recursive prfill, sums live and dead rwstats recursively */ -static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, - int off) -{ - struct blkg_rwstat a, b; + struct cfq_group *cfqg = pd_to_cfqg(pd); - a = blkg_rwstat_recursive_sum(pd, off); - b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta); - blkg_rwstat_merge(&a, &b); - return a; + cfqg_stats_exit(&cfqg->stats); + return kfree(cfqg); } -static void cfq_pd_reset_stats(struct blkcg_gq *blkg) +static void cfq_pd_reset_stats(struct blkg_policy_data *pd) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfq_group *cfqg = pd_to_cfqg(pd); cfqg_stats_reset(&cfqg->stats); - cfqg_stats_reset(&cfqg->dead_stats); } -/* - * Search for the cfq group current task belongs to. request_queue lock must - * be held. 
- */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { - struct request_queue *q = cfqd->queue; - struct cfq_group *cfqg = NULL; - - /* avoid lookup for the common case where there's no blkcg */ - if (blkcg == &blkcg_root) { - cfqg = cfqd->root_group; - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - if (!IS_ERR(blkg)) - cfqg = blkg_to_cfqg(blkg); - } + struct blkcg_gq *blkg; - return cfqg; + blkg = blkg_lookup(blkcg, cfqd->queue); + if (likely(blkg)) + return blkg_to_cfqg(blkg); + return NULL; } static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { - /* Currently, all async queues are mapped to root group */ - if (!cfq_cfqq_sync(cfqq)) - cfqg = cfqq->cfqd->root_group; - cfqq->cfqg = cfqg; /* cfqq reference on cfqg */ cfqg_get(cfqg); @@ -1739,36 +1756,48 @@ static int cfq_print_leaf_weight(struct seq_file *sf, void *v) static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, - bool is_leaf_weight) + bool on_dfl, bool is_leaf_weight) { + unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; + unsigned int max = on_dfl ? 
CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct cfq_group *cfqg; struct cfq_group_data *cfqgd; int ret; + u64 v; ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); if (ret) return ret; - ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) == 1) { + /* require "default" on dfl */ + ret = -ERANGE; + if (!v && on_dfl) + goto out_finish; + } else if (!strcmp(strim(ctx.body), "default")) { + v = 0; + } else { + ret = -EINVAL; + goto out_finish; + } + cfqg = blkg_to_cfqg(ctx.blkg); cfqgd = blkcg_to_cfqgd(blkcg); - if (!cfqg || !cfqgd) - goto err; - if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { + ret = -ERANGE; + if (!v || (v >= min && v <= max)) { if (!is_leaf_weight) { - cfqg->dev_weight = ctx.v; - cfqg->new_weight = ctx.v ?: cfqgd->weight; + cfqg->dev_weight = v; + cfqg->new_weight = v ?: cfqgd->weight; } else { - cfqg->dev_leaf_weight = ctx.v; - cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight; + cfqg->dev_leaf_weight = v; + cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight; } ret = 0; } - -err: +out_finish: blkg_conf_finish(&ctx); return ret ?: nbytes; } @@ -1776,25 +1805,27 @@ err: static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(of, buf, nbytes, off, false); + return __cfqg_set_weight_device(of, buf, nbytes, off, false, false); } static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(of, buf, nbytes, off, true); + return __cfqg_set_weight_device(of, buf, nbytes, off, false, true); } -static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val, bool is_leaf_weight) +static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, + bool on_dfl, bool reset_dev, bool is_leaf_weight) { + unsigned int min = on_dfl ? 
CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; + unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; struct cfq_group_data *cfqgd; int ret = 0; - if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) - return -EINVAL; + if (val < min || val > max) + return -ERANGE; spin_lock_irq(&blkcg->lock); cfqgd = blkcg_to_cfqgd(blkcg); @@ -1815,9 +1846,13 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, continue; if (!is_leaf_weight) { + if (reset_dev) + cfqg->dev_weight = 0; if (!cfqg->dev_weight) cfqg->new_weight = cfqgd->weight; } else { + if (reset_dev) + cfqg->dev_leaf_weight = 0; if (!cfqg->dev_leaf_weight) cfqg->new_leaf_weight = cfqgd->leaf_weight; } @@ -1831,13 +1866,13 @@ out: static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, cft, val, false); + return __cfq_set_weight(css, val, false, false, false); } static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, cft, val, true); + return __cfq_set_weight(css, val, false, false, true); } static int cfqg_print_stat(struct seq_file *sf, void *v) @@ -1857,16 +1892,16 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v) static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = cfqg_stat_pd_recursive_sum(pd, off); - + u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_cfq, off); return __blkg_prfill_u64(sf, pd, sum); } static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off); - + struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_cfq, off); return __blkg_prfill_rwstat(sf, pd, &sum); } @@ -1886,6 +1921,40 @@ static int cfqg_print_rwstat_recursive(struct seq_file *sf, 
void *v) return 0; } +static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int cfqg_print_stat_sectors(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false); + return 0; +} + +static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0, + false); + return 0; +} + #ifdef CONFIG_DEBUG_BLK_CGROUP static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, struct blkg_policy_data *pd, int off) @@ -1912,7 +1981,7 @@ static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) } #endif /* CONFIG_DEBUG_BLK_CGROUP */ -static struct cftype cfq_blkcg_files[] = { +static struct cftype cfq_blkcg_legacy_files[] = { /* on root, weight is mapped to leaf_weight */ { .name = "weight_device", @@ -1960,18 +2029,17 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "sectors", - .private = offsetof(struct cfq_group, stats.sectors), - .seq_show = cfqg_print_stat, + .seq_show = cfqg_print_stat_sectors, }, { .name = "io_service_bytes", - .private = offsetof(struct cfq_group, stats.service_bytes), - .seq_show = cfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_bytes, }, { .name = "io_serviced", - .private = offsetof(struct cfq_group, stats.serviced), - .seq_show = cfqg_print_rwstat, + .private = 
(unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_ios, }, { .name = "io_service_time", @@ -2002,18 +2070,17 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "sectors_recursive", - .private = offsetof(struct cfq_group, stats.sectors), - .seq_show = cfqg_print_stat_recursive, + .seq_show = cfqg_print_stat_sectors_recursive, }, { .name = "io_service_bytes_recursive", - .private = offsetof(struct cfq_group, stats.service_bytes), - .seq_show = cfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_bytes_recursive, }, { .name = "io_serviced_recursive", - .private = offsetof(struct cfq_group, stats.serviced), - .seq_show = cfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_ios_recursive, }, { .name = "io_service_time_recursive", @@ -2068,9 +2135,51 @@ static struct cftype cfq_blkcg_files[] = { #endif /* CONFIG_DEBUG_BLK_CGROUP */ { } /* terminate */ }; + +static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + + seq_printf(sf, "default %u\n", cgd->weight); + blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device, + &blkcg_policy_cfq, 0, false); + return 0; +} + +static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + char *endp; + int ret; + u64 v; + + buf = strim(buf); + + /* "WEIGHT" or "default WEIGHT" sets the default weight */ + v = simple_strtoull(buf, &endp, 0); + if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { + ret = __cfq_set_weight(of_css(of), v, true, false, false); + return ret ?: nbytes; + } + + /* "MAJ:MIN WEIGHT" */ + return __cfqg_set_weight_device(of, buf, nbytes, off, true, false); +} + +static struct cftype cfq_blkcg_files[] = { + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cfq_print_weight_on_dfl, + .write = 
cfq_set_weight_on_dfl, + }, + { } /* terminate */ +}; + #else /* GROUP_IOSCHED */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { return cfqd->root_group; } @@ -2873,7 +2982,6 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; cfqq->nr_sectors += blk_rq_sectors(rq); - cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags); } /* @@ -3506,14 +3614,14 @@ static void cfq_exit_icq(struct io_cq *icq) struct cfq_io_cq *cic = icq_to_cic(icq); struct cfq_data *cfqd = cic_to_cfqd(cic); - if (cic->cfqq[BLK_RW_ASYNC]) { - cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); - cic->cfqq[BLK_RW_ASYNC] = NULL; + if (cic_to_cfqq(cic, false)) { + cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false)); + cic_set_cfqq(cic, NULL, false); } - if (cic->cfqq[BLK_RW_SYNC]) { - cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); - cic->cfqq[BLK_RW_SYNC] = NULL; + if (cic_to_cfqq(cic, true)) { + cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true)); + cic_set_cfqq(cic, NULL, true); } } @@ -3572,18 +3680,14 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) return; - cfqq = cic->cfqq[BLK_RW_ASYNC]; + cfqq = cic_to_cfqq(cic, false); if (cfqq) { - struct cfq_queue *new_cfqq; - new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, - GFP_ATOMIC); - if (new_cfqq) { - cic->cfqq[BLK_RW_ASYNC] = new_cfqq; - cfq_put_queue(cfqq); - } + cfq_put_queue(cfqq); + cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio); + cic_set_cfqq(cic, cfqq, false); } - cfqq = cic->cfqq[BLK_RW_SYNC]; + cfqq = cic_to_cfqq(cic, true); if (cfqq) cfq_mark_cfqq_prio_changed(cfqq); @@ -3614,7 +3718,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { struct cfq_data 
*cfqd = cic_to_cfqd(cic); - struct cfq_queue *sync_cfqq; + struct cfq_queue *cfqq; uint64_t serial_nr; rcu_read_lock(); @@ -3628,15 +3732,22 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) return; - sync_cfqq = cic_to_cfqq(cic, 1); - if (sync_cfqq) { - /* - * Drop reference to sync queue. A new sync queue will be - * assigned in new group upon arrival of a fresh request. - */ - cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); - cic_set_cfqq(cic, NULL, 1); - cfq_put_queue(sync_cfqq); + /* + * Drop reference to queues. New queues will be assigned in new + * group upon arrival of fresh requests. + */ + cfqq = cic_to_cfqq(cic, false); + if (cfqq) { + cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, false); + cfq_put_queue(cfqq); + } + + cfqq = cic_to_cfqq(cic, true); + if (cfqq) { + cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, true); + cfq_put_queue(cfqq); } cic->blkcg_serial_nr = serial_nr; @@ -3645,81 +3756,19 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ -static struct cfq_queue * -cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio, gfp_t gfp_mask) -{ - struct blkcg *blkcg; - struct cfq_queue *cfqq, *new_cfqq = NULL; - struct cfq_group *cfqg; - -retry: - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); - if (!cfqg) { - cfqq = &cfqd->oom_cfqq; - goto out; - } - - cfqq = cic_to_cfqq(cic, is_sync); - - /* - * Always try a new alloc if we fell back to the OOM cfqq - * originally, since it should just be a temporary situation. 
- */ - if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = NULL; - if (new_cfqq) { - cfqq = new_cfqq; - new_cfqq = NULL; - } else if (gfp_mask & __GFP_WAIT) { - rcu_read_unlock(); - spin_unlock_irq(cfqd->queue->queue_lock); - new_cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_ZERO, - cfqd->queue->node); - spin_lock_irq(cfqd->queue->queue_lock); - if (new_cfqq) - goto retry; - else - return &cfqd->oom_cfqq; - } else { - cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_ZERO, - cfqd->queue->node); - } - - if (cfqq) { - cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); - cfq_init_prio_data(cfqq, cic); - cfq_link_cfqq_cfqg(cfqq, cfqg); - cfq_log_cfqq(cfqd, cfqq, "alloced"); - } else - cfqq = &cfqd->oom_cfqq; - } -out: - if (new_cfqq) - kmem_cache_free(cfq_pool, new_cfqq); - - rcu_read_unlock(); - return cfqq; -} - static struct cfq_queue ** -cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) +cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio) { switch (ioprio_class) { case IOPRIO_CLASS_RT: - return &cfqd->async_cfqq[0][ioprio]; + return &cfqg->async_cfqq[0][ioprio]; case IOPRIO_CLASS_NONE: ioprio = IOPRIO_NORM; /* fall through */ case IOPRIO_CLASS_BE: - return &cfqd->async_cfqq[1][ioprio]; + return &cfqg->async_cfqq[1][ioprio]; case IOPRIO_CLASS_IDLE: - return &cfqd->async_idle_cfqq; + return &cfqg->async_idle_cfqq; default: BUG(); } @@ -3727,12 +3776,20 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) static struct cfq_queue * cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio) { int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); struct cfq_queue **async_cfqq = NULL; - struct cfq_queue *cfqq = NULL; + struct cfq_queue *cfqq; + struct cfq_group *cfqg; + + rcu_read_lock(); + cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); + if (!cfqg) { + cfqq = &cfqd->oom_cfqq; + 
goto out; + } if (!is_sync) { if (!ioprio_valid(cic->ioprio)) { @@ -3740,22 +3797,32 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, ioprio = task_nice_ioprio(tsk); ioprio_class = task_nice_ioclass(tsk); } - async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); + async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio); cfqq = *async_cfqq; + if (cfqq) + goto out; } - if (!cfqq) - cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); + cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO, + cfqd->queue->node); + if (!cfqq) { + cfqq = &cfqd->oom_cfqq; + goto out; + } - /* - * pin the queue now that it's allocated, scheduler exit will prune it - */ - if (!is_sync && !(*async_cfqq)) { + cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); + cfq_init_prio_data(cfqq, cic); + cfq_link_cfqq_cfqg(cfqq, cfqg); + cfq_log_cfqq(cfqd, cfqq, "alloced"); + + if (async_cfqq) { + /* a new async queue is created, pin and remember */ cfqq->ref++; *async_cfqq = cfqq; } - +out: cfqq->ref++; + rcu_read_unlock(); return cfqq; } @@ -4289,8 +4356,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, const bool is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; - might_sleep_if(gfp_mask & __GFP_WAIT); - spin_lock_irq(q->queue_lock); check_ioprio_changed(cic, bio); @@ -4298,7 +4363,9 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); + if (cfqq) + cfq_put_queue(cfqq); + cfqq = cfq_get_queue(cfqd, is_sync, cic, bio); cic_set_cfqq(cic, cfqq, is_sync); } else { /* @@ -4404,21 +4471,6 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) cancel_work_sync(&cfqd->unplug_work); } -static void cfq_put_async_queues(struct cfq_data *cfqd) -{ - int i; - - for (i = 0; i < IOPRIO_BE_NR; i++) { - if (cfqd->async_cfqq[0][i]) - 
cfq_put_queue(cfqd->async_cfqq[0][i]); - if (cfqd->async_cfqq[1][i]) - cfq_put_queue(cfqd->async_cfqq[1][i]); - } - - if (cfqd->async_idle_cfqq) - cfq_put_queue(cfqd->async_idle_cfqq); -} - static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; @@ -4431,8 +4483,6 @@ static void cfq_exit_queue(struct elevator_queue *e) if (cfqd->active_queue) __cfq_slice_expired(cfqd, cfqd->active_queue, 0); - cfq_put_async_queues(cfqd); - spin_unlock_irq(q->queue_lock); cfq_shutdown_timer_wq(cfqd); @@ -4486,9 +4536,9 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) goto out_free; cfq_init_cfqg_base(cfqd->root_group); + cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; + cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; #endif - cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; - cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; /* * Not strictly needed (since RB_ROOT just clears the node and we @@ -4499,7 +4549,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) cfqd->prio_trees[i] = RB_ROOT; /* - * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. + * Our fallback cfqq if cfq_get_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow * will not attempt to free it. oom_cfqq is linked to root_group * but shouldn't hold a reference as it'll never be unlinked. 
Lose @@ -4683,13 +4733,18 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { - .pd_size = sizeof(struct cfq_group), - .cpd_size = sizeof(struct cfq_group_data), - .cftypes = cfq_blkcg_files, + .dfl_cftypes = cfq_blkcg_files, + .legacy_cftypes = cfq_blkcg_legacy_files, + .cpd_alloc_fn = cfq_cpd_alloc, .cpd_init_fn = cfq_cpd_init, + .cpd_free_fn = cfq_cpd_free, + .cpd_bind_fn = cfq_cpd_bind, + + .pd_alloc_fn = cfq_pd_alloc, .pd_init_fn = cfq_pd_init, .pd_offline_fn = cfq_pd_offline, + .pd_free_fn = cfq_pd_free, .pd_reset_stats_fn = cfq_pd_reset_stats, }; #endif diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ae0f438c2ee6..24489126f8ca 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -53,8 +53,6 @@ struct wb_writeback_work { unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ unsigned int auto_free:1; /* free on completion */ - unsigned int single_wait:1; - unsigned int single_done:1; enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ @@ -178,14 +176,11 @@ static void wb_wakeup(struct bdi_writeback *wb) static void wb_queue_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { - trace_writeback_queue(wb->bdi, work); + trace_writeback_queue(wb, work); spin_lock_bh(&wb->work_lock); - if (!test_bit(WB_registered, &wb->state)) { - if (work->single_wait) - work->single_done = 1; + if (!test_bit(WB_registered, &wb->state)) goto out_unlock; - } if (work->done) atomic_inc(&work->done->cnt); list_add_tail(&work->list, &wb->work_list); @@ -706,7 +701,7 @@ EXPORT_SYMBOL_GPL(wbc_account_io); /** * inode_congested - test whether an inode is congested - * @inode: inode to test for congestion + * @inode: inode to test for congestion (may be NULL) * @cong_bits: mask of WB_[a]sync_congested bits to test * * Tests whether @inode is congested. 
@cong_bits is the mask of congestion @@ -716,6 +711,9 @@ EXPORT_SYMBOL_GPL(wbc_account_io); * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg * associated with @inode is congested; otherwise, the root wb's congestion * state is used. + * + * @inode is allowed to be NULL as this function is often called on + * mapping->host which is NULL for the swapper space. */ int inode_congested(struct inode *inode, int cong_bits) { @@ -738,32 +736,6 @@ int inode_congested(struct inode *inode, int cong_bits) EXPORT_SYMBOL_GPL(inode_congested); /** - * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work - * @bdi: bdi the work item was issued to - * @work: work item to wait for - * - * Wait for the completion of @work which was issued to one of @bdi's - * bdi_writeback's. The caller must have set @work->single_wait before - * issuing it. This wait operates independently fo - * wb_wait_for_completion() and also disables automatic freeing of @work. - */ -static void wb_wait_for_single_work(struct backing_dev_info *bdi, - struct wb_writeback_work *work) -{ - if (WARN_ON_ONCE(!work->single_wait)) - return; - - wait_event(bdi->wb_waitq, work->single_done); - - /* - * Paired with smp_wmb() in wb_do_writeback() and ensures that all - * modifications to @work prior to assertion of ->single_done is - * visible to the caller once this function returns. - */ - smp_rmb(); -} - -/** * wb_split_bdi_pages - split nr_pages to write according to bandwidth * @wb: target bdi_writeback to split @nr_pages to * @nr_pages: number of pages to write for the whole bdi @@ -792,38 +764,6 @@ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) } /** - * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb - * @wb: target bdi_writeback - * @base_work: source wb_writeback_work - * - * Try to make a clone of @base_work and issue it to @wb. 
If cloning - * succeeds, %true is returned; otherwise, @base_work is issued directly - * and %false is returned. In the latter case, the caller is required to - * wait for @base_work's completion using wb_wait_for_single_work(). - * - * A clone is auto-freed on completion. @base_work never is. - */ -static bool wb_clone_and_queue_work(struct bdi_writeback *wb, - struct wb_writeback_work *base_work) -{ - struct wb_writeback_work *work; - - work = kmalloc(sizeof(*work), GFP_ATOMIC); - if (work) { - *work = *base_work; - work->auto_free = 1; - work->single_wait = 0; - } else { - work = base_work; - work->auto_free = 0; - work->single_wait = 1; - } - work->single_done = 0; - wb_queue_work(wb, work); - return work != base_work; -} - -/** * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi * @bdi: target backing_dev_info * @base_work: wb_writeback_work to issue @@ -838,15 +778,19 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, struct wb_writeback_work *base_work, bool skip_if_busy) { - long nr_pages = base_work->nr_pages; - int next_blkcg_id = 0; + int next_memcg_id = 0; struct bdi_writeback *wb; struct wb_iter iter; might_sleep(); restart: rcu_read_lock(); - bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) { + bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) { + DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done); + struct wb_writeback_work fallback_work; + struct wb_writeback_work *work; + long nr_pages; + /* SYNC_ALL writes out I_DIRTY_TIME too */ if (!wb_has_dirty_io(wb) && (base_work->sync_mode == WB_SYNC_NONE || @@ -855,13 +799,30 @@ restart: if (skip_if_busy && writeback_in_progress(wb)) continue; - base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages); - if (!wb_clone_and_queue_work(wb, base_work)) { - next_blkcg_id = wb->blkcg_css->id + 1; - rcu_read_unlock(); - wb_wait_for_single_work(bdi, base_work); - goto restart; + nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages); + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if 
(work) { + *work = *base_work; + work->nr_pages = nr_pages; + work->auto_free = 1; + wb_queue_work(wb, work); + continue; } + + /* alloc failed, execute synchronously using on-stack fallback */ + work = &fallback_work; + *work = *base_work; + work->nr_pages = nr_pages; + work->auto_free = 0; + work->done = &fallback_work_done; + + wb_queue_work(wb, work); + + next_memcg_id = wb->memcg_css->id + 1; + rcu_read_unlock(); + wb_wait_for_completion(bdi, &fallback_work_done); + goto restart; } rcu_read_unlock(); } @@ -902,8 +863,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) { base_work->auto_free = 0; - base_work->single_wait = 0; - base_work->single_done = 0; wb_queue_work(&bdi->wb, base_work); } } @@ -924,7 +883,7 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - trace_writeback_nowork(wb->bdi); + trace_writeback_nowork(wb); wb_wakeup(wb); return; } @@ -954,7 +913,7 @@ void wb_start_background_writeback(struct bdi_writeback *wb) * We just wake up the flusher thread. It will perform background * writeback as soon as there is no other work to do. */ - trace_writeback_wake_background(wb->bdi); + trace_writeback_wake_background(wb); wb_wakeup(wb); } @@ -1660,14 +1619,14 @@ static long wb_writeback(struct bdi_writeback *wb, } else if (work->for_background) oldest_jif = jiffies; - trace_writeback_start(wb->bdi, work); + trace_writeback_start(wb, work); if (list_empty(&wb->b_io)) queue_io(wb, work); if (work->sb) progress = writeback_sb_inodes(work->sb, wb, work); else progress = __writeback_inodes_wb(wb, work); - trace_writeback_written(wb->bdi, work); + trace_writeback_written(wb, work); wb_update_bandwidth(wb, wb_start); @@ -1692,7 +1651,7 @@ static long wb_writeback(struct bdi_writeback *wb, * we'll just busyloop. 
*/ if (!list_empty(&wb->b_more_io)) { - trace_writeback_wait(wb->bdi, work); + trace_writeback_wait(wb, work); inode = wb_inode(wb->b_more_io.prev); spin_lock(&inode->i_lock); spin_unlock(&wb->list_lock); @@ -1797,26 +1756,14 @@ static long wb_do_writeback(struct bdi_writeback *wb) set_bit(WB_writeback_running, &wb->state); while ((work = get_next_work_item(wb)) != NULL) { struct wb_completion *done = work->done; - bool need_wake_up = false; - trace_writeback_exec(wb->bdi, work); + trace_writeback_exec(wb, work); wrote += wb_writeback(wb, work); - if (work->single_wait) { - WARN_ON_ONCE(work->auto_free); - /* paired w/ rmb in wb_wait_for_single_work() */ - smp_wmb(); - work->single_done = 1; - need_wake_up = true; - } else if (work->auto_free) { + if (work->auto_free) kfree(work); - } - if (done && atomic_dec_and_test(&done->cnt)) - need_wake_up = true; - - if (need_wake_up) wake_up_all(&wb->bdi->wb_waitq); } diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 2d48d28e1640..91e004518237 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -92,6 +92,29 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) } /** + * kernfs_path_len - determine the length of the full path of a given node + * @kn: kernfs_node of interest + * + * The returned length doesn't include the space for the terminating '\0'. 
+ */ +size_t kernfs_path_len(struct kernfs_node *kn) +{ + size_t len = 0; + unsigned long flags; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + + do { + len += strlen(kn->name) + 1; + kn = kn->parent; + } while (kn && kn->parent); + + spin_unlock_irqrestore(&kernfs_rename_lock, flags); + + return len; +} + +/** * kernfs_path - build full path of a given node * @kn: kernfs_node of interest * @buf: buffer to copy @kn's name into diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0fe9df983ab7..5a5d79ee256f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -286,7 +286,7 @@ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi * %current's blkcg equals the effective blkcg of its memcg. No * need to use the relatively expensive cgroup_get_e_css(). */ - if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id))) + if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id))) return wb; return NULL; } @@ -402,7 +402,7 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) } struct wb_iter { - int start_blkcg_id; + int start_memcg_id; struct radix_tree_iter tree_iter; void **slot; }; @@ -414,9 +414,9 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter, WARN_ON_ONCE(!rcu_read_lock_held()); - if (iter->start_blkcg_id >= 0) { - iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id); - iter->start_blkcg_id = -1; + if (iter->start_memcg_id >= 0) { + iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id); + iter->start_memcg_id = -1; } else { iter->slot = radix_tree_next_slot(iter->slot, titer, 0); } @@ -430,30 +430,30 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter, static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter, struct backing_dev_info *bdi, - int start_blkcg_id) + int start_memcg_id) { - iter->start_blkcg_id = start_blkcg_id; + iter->start_memcg_id = 
start_memcg_id; - if (start_blkcg_id) + if (start_memcg_id) return __wb_iter_next(iter, bdi); else return &bdi->wb; } /** - * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order + * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order * @wb_cur: cursor struct bdi_writeback pointer * @bdi: bdi to walk wb's of * @iter: pointer to struct wb_iter to be used as iteration buffer - * @start_blkcg_id: blkcg ID to start iteration from + * @start_memcg_id: memcg ID to start iteration from * * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending - * blkcg ID order starting from @start_blkcg_id. @iter is struct wb_iter + * memcg ID order starting from @start_memcg_id. @iter is struct wb_iter * to be used as temp storage during iteration. rcu_read_lock() must be * held throughout iteration. */ -#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \ - for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id); \ +#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id) \ + for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id); \ (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi)) #else /* CONFIG_CGROUP_WRITEBACK */ diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index a4cd1641e9e2..0a5cc7a1109b 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -14,12 +14,15 @@ */ #include <linux/cgroup.h> -#include <linux/u64_stats_sync.h> +#include <linux/percpu_counter.h> #include <linux/seq_file.h> #include <linux/radix-tree.h> #include <linux/blkdev.h> #include <linux/atomic.h> +/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ +#define BLKG_STAT_CPU_BATCH (INT_MAX / 2) + /* Max limits for throttle policy */ #define THROTL_IOPS_MAX UINT_MAX @@ -45,7 +48,7 @@ struct blkcg { struct blkcg_gq *blkg_hint; struct hlist_head blkg_list; - struct blkcg_policy_data *pd[BLKCG_MAX_POLS]; + struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; 
#ifdef CONFIG_CGROUP_WRITEBACK @@ -53,14 +56,19 @@ struct blkcg { #endif }; +/* + * blkg_[rw]stat->aux_cnt is excluded for local stats but included for + * recursive. Used to carry stats of dead children, and, for blkg_rwstat, + * to carry result values from read and sum operations. + */ struct blkg_stat { - struct u64_stats_sync syncp; - uint64_t cnt; + struct percpu_counter cpu_cnt; + atomic64_t aux_cnt; }; struct blkg_rwstat { - struct u64_stats_sync syncp; - uint64_t cnt[BLKG_RWSTAT_NR]; + struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; + atomic64_t aux_cnt[BLKG_RWSTAT_NR]; }; /* @@ -68,32 +76,28 @@ struct blkg_rwstat { * request_queue (q). This is used by blkcg policies which need to track * information per blkcg - q pair. * - * There can be multiple active blkcg policies and each has its private - * data on each blkg, the size of which is determined by - * blkcg_policy->pd_size. blkcg core allocates and frees such areas - * together with blkg and invokes pd_init/exit_fn() methods. - * - * Such private data must embed struct blkg_policy_data (pd) at the - * beginning and pd_size can't be smaller than pd. + * There can be multiple active blkcg policies and each blkg:policy pair is + * represented by a blkg_policy_data which is allocated and freed by each + * policy's pd_alloc/free_fn() methods. A policy can allocate private data + * area by allocating larger data structure which embeds blkg_policy_data + * at the beginning. */ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; - - /* used during policy activation */ - struct list_head alloc_node; }; /* - * Policies that need to keep per-blkcg data which is independent - * from any request_queue associated to it must specify its size - * with the cpd_size field of the blkcg_policy structure and - * embed a blkcg_policy_data in it. cpd_init() is invoked to let - * each policy handle per-blkcg data. 
+ * Policies that need to keep per-blkcg data which is independent from any + * request_queue associated to it should implement cpd_alloc/free_fn() + * methods. A policy can allocate private data area by allocating larger + * data structure which embeds blkcg_policy_data at the beginning. + * cpd_init() is invoked to let each policy handle per-blkcg data. */ struct blkcg_policy_data { - /* the policy id this per-policy data belongs to */ + /* the blkcg and policy id this per-policy data belongs to */ + struct blkcg *blkcg; int plid; }; @@ -123,40 +127,50 @@ struct blkcg_gq { /* is this blkg online? protected by both blkcg and q locks */ bool online; + struct blkg_rwstat stat_bytes; + struct blkg_rwstat stat_ios; + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; struct rcu_head rcu_head; }; -typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); -typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); +typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); +typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); +typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); +typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); struct blkcg_policy { int plid; - /* policy specific private data size */ - size_t pd_size; - /* policy 
specific per-blkcg data size */ - size_t cpd_size; /* cgroup files for the policy */ - struct cftype *cftypes; + struct cftype *dfl_cftypes; + struct cftype *legacy_cftypes; /* operations */ + blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_init_cpd_fn *cpd_init_fn; + blkcg_pol_free_cpd_fn *cpd_free_fn; + blkcg_pol_bind_cpd_fn *cpd_bind_fn; + + blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; - blkcg_pol_exit_pd_fn *pd_exit_fn; + blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; }; extern struct blkcg blkcg_root; extern struct cgroup_subsys_state * const blkcg_root_css; -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q); int blkcg_init_queue(struct request_queue *q); @@ -171,6 +185,7 @@ int blkcg_activate_policy(struct request_queue *q, void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol); +const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), @@ -182,19 +197,24 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); +int blkg_print_stat_bytes(struct seq_file *sf, void *v); +int blkg_print_stat_ios(struct seq_file *sf, void *v); +int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v); +int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v); -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); -struct blkg_rwstat blkg_rwstat_recursive_sum(struct 
blkg_policy_data *pd, - int off); +u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off); +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off); struct blkg_conf_ctx { struct gendisk *disk; struct blkcg_gq *blkg; - u64 v; + char *body; }; int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx); + char *input, struct blkg_conf_ctx *ctx); void blkg_conf_finish(struct blkg_conf_ctx *ctx); @@ -205,7 +225,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) static inline struct blkcg *task_blkcg(struct task_struct *tsk) { - return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); + return css_to_blkcg(task_css(tsk, io_cgrp_id)); } static inline struct blkcg *bio_blkcg(struct bio *bio) @@ -218,7 +238,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) static inline struct cgroup_subsys_state * task_get_blkcg_css(struct task_struct *task) { - return task_get_css(task, blkio_cgrp_id); + return task_get_css(task, io_cgrp_id); } /** @@ -233,6 +253,52 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) } /** + * __blkg_lookup - internal version of blkg_lookup() + * @blkcg: blkcg of interest + * @q: request_queue of interest + * @update_hint: whether to update lookup hint with the result or not + * + * This is internal version and shouldn't be used by policy + * implementations. Looks up blkgs for the @blkcg - @q pair regardless of + * @q's bypass state. If @update_hint is %true, the caller should be + * holding @q->queue_lock and lookup hint is updated on success. 
+ */ +static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, + struct request_queue *q, + bool update_hint) +{ + struct blkcg_gq *blkg; + + if (blkcg == &blkcg_root) + return q->root_blkg; + + blkg = rcu_dereference(blkcg->blkg_hint); + if (blkg && blkg->q == q) + return blkg; + + return blkg_lookup_slowpath(blkcg, q, update_hint); +} + +/** + * blkg_lookup - lookup blkg for the specified blkcg - q pair + * @blkcg: blkcg of interest + * @q: request_queue of interest + * + * Lookup blkg for the @blkcg - @q pair. This function should be called + * under RCU read lock and is guaranteed to return %NULL if @q is bypassing + * - see blk_queue_bypass_start() for details. + */ +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, + struct request_queue *q) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (unlikely(blk_queue_bypass(q))) + return NULL; + return __blkg_lookup(blkcg, q, false); +} + +/** * blkg_to_pdata - get policy private data * @blkg: blkg of interest * @pol: policy of interest @@ -248,7 +314,7 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, struct blkcg_policy *pol) { - return blkcg ? blkcg->pd[pol->plid] : NULL; + return blkcg ? blkcg->cpd[pol->plid] : NULL; } /** @@ -262,6 +328,11 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) return pd ? pd->blkg : NULL; } +static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) +{ + return cpd ? 
cpd->blkcg : NULL; +} + /** * blkg_path - format cgroup path of blkg * @blkg: blkg of interest @@ -309,9 +380,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) call_rcu(&blkg->rcu_head, __blkg_release_rcu); } -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint); - /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant @@ -373,8 +441,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q, * or if either the blkcg or queue is going away. Fall back to * root_rl in such cases. */ - blkg = blkg_lookup_create(blkcg, q); - if (IS_ERR(blkg)) + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) goto root_rl; blkg_get(blkg); @@ -394,8 +462,7 @@ root_rl: */ static inline void blk_put_rl(struct request_list *rl) { - /* root_rl may not have blkg set */ - if (rl->blkg && rl->blkg->blkcg != &blkcg_root) + if (rl->blkg->blkcg != &blkcg_root) blkg_put(rl->blkg); } @@ -433,9 +500,21 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl, #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) -static inline void blkg_stat_init(struct blkg_stat *stat) +static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp) { - u64_stats_init(&stat->syncp); + int ret; + + ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp); + if (ret) + return ret; + + atomic64_set(&stat->aux_cnt, 0); + return 0; +} + +static inline void blkg_stat_exit(struct blkg_stat *stat) +{ + percpu_counter_destroy(&stat->cpu_cnt); } /** @@ -443,34 +522,21 @@ static inline void blkg_stat_init(struct blkg_stat *stat) * @stat: target blkg_stat * @val: value to add * - * Add @val to @stat. The caller is responsible for synchronizing calls to - * this function. + * Add @val to @stat. The caller must ensure that IRQ on the same CPU + * don't re-enter this function for the same counter. 
*/ static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) { - u64_stats_update_begin(&stat->syncp); - stat->cnt += val; - u64_stats_update_end(&stat->syncp); + __percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH); } /** * blkg_stat_read - read the current value of a blkg_stat * @stat: blkg_stat to read - * - * Read the current value of @stat. This function can be called without - * synchroniztion and takes care of u64 atomicity. */ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) { - unsigned int start; - uint64_t v; - - do { - start = u64_stats_fetch_begin_irq(&stat->syncp); - v = stat->cnt; - } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); - - return v; + return percpu_counter_sum_positive(&stat->cpu_cnt); } /** @@ -479,24 +545,46 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) */ static inline void blkg_stat_reset(struct blkg_stat *stat) { - stat->cnt = 0; + percpu_counter_set(&stat->cpu_cnt, 0); + atomic64_set(&stat->aux_cnt, 0); } /** - * blkg_stat_merge - merge a blkg_stat into another + * blkg_stat_add_aux - add a blkg_stat into another's aux count * @to: the destination blkg_stat * @from: the source * - * Add @from's count to @to. + * Add @from's count including the aux one to @to's aux count. 
*/ -static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) +static inline void blkg_stat_add_aux(struct blkg_stat *to, + struct blkg_stat *from) { - blkg_stat_add(to, blkg_stat_read(from)); + atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt), + &to->aux_cnt); } -static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) +static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) { - u64_stats_init(&rwstat->syncp); + int i, ret; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); + if (ret) { + while (--i >= 0) + percpu_counter_destroy(&rwstat->cpu_cnt[i]); + return ret; + } + atomic64_set(&rwstat->aux_cnt[i], 0); + } + return 0; +} + +static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat) +{ + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + percpu_counter_destroy(&rwstat->cpu_cnt[i]); } /** @@ -511,39 +599,38 @@ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, int rw, uint64_t val) { - u64_stats_update_begin(&rwstat->syncp); + struct percpu_counter *cnt; if (rw & REQ_WRITE) - rwstat->cnt[BLKG_RWSTAT_WRITE] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; else - rwstat->cnt[BLKG_RWSTAT_READ] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; + + __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); + if (rw & REQ_SYNC) - rwstat->cnt[BLKG_RWSTAT_SYNC] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; else - rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; - u64_stats_update_end(&rwstat->syncp); + __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); } /** * blkg_rwstat_read - read the current values of a blkg_rwstat * @rwstat: blkg_rwstat to read * - * Read the current snapshot of @rwstat and return it as the return value. - * This function can be called without synchronization and takes care of - * u64 atomicity. 
+ * Read the current snapshot of @rwstat and return it in the aux counts. */ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) { - unsigned int start; - struct blkg_rwstat tmp; - - do { - start = u64_stats_fetch_begin_irq(&rwstat->syncp); - tmp = *rwstat; - } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); + struct blkg_rwstat result; + int i; - return tmp; + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_set(&result.aux_cnt[i], + percpu_counter_sum_positive(&rwstat->cpu_cnt[i])); + return result; } /** @@ -558,7 +645,8 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) { struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); - return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; + return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); } /** @@ -567,26 +655,71 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) */ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) { - memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + percpu_counter_set(&rwstat->cpu_cnt[i], 0); + atomic64_set(&rwstat->aux_cnt[i], 0); + } } /** - * blkg_rwstat_merge - merge a blkg_rwstat into another + * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count * @to: the destination blkg_rwstat * @from: the source * - * Add @from's counts to @to. + * Add @from's count including the aux one to @to's aux count. 
*/ -static inline void blkg_rwstat_merge(struct blkg_rwstat *to, - struct blkg_rwstat *from) +static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, + struct blkg_rwstat *from) { struct blkg_rwstat v = blkg_rwstat_read(from); int i; - u64_stats_update_begin(&to->syncp); for (i = 0; i < BLKG_RWSTAT_NR; i++) - to->cnt[i] += v.cnt[i]; - u64_stats_update_end(&to->syncp); + atomic64_add(atomic64_read(&v.aux_cnt[i]) + + atomic64_read(&from->aux_cnt[i]), + &to->aux_cnt[i]); +} + +#ifdef CONFIG_BLK_DEV_THROTTLING +extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio); +#else +static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio) { return false; } +#endif + +static inline bool blkcg_bio_issue_check(struct request_queue *q, + struct bio *bio) +{ + struct blkcg *blkcg; + struct blkcg_gq *blkg; + bool throtl = false; + + rcu_read_lock(); + blkcg = bio_blkcg(bio); + + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) { + spin_lock_irq(q->queue_lock); + blkg = blkg_lookup_create(blkcg, q); + if (IS_ERR(blkg)) + blkg = NULL; + spin_unlock_irq(q->queue_lock); + } + + throtl = blk_throtl_bio(q, blkg, bio); + + if (!throtl) { + blkg = blkg ?: q->root_blkg; + blkg_rwstat_add(&blkg->stat_bytes, bio->bi_flags, + bio->bi_iter.bi_size); + blkg_rwstat_add(&blkg->stat_ios, bio->bi_flags, 1); + } + + rcu_read_unlock(); + return !throtl; } #else /* CONFIG_BLK_CGROUP */ @@ -642,6 +775,9 @@ static inline void blk_put_rl(struct request_list *rl) { } static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } +static inline bool blkcg_bio_issue_check(struct request_queue *q, + struct bio *bio) { return true; } + #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 
1f36945fd23d..1a96fdaa33d5 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -27,7 +27,7 @@ SUBSYS(cpuacct) #endif #if IS_ENABLED(CONFIG_BLK_CGROUP) -SUBSYS(blkio) +SUBSYS(io) #endif #if IS_ENABLED(CONFIG_MEMCG) diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 123be25ea15a..5d4e9c4b821d 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -266,6 +266,7 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) } int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); +size_t kernfs_path_len(struct kernfs_node *kn); char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen); void pr_cont_kernfs_name(struct kernfs_node *kn); @@ -332,6 +333,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { return -ENOSYS; } +static inline size_t kernfs_path_len(struct kernfs_node *kn) +{ return 0; } + static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) { return NULL; } diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index a7aa607a4c55..fff846b512e6 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -131,6 +131,66 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, TP_ARGS(inode, flags) ); +#ifdef CREATE_TRACE_POINTS +#ifdef CONFIG_CGROUP_WRITEBACK + +static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb) +{ + return kernfs_path_len(wb->memcg_css->cgroup->kn) + 1; +} + +static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb) +{ + struct cgroup *cgrp = wb->memcg_css->cgroup; + char *path; + + path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1); + WARN_ON_ONCE(path != buf); +} + +static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc) +{ + if (wbc->wb) + return 
__trace_wb_cgroup_size(wbc->wb); + else + return 2; +} + +static inline void __trace_wbc_assign_cgroup(char *buf, + struct writeback_control *wbc) +{ + if (wbc->wb) + __trace_wb_assign_cgroup(buf, wbc->wb); + else + strcpy(buf, "/"); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb) +{ + return 2; +} + +static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb) +{ + strcpy(buf, "/"); +} + +static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc) +{ + return 2; +} + +static inline void __trace_wbc_assign_cgroup(char *buf, + struct writeback_control *wbc) +{ + strcpy(buf, "/"); +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ +#endif /* CREATE_TRACE_POINTS */ + DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc), @@ -141,6 +201,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, __array(char, name, 32) __field(unsigned long, ino) __field(int, sync_mode) + __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc)) ), TP_fast_assign( @@ -148,12 +209,14 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; + __trace_wbc_assign_cgroup(__get_str(cgroup), wbc); ), - TP_printk("bdi %s: ino=%lu sync_mode=%d", + TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s", __entry->name, __entry->ino, - __entry->sync_mode + __entry->sync_mode, + __get_str(cgroup) ) ); @@ -172,8 +235,8 @@ DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode, ); DECLARE_EVENT_CLASS(writeback_work_class, - TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), - TP_ARGS(bdi, work), + TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), + TP_ARGS(wb, work), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_pages) @@ -183,10 +246,11 @@ DECLARE_EVENT_CLASS(writeback_work_class, 
__field(int, range_cyclic) __field(int, for_background) __field(int, reason) + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) ), TP_fast_assign( strncpy(__entry->name, - bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); + wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; @@ -194,9 +258,10 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->range_cyclic = work->range_cyclic; __entry->for_background = work->for_background; __entry->reason = work->reason; + __trace_wb_assign_cgroup(__get_str(cgroup), wb); ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " - "kupdate=%d range_cyclic=%d background=%d reason=%s", + "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, @@ -204,13 +269,14 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->for_kupdate, __entry->range_cyclic, __entry->for_background, - __print_symbolic(__entry->reason, WB_WORK_REASON) + __print_symbolic(__entry->reason, WB_WORK_REASON), + __get_str(cgroup) ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ DEFINE_EVENT(writeback_work_class, name, \ - TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ - TP_ARGS(bdi, work)) + TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \ + TP_ARGS(wb, work)) DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); @@ -230,26 +296,42 @@ TRACE_EVENT(writeback_pages_written, ); DECLARE_EVENT_CLASS(writeback_class, - TP_PROTO(struct backing_dev_info *bdi), - TP_ARGS(bdi), + TP_PROTO(struct bdi_writeback *wb), + TP_ARGS(wb), TP_STRUCT__entry( __array(char, name, 32) + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + 
strncpy(__entry->name, dev_name(wb->bdi->dev), 32); + __trace_wb_assign_cgroup(__get_str(cgroup), wb); ), - TP_printk("bdi %s", - __entry->name + TP_printk("bdi %s: cgroup=%s", + __entry->name, + __get_str(cgroup) ) ); #define DEFINE_WRITEBACK_EVENT(name) \ DEFINE_EVENT(writeback_class, name, \ - TP_PROTO(struct backing_dev_info *bdi), \ - TP_ARGS(bdi)) + TP_PROTO(struct bdi_writeback *wb), \ + TP_ARGS(wb)) DEFINE_WRITEBACK_EVENT(writeback_nowork); DEFINE_WRITEBACK_EVENT(writeback_wake_background); -DEFINE_WRITEBACK_EVENT(writeback_bdi_register); + +TRACE_EVENT(writeback_bdi_register, + TP_PROTO(struct backing_dev_info *bdi), + TP_ARGS(bdi), + TP_STRUCT__entry( + __array(char, name, 32) + ), + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + ), + TP_printk("bdi %s", + __entry->name + ) +); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), @@ -265,6 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class, __field(int, range_cyclic) __field(long, range_start) __field(long, range_end) + __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc)) ), TP_fast_assign( @@ -278,11 +361,12 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->range_cyclic = wbc->range_cyclic; __entry->range_start = (long)wbc->range_start; __entry->range_end = (long)wbc->range_end; + __trace_wbc_assign_cgroup(__get_str(cgroup), wbc); ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " "bgrd=%d reclm=%d cyclic=%d " - "start=0x%lx end=0x%lx", + "start=0x%lx end=0x%lx cgroup=%s", __entry->name, __entry->nr_to_write, __entry->pages_skipped, @@ -292,7 +376,9 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->for_reclaim, __entry->range_cyclic, __entry->range_start, - __entry->range_end) + __entry->range_end, + __get_str(cgroup) + ) ) #define DEFINE_WBC_EVENT(name) \ @@ -312,6 +398,7 @@ TRACE_EVENT(writeback_queue_io, __field(long, age) __field(int, moved) __field(int, reason) + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) ), TP_fast_assign( 
unsigned long *older_than_this = work->older_than_this; @@ -321,13 +408,15 @@ TRACE_EVENT(writeback_queue_io, (jiffies - *older_than_this) * 1000 / HZ : -1; __entry->moved = moved; __entry->reason = work->reason; + __trace_wb_assign_cgroup(__get_str(cgroup), wb); ), - TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s", + TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s", __entry->name, __entry->older, /* older_than_this in jiffies */ __entry->age, /* older_than_this in relative milliseconds */ __entry->moved, - __print_symbolic(__entry->reason, WB_WORK_REASON) + __print_symbolic(__entry->reason, WB_WORK_REASON), + __get_str(cgroup) ) ); @@ -381,11 +470,11 @@ TRACE_EVENT(global_dirty_state, TRACE_EVENT(bdi_dirty_ratelimit, - TP_PROTO(struct backing_dev_info *bdi, + TP_PROTO(struct bdi_writeback *wb, unsigned long dirty_rate, unsigned long task_ratelimit), - TP_ARGS(bdi, dirty_rate, task_ratelimit), + TP_ARGS(wb, dirty_rate, task_ratelimit), TP_STRUCT__entry( __array(char, bdi, 32) @@ -395,36 +484,39 @@ TRACE_EVENT(bdi_dirty_ratelimit, __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned long, balanced_dirty_ratelimit) + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) ), TP_fast_assign( - strlcpy(__entry->bdi, dev_name(bdi->dev), 32); - __entry->write_bw = KBps(bdi->wb.write_bandwidth); - __entry->avg_write_bw = KBps(bdi->wb.avg_write_bandwidth); + strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32); + __entry->write_bw = KBps(wb->write_bandwidth); + __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); - __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit); + __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->balanced_dirty_ratelimit = - KBps(bdi->wb.balanced_dirty_ratelimit); + KBps(wb->balanced_dirty_ratelimit); + __trace_wb_assign_cgroup(__get_str(cgroup), wb); ), TP_printk("bdi %s: 
" "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " - "balanced_dirty_ratelimit=%lu", + "balanced_dirty_ratelimit=%lu cgroup=%s", __entry->bdi, __entry->write_bw, /* write bandwidth */ __entry->avg_write_bw, /* avg write bandwidth */ __entry->dirty_rate, /* bdi dirty rate */ __entry->dirty_ratelimit, /* base ratelimit */ __entry->task_ratelimit, /* ratelimit with position control */ - __entry->balanced_dirty_ratelimit /* the balanced ratelimit */ + __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ + __get_str(cgroup) ) ); TRACE_EVENT(balance_dirty_pages, - TP_PROTO(struct backing_dev_info *bdi, + TP_PROTO(struct bdi_writeback *wb, unsigned long thresh, unsigned long bg_thresh, unsigned long dirty, @@ -437,7 +529,7 @@ TRACE_EVENT(balance_dirty_pages, long pause, unsigned long start_time), - TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, + TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, dirty_ratelimit, task_ratelimit, dirtied, period, pause, start_time), @@ -456,11 +548,12 @@ TRACE_EVENT(balance_dirty_pages, __field( long, pause) __field(unsigned long, period) __field( long, think) + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) ), TP_fast_assign( unsigned long freerun = (thresh + bg_thresh) / 2; - strlcpy(__entry->bdi, dev_name(bdi->dev), 32); + strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32); __entry->limit = global_wb_domain.dirty_limit; __entry->setpoint = (global_wb_domain.dirty_limit + @@ -478,6 +571,7 @@ TRACE_EVENT(balance_dirty_pages, __entry->period = period * 1000 / HZ; __entry->pause = pause * 1000 / HZ; __entry->paused = (jiffies - start_time) * 1000 / HZ; + __trace_wb_assign_cgroup(__get_str(cgroup), wb); ), @@ -486,7 +580,7 @@ TRACE_EVENT(balance_dirty_pages, "bdi_setpoint=%lu bdi_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " - "paused=%lu pause=%ld period=%lu think=%ld", + "paused=%lu pause=%ld period=%lu think=%ld 
cgroup=%s", __entry->bdi, __entry->limit, __entry->setpoint, @@ -500,7 +594,8 @@ TRACE_EVENT(balance_dirty_pages, __entry->paused, /* ms */ __entry->pause, /* ms */ __entry->period, /* ms */ - __entry->think /* ms */ + __entry->think, /* ms */ + __get_str(cgroup) ) ); @@ -514,6 +609,8 @@ TRACE_EVENT(writeback_sb_inodes_requeue, __field(unsigned long, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) + __dynamic_array(char, cgroup, + __trace_wb_cgroup_size(inode_to_wb(inode))) ), TP_fast_assign( @@ -522,14 +619,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue, __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; + __trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode)); ), - TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu", + TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s", __entry->name, __entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, - (jiffies - __entry->dirtied_when) / HZ + (jiffies - __entry->dirtied_when) / HZ, + __get_str(cgroup) ) ); @@ -585,6 +684,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, __field(unsigned long, writeback_index) __field(long, nr_to_write) __field(unsigned long, wrote) + __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc)) ), TP_fast_assign( @@ -596,10 +696,11 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, __entry->writeback_index = inode->i_mapping->writeback_index; __entry->nr_to_write = nr_to_write; __entry->wrote = nr_to_write - wbc->nr_to_write; + __trace_wbc_assign_cgroup(__get_str(cgroup), wbc); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " - "index=%lu to_write=%ld wrote=%lu", + "index=%lu to_write=%ld wrote=%lu cgroup=%s", __entry->name, __entry->ino, show_inode_state(__entry->state), @@ -607,7 +708,8 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, (jiffies - __entry->dirtied_when) / HZ, __entry->writeback_index, __entry->nr_to_write, - 
__entry->wrote + __entry->wrote, + __get_str(cgroup) ) ); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index ee8d7fd07be3..2df8ddcb0ca0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -523,7 +523,7 @@ static int cgwb_create(struct backing_dev_info *bdi, int ret = 0; memcg = mem_cgroup_from_css(memcg_css); - blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys); + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); blkcg = css_to_blkcg(blkcg_css); memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); blkcg_cgwb_list = &blkcg->cgwb_list; @@ -645,7 +645,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, /* see whether the blkcg association has changed */ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, - &blkio_cgrp_subsys); + &io_cgrp_subsys); if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) wb = NULL; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5cccc127ef81..0a931cdd4f6b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1289,7 +1289,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, wb->dirty_ratelimit = max(dirty_ratelimit, 1UL); wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; - trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit); + trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit); } static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, @@ -1683,7 +1683,7 @@ static void balance_dirty_pages(struct address_space *mapping, * do a reset, as it may be a light dirtier. */ if (pause < min_pause) { - trace_balance_dirty_pages(bdi, + trace_balance_dirty_pages(wb, sdtc->thresh, sdtc->bg_thresh, sdtc->dirty, @@ -1712,7 +1712,7 @@ static void balance_dirty_pages(struct address_space *mapping, } pause: - trace_balance_dirty_pages(bdi, + trace_balance_dirty_pages(wb, sdtc->thresh, sdtc->bg_thresh, sdtc->dirty, |