diff options
author | Linus Torvalds | 2019-09-17 16:57:47 -0700 |
---|---|---|
committer | Linus Torvalds | 2019-09-17 16:57:47 -0700 |
commit | 7ad67ca5534ee7c958559c4ad610f05c4578e361 (patch) | |
tree | dc6b6a8a6b70b5f25b07bcdc06d8e77e705f6822 /mm | |
parent | 5260c2b863ef1152445ce93476c95d8c8a727eef (diff) | |
parent | 9c7eddf1b080f98fed1aadb74fe784f29bf77a08 (diff) |
Merge tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
- Two NVMe pull requests:
- ana log parse fix from Anton
- nvme quirks support for Apple devices from Ben
- fix missing bio completion tracing for multipath stack devices
from Hannes and Mikhail
- IP TOS settings for nvme rdma and tcp transports from Israel
- rq_dma_dir cleanups from Israel
- tracing for Get LBA Status command from Minwoo
- Some nvme-tcp cleanups from Minwoo, Potnuri and Myself
- Some consolidation between the fabrics transports for handling
the CAP register
- reset race with ns scanning fix for fabrics (move fabrics
commands to a dedicated request queue with a different lifetime
from the admin request queue)."
- controller reset and namespace scan races fixes
- nvme discovery log change uevent support
- naming improvements from Keith
- multiple discovery controllers reject fix from James
- some regular cleanups from various people
- Series fixing (and re-fixing) null_blk debug printing and nr_devices
checks (André)
- A few pull requests from Song, with fixes from Andy, Guoqing,
Guilherme, Neil, Nigel, and Yufen.
- REQ_OP_ZONE_RESET_ALL support (Chaitanya)
- Bio merge handling unification (Christoph)
- Pick default elevator correctly for devices with special needs
(Damien)
- Block stats fixes (Hou)
- Timeout and support devices nbd fixes (Mike)
- Series fixing races around elevator switching and device add/remove
(Ming)
- sed-opal cleanups (Revanth)
- Per device weight support for BFQ (Fam)
- Support for blk-iocost, a new model that can properly account cost of
IO workloads. (Tejun)
- blk-cgroup writeback fixes (Tejun)
- paride queue init fixes (zhengbin)
- blk_set_runtime_active() cleanup (Stanley)
- Block segment mapping optimizations (Bart)
- lightnvm fixes (Hans/Minwoo/YueHaibing)
- Various little fixes and cleanups
* tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block: (186 commits)
null_blk: format pr_* logs with pr_fmt
null_blk: match the type of parameter nr_devices
null_blk: do not fail the module load with zero devices
block: also check RQF_STATS in blk_mq_need_time_stamp()
block: make rq sector size accessible for block stats
bfq: Fix bfq linkage error
raid5: use bio_end_sector in r5_next_bio
raid5: remove STRIPE_OPS_REQ_PENDING
md: add feature flag MD_FEATURE_RAID0_LAYOUT
md/raid0: avoid RAID0 data corruption due to layout confusion.
raid5: don't set STRIPE_HANDLE to stripe which is in batch list
raid5: don't increment read_errors on EILSEQ return
nvmet: fix a wrong error status returned in error log page
nvme: send discovery log page change events to userspace
nvme: add uevent variables for controller devices
nvme: enable aen regardless of the presence of I/O queues
nvme-fabrics: allow discovery subsystems accept a kato
nvmet: Use PTR_ERR_OR_ZERO() in nvmet_init_discovery()
nvme: Remove redundant assignment of cq vector
nvme: Assign subsys instance from first ctrl
...
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 120 | ||||
-rw-r--r-- | mm/memcontrol.c | 139 | ||||
-rw-r--r-- | mm/page-writeback.c | 4 |
3 files changed, 243 insertions, 20 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e8e89158adec..d9daa3e422d0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/wait.h> +#include <linux/rbtree.h> #include <linux/backing-dev.h> #include <linux/kthread.h> #include <linux/freezer.h> @@ -22,10 +23,12 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; /* - * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side - * locking. + * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU + * reader side locking. */ DEFINE_SPINLOCK(bdi_lock); +static u64 bdi_id_cursor; +static struct rb_root bdi_tree = RB_ROOT; LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ @@ -615,13 +618,12 @@ out_put: } /** - * wb_get_create - get wb for a given memcg, create if necessary + * wb_get_lookup - get wb for a given memcg * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) - * @gfp: allocation mask to use * - * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to - * create one. The returned wb has its refcount incremented. + * Try to get the wb for @memcg_css on @bdi. The returned wb has its + * refcount incremented. * * This function uses css_get() on @memcg_css and thus expects its refcnt * to be positive on invocation. IOW, rcu_read_lock() protection on @@ -638,6 +640,39 @@ out_put: * each lookup. On mismatch, the existing wb is discarded and a new one is * created. */ +struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css) +{ + struct bdi_writeback *wb; + + if (!memcg_css->parent) + return &bdi->wb; + + rcu_read_lock(); + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + if (wb) { + struct cgroup_subsys_state *blkcg_css; + + /* see whether the blkcg association has changed */ + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); + if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) + wb = NULL; + css_put(blkcg_css); + } + rcu_read_unlock(); + + return wb; +} + +/** + * wb_get_create - get wb for a given memcg, create if necessary + * @bdi: target bdi + * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) + * @gfp: allocation mask to use + * + * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to + * create one. See wb_get_lookup() for more details. + */ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) @@ -650,20 +685,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, return &bdi->wb; do { - rcu_read_lock(); - wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); - if (wb) { - struct cgroup_subsys_state *blkcg_css; - - /* see whether the blkcg association has changed */ - blkcg_css = cgroup_get_e_css(memcg_css->cgroup, - &io_cgrp_subsys); - if (unlikely(wb->blkcg_css != blkcg_css || - !wb_tryget(wb))) - wb = NULL; - css_put(blkcg_css); - } - rcu_read_unlock(); + wb = wb_get_lookup(bdi, memcg_css); } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); return wb; @@ -859,9 +881,58 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) } EXPORT_SYMBOL(bdi_alloc_node); +static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp) +{ + struct rb_node **p = &bdi_tree.rb_node; + struct rb_node *parent = NULL; + struct backing_dev_info *bdi; + + lockdep_assert_held(&bdi_lock); + + while (*p) { + parent = *p; + bdi = rb_entry(parent, struct backing_dev_info, rb_node); + + if (bdi->id > id) + p = &(*p)->rb_left; + else if (bdi->id < id) + p = &(*p)->rb_right; + else + break; + } + + if (parentp) + *parentp = parent; + return p; +} + +/** + * bdi_get_by_id - lookup and get bdi from its id + * @id: bdi id to lookup + * + * Find bdi matching @id and get it. Returns NULL if the matching bdi + * doesn't exist or is already unregistered. + */ +struct backing_dev_info *bdi_get_by_id(u64 id) +{ + struct backing_dev_info *bdi = NULL; + struct rb_node **p; + + spin_lock_bh(&bdi_lock); + p = bdi_lookup_rb_node(id, NULL); + if (*p) { + bdi = rb_entry(*p, struct backing_dev_info, rb_node); + bdi_get(bdi); + } + spin_unlock_bh(&bdi_lock); + + return bdi; +} + int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) { struct device *dev; + struct rb_node *parent, **p; if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; @@ -877,7 +948,15 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) set_bit(WB_registered, &bdi->wb.state); spin_lock_bh(&bdi_lock); + + bdi->id = ++bdi_id_cursor; + + p = bdi_lookup_rb_node(bdi->id, &parent); + rb_link_node(&bdi->rb_node, parent, p); + rb_insert_color(&bdi->rb_node, &bdi_tree); + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); trace_writeback_bdi_register(bdi); @@ -918,6 +997,7 @@ EXPORT_SYMBOL(bdi_register_owner); static void bdi_remove_from_list(struct backing_dev_info *bdi) { spin_lock_bh(&bdi_lock); + rb_erase(&bdi->rb_node, &bdi_tree); list_del_rcu(&bdi->bdi_list); spin_unlock_bh(&bdi_lock); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9ec5e12486a7..597d58101872 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -87,6 +87,10 @@ int do_swap_account __read_mostly; #define do_swap_account 0 #endif +#ifdef CONFIG_CGROUP_WRITEBACK +static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); +#endif + /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { @@ -4172,6 +4176,8 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, #ifdef CONFIG_CGROUP_WRITEBACK +#include <trace/events/writeback.h> + static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) { return wb_domain_init(&memcg->cgwb_domain, gfp); @@ -4255,6 +4261,130 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, } } +/* + * Foreign dirty flushing + * + * There's an inherent mismatch between memcg and writeback. The former + * trackes ownership per-page while the latter per-inode. This was a + * deliberate design decision because honoring per-page ownership in the + * writeback path is complicated, may lead to higher CPU and IO overheads + * and deemed unnecessary given that write-sharing an inode across + * different cgroups isn't a common use-case. + * + * Combined with inode majority-writer ownership switching, this works well + * enough in most cases but there are some pathological cases. For + * example, let's say there are two cgroups A and B which keep writing to + * different but confined parts of the same inode. B owns the inode and + * A's memory is limited far below B's. A's dirty ratio can rise enough to + * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid + * triggering background writeback. A will be slowed down without a way to + * make writeback of the dirty pages happen. + * + * Conditions like the above can lead to a cgroup getting repatedly and + * severely throttled after making some progress after each + * dirty_expire_interval while the underyling IO device is almost + * completely idle. + * + * Solving this problem completely requires matching the ownership tracking + * granularities between memcg and writeback in either direction. However, + * the more egregious behaviors can be avoided by simply remembering the + * most recent foreign dirtying events and initiating remote flushes on + * them when local writeback isn't enough to keep the memory clean enough. + * + * The following two functions implement such mechanism. When a foreign + * page - a page whose memcg and writeback ownerships don't match - is + * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning + * bdi_writeback on the page owning memcg. When balance_dirty_pages() + * decides that the memcg needs to sleep due to high dirty ratio, it calls + * mem_cgroup_flush_foreign() which queues writeback on the recorded + * foreign bdi_writebacks which haven't expired. Both the numbers of + * recorded bdi_writebacks and concurrent in-flight foreign writebacks are + * limited to MEMCG_CGWB_FRN_CNT. + * + * The mechanism only remembers IDs and doesn't hold any object references. + * As being wrong occasionally doesn't matter, updates and accesses to the + * records are lockless and racy. + */ +void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, + struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = page->mem_cgroup; + struct memcg_cgwb_frn *frn; + u64 now = get_jiffies_64(); + u64 oldest_at = now; + int oldest = -1; + int i; + + trace_track_foreign_dirty(page, wb); + + /* + * Pick the slot to use. If there is already a slot for @wb, keep + * using it. If not replace the oldest one which isn't being + * written out. + */ + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { + frn = &memcg->cgwb_frn[i]; + if (frn->bdi_id == wb->bdi->id && + frn->memcg_id == wb->memcg_css->id) + break; + if (time_before64(frn->at, oldest_at) && + atomic_read(&frn->done.cnt) == 1) { + oldest = i; + oldest_at = frn->at; + } + } + + if (i < MEMCG_CGWB_FRN_CNT) { + /* + * Re-using an existing one. Update timestamp lazily to + * avoid making the cacheline hot. We want them to be + * reasonably up-to-date and significantly shorter than + * dirty_expire_interval as that's what expires the record. + * Use the shorter of 1s and dirty_expire_interval / 8. + */ + unsigned long update_intv = + min_t(unsigned long, HZ, + msecs_to_jiffies(dirty_expire_interval * 10) / 8); + + if (time_before64(frn->at, now - update_intv)) + frn->at = now; + } else if (oldest >= 0) { + /* replace the oldest free one */ + frn = &memcg->cgwb_frn[oldest]; + frn->bdi_id = wb->bdi->id; + frn->memcg_id = wb->memcg_css->id; + frn->at = now; + } +} + +/* issue foreign writeback flushes for recorded foreign dirtying events */ +void mem_cgroup_flush_foreign(struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); + u64 now = jiffies_64; + int i; + + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { + struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; + + /* + * If the record is older than dirty_expire_interval, + * writeback on it has already started. No need to kick it + * off again. Also, don't start a new one if there's + * already one in flight. + */ + if (time_after64(frn->at, now - intv) && + atomic_read(&frn->done.cnt) == 1) { + frn->at = 0; + trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); + cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0, + WB_REASON_FOREIGN_FLUSH, + &frn->done); + } + } +} + #else /* CONFIG_CGROUP_WRITEBACK */ static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) @@ -4777,6 +4907,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) struct mem_cgroup *memcg; unsigned int size; int node; + int __maybe_unused i; size = sizeof(struct mem_cgroup); size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); @@ -4820,6 +4951,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void) #endif #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) + memcg->cgwb_frn[i].done = + __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); #endif idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; @@ -4949,7 +5083,12 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + int __maybe_unused i; +#ifdef CONFIG_CGROUP_WRITEBACK + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) + wb_wait_for_completion(&memcg->cgwb_frn[i].done); +#endif if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_dec(&memcg_sockets_enabled_key); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 1804f64ff43c..50055d2e4ea8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1667,6 +1667,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb, if (unlikely(!writeback_in_progress(wb))) wb_start_background_writeback(wb); + mem_cgroup_flush_foreign(wb); + /* * Calculate global domain's pos_ratio and select the * global dtc by default. @@ -2427,6 +2429,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) task_io_account_write(PAGE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); + + mem_cgroup_track_foreign_dirty(page, wb); } } |