From 82f9d486e59f588c7d100865c36510644abda356 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 26 Jul 2011 16:08:26 -0700 Subject: memcg: add memory.vmscan_stat The commit log of 0ae5e89c60c9 ("memcg: count the soft_limit reclaim in...") says it adds scanning stats to memory.stat file. But it doesn't because we considered we needed to make a concensus for such new APIs. This patch is a trial to add memory.scan_stat. This shows - the number of scanned pages(total, anon, file) - the number of rotated pages(total, anon, file) - the number of freed pages(total, anon, file) - the number of elaplsed time (including sleep/pause time) for both of direct/soft reclaim. The biggest difference with oringinal Ying's one is that this file can be reset by some write, as # echo 0 ...../memory.scan_stat Example of output is here. This is a result after make -j 6 kernel under 300M limit. [kamezawa@bluextal ~]$ cat /cgroup/memory/A/memory.scan_stat [kamezawa@bluextal ~]$ cat /cgroup/memory/A/memory.vmscan_stat scanned_pages_by_limit 9471864 scanned_anon_pages_by_limit 6640629 scanned_file_pages_by_limit 2831235 rotated_pages_by_limit 4243974 rotated_anon_pages_by_limit 3971968 rotated_file_pages_by_limit 272006 freed_pages_by_limit 2318492 freed_anon_pages_by_limit 962052 freed_file_pages_by_limit 1356440 elapsed_ns_by_limit 351386416101 scanned_pages_by_system 0 scanned_anon_pages_by_system 0 scanned_file_pages_by_system 0 rotated_pages_by_system 0 rotated_anon_pages_by_system 0 rotated_file_pages_by_system 0 freed_pages_by_system 0 freed_anon_pages_by_system 0 freed_file_pages_by_system 0 elapsed_ns_by_system 0 scanned_pages_by_limit_under_hierarchy 9471864 scanned_anon_pages_by_limit_under_hierarchy 6640629 scanned_file_pages_by_limit_under_hierarchy 2831235 rotated_pages_by_limit_under_hierarchy 4243974 rotated_anon_pages_by_limit_under_hierarchy 3971968 rotated_file_pages_by_limit_under_hierarchy 272006 freed_pages_by_limit_under_hierarchy 2318492 freed_anon_pages_by_limit_under_hierarchy 962052 freed_file_pages_by_limit_under_hierarchy 1356440 elapsed_ns_by_limit_under_hierarchy 351386416101 scanned_pages_by_system_under_hierarchy 0 scanned_anon_pages_by_system_under_hierarchy 0 scanned_file_pages_by_system_under_hierarchy 0 rotated_pages_by_system_under_hierarchy 0 rotated_anon_pages_by_system_under_hierarchy 0 rotated_file_pages_by_system_under_hierarchy 0 freed_pages_by_system_under_hierarchy 0 freed_anon_pages_by_system_under_hierarchy 0 freed_file_pages_by_system_under_hierarchy 0 elapsed_ns_by_system_under_hierarchy 0 total_xxxx is for hierarchy management. This will be useful for further memcg developments and need to be developped before we do some complicated rework on LRU/softlimit management. This patch adds a new struct memcg_scanrecord into scan_control struct. sc->nr_scanned at el is not designed for exporting information. For example, nr_scanned is reset frequentrly and incremented +2 at scanning mapped pages. To avoid complexity, I added a new param in scan_control which is for exporting scanning score. Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Michal Hocko Cc: Ying Han Cc: Andrew Bresticker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 166 insertions(+), 6 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dfeca594fd7a..04e505bfd7dd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -205,6 +205,50 @@ struct mem_cgroup_eventfd_list { static void mem_cgroup_threshold(struct mem_cgroup *mem); static void mem_cgroup_oom_notify(struct mem_cgroup *mem); +enum { + SCAN_BY_LIMIT, + SCAN_BY_SYSTEM, + NR_SCAN_CONTEXT, + SCAN_BY_SHRINK, /* not recorded now */ +}; + +enum { + SCAN, + SCAN_ANON, + SCAN_FILE, + ROTATE, + ROTATE_ANON, + ROTATE_FILE, + FREED, + FREED_ANON, + FREED_FILE, + ELAPSED, + NR_SCANSTATS, +}; + +struct scanstat { + spinlock_t lock; + unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS]; + unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS]; +}; + +const char *scanstat_string[NR_SCANSTATS] = { + "scanned_pages", + "scanned_anon_pages", + "scanned_file_pages", + "rotated_pages", + "rotated_anon_pages", + "rotated_file_pages", + "freed_pages", + "freed_anon_pages", + "freed_file_pages", + "elapsed_ns", +}; +#define SCANSTAT_WORD_LIMIT "_by_limit" +#define SCANSTAT_WORD_SYSTEM "_by_system" +#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy" + + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide @@ -270,7 +314,8 @@ struct mem_cgroup { /* For oom notifier event fd */ struct list_head oom_notify; - + /* For recording LRU-scan statistics */ + struct scanstat scanstat; /* * Should we move charges of a task when a task is moved into this * mem_cgroup ? And what type of charges should we move ? @@ -1623,6 +1668,44 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) } #endif +static void __mem_cgroup_record_scanstat(unsigned long *stats, + struct memcg_scanrecord *rec) +{ + + stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1]; + stats[SCAN_ANON] += rec->nr_scanned[0]; + stats[SCAN_FILE] += rec->nr_scanned[1]; + + stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1]; + stats[ROTATE_ANON] += rec->nr_rotated[0]; + stats[ROTATE_FILE] += rec->nr_rotated[1]; + + stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1]; + stats[FREED_ANON] += rec->nr_freed[0]; + stats[FREED_FILE] += rec->nr_freed[1]; + + stats[ELAPSED] += rec->elapsed; +} + +static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec) +{ + struct mem_cgroup *mem; + int context = rec->context; + + if (context >= NR_SCAN_CONTEXT) + return; + + mem = rec->mem; + spin_lock(&mem->scanstat.lock); + __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec); + spin_unlock(&mem->scanstat.lock); + + mem = rec->root; + spin_lock(&mem->scanstat.lock); + __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec); + spin_unlock(&mem->scanstat.lock); +} + /* * Scan the hierarchy if needed to reclaim memory. We remember the last child * we reclaimed from, so that we don't end up penalizing one child extensively @@ -1647,8 +1730,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; + struct memcg_scanrecord rec; unsigned long excess; - unsigned long nr_scanned; + unsigned long scanned; excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; @@ -1656,6 +1740,15 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, if (!check_soft && !shrink && root_mem->memsw_is_minimum) noswap = true; + if (shrink) + rec.context = SCAN_BY_SHRINK; + else if (check_soft) + rec.context = SCAN_BY_SYSTEM; + else + rec.context = SCAN_BY_LIMIT; + + rec.root = root_mem; + while (1) { victim = mem_cgroup_select_victim(root_mem); if (victim == root_mem) { @@ -1696,14 +1789,23 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, css_put(&victim->css); continue; } + rec.mem = victim; + rec.nr_scanned[0] = 0; + rec.nr_scanned[1] = 0; + rec.nr_rotated[0] = 0; + rec.nr_rotated[1] = 0; + rec.nr_freed[0] = 0; + rec.nr_freed[1] = 0; + rec.elapsed = 0; /* we use swappiness of local cgroup */ if (check_soft) { ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, zone, &nr_scanned); - *total_scanned += nr_scanned; + noswap, zone, &rec, &scanned); + *total_scanned += scanned; } else ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, - noswap); + noswap, &rec); + mem_cgroup_record_scanstat(&rec); css_put(&victim->css); /* * At shrinking usage, we can't check we should stop here or @@ -3792,14 +3894,18 @@ try_to_free: /* try to free all pages in this cgroup */ shrink = 1; while (nr_retries && mem->res.usage > 0) { + struct memcg_scanrecord rec; int progress; if (signal_pending(current)) { ret = -EINTR; goto out; } + rec.context = SCAN_BY_SHRINK; + rec.mem = mem; + rec.root = mem; progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, - false); + false, &rec); if (!progress) { nr_retries--; /* maybe some writeback is necessary */ @@ -4643,6 +4749,54 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file) } #endif /* CONFIG_NUMA */ +static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp, + struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + char string[64]; + int i; + + for (i = 0; i < NR_SCANSTATS; i++) { + strcpy(string, scanstat_string[i]); + strcat(string, SCANSTAT_WORD_LIMIT); + cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]); + } + + for (i = 0; i < NR_SCANSTATS; i++) { + strcpy(string, scanstat_string[i]); + strcat(string, SCANSTAT_WORD_SYSTEM); + cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]); + } + + for (i = 0; i < NR_SCANSTATS; i++) { + strcpy(string, scanstat_string[i]); + strcat(string, SCANSTAT_WORD_LIMIT); + strcat(string, SCANSTAT_WORD_HIERARCHY); + cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]); + } + for (i = 0; i < NR_SCANSTATS; i++) { + strcpy(string, scanstat_string[i]); + strcat(string, SCANSTAT_WORD_SYSTEM); + strcat(string, SCANSTAT_WORD_HIERARCHY); + cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]); + } + return 0; +} + +static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp, + unsigned int event) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + + spin_lock(&mem->scanstat.lock); + memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats)); + memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats)); + spin_unlock(&mem->scanstat.lock); + return 0; +} + + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -4713,6 +4867,11 @@ static struct cftype mem_cgroup_files[] = { .mode = S_IRUGO, }, #endif + { + .name = "vmscan_stat", + .read_map = mem_cgroup_vmscan_stat_read, + .trigger = mem_cgroup_reset_vmscan_stat, + }, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -4976,6 +5135,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) atomic_set(&mem->refcnt, 1); mem->move_charge_at_immigrate = 0; mutex_init(&mem->thresholds_lock); + spin_lock_init(&mem->scanstat.lock); return &mem->css; free_out: __mem_cgroup_free(mem); -- cgit v1.2.3