Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	| 222
1 file changed, 158 insertions, 64 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bd9052a5d3ad..e013b8e57d25 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,7 @@
 #include <linux/limits.h>
 #include <linux/mutex.h>
 #include <linux/rbtree.h>
+#include <linux/shmem_fs.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -107,10 +108,12 @@ enum mem_cgroup_events_index {
 enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
+	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
 };
 #define THRESHOLDS_EVENTS_TARGET (128)
 #define SOFTLIMIT_EVENTS_TARGET (1024)
+#define NUMAINFO_EVENTS_TARGET (1024)
 
 struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
@@ -236,7 +239,8 @@ struct mem_cgroup {
	int last_scanned_node;
 #if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
-	unsigned long	next_scan_node_update;
+	atomic_t	numainfo_events;
+	atomic_t	numainfo_updating;
 #endif
	/*
	 * Should the accounting and control be hierarchical, per subtree?
@@ -359,7 +363,7 @@ enum charge_type {
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
-static void drain_all_stock_async(void);
+static void drain_all_stock_async(struct mem_cgroup *mem);
 
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -576,15 +580,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
	return val;
 }
 
-static long mem_cgroup_local_usage(struct mem_cgroup *mem)
-{
-	long ret;
-
-	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
-	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
-	return ret;
-}
-
 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
					 bool charge)
 {
@@ -688,6 +683,9 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
	case MEM_CGROUP_TARGET_SOFTLIMIT:
		next = val + SOFTLIMIT_EVENTS_TARGET;
		break;
+	case MEM_CGROUP_TARGET_NUMAINFO:
+		next = val + NUMAINFO_EVENTS_TARGET;
+		break;
	default:
		return;
	}
@@ -706,11 +704,19 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
		mem_cgroup_threshold(mem);
		__mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
		if (unlikely(__memcg_event_check(mem,
-			MEM_CGROUP_TARGET_SOFTLIMIT))){
+			MEM_CGROUP_TARGET_SOFTLIMIT))) {
			mem_cgroup_update_tree(mem, page);
			__mem_cgroup_target_update(mem,
-				MEM_CGROUP_TARGET_SOFTLIMIT);
+					MEM_CGROUP_TARGET_SOFTLIMIT);
		}
+#if MAX_NUMNODES > 1
+		if (unlikely(__memcg_event_check(mem,
+			MEM_CGROUP_TARGET_NUMAINFO))) {
+			atomic_inc(&mem->numainfo_events);
+			__mem_cgroup_target_update(mem,
+				MEM_CGROUP_TARGET_NUMAINFO);
+		}
+#endif
	}
 }
 
@@ -735,7 +741,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
				struct mem_cgroup, css);
 }
 
-static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
	struct mem_cgroup *mem = NULL;
 
@@ -1128,7 +1134,6 @@ unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
	return MEM_CGROUP_ZSTAT(mz, lru);
 }
 
-#ifdef CONFIG_NUMA
 static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
							int nid)
 {
@@ -1140,6 +1145,17 @@ static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
	return ret;
 }
 
+static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
+							int nid)
+{
+	unsigned long ret;
+
+	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
+		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
+	return ret;
+}
+
+#if MAX_NUMNODES > 1
 static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
 {
	u64 total = 0;
@@ -1151,17 +1167,6 @@ static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
	return total;
 }
 
-static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
-							int nid)
-{
-	unsigned long ret;
-
-	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
-		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
-
-	return ret;
-}
-
 static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
 {
	u64 total = 0;
@@ -1558,6 +1563,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
	return ret;
 }
 
+/**
+ * test_mem_cgroup_node_reclaimable
+ * @mem: the target memcg
+ * @nid: the node ID to be checked.
+ * @noswap: specify true here if the user wants file-only information.
+ *
+ * This function checks whether the specified memcg contains any
+ * reclaimable pages on a node. Returns true if there are any reclaimable
+ * pages in the node.
+ */
+static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
+		int nid, bool noswap)
+{
+	if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
+		return true;
+	if (noswap || !total_swap_pages)
+		return false;
+	if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
+		return true;
+	return false;
+}
+
 #if MAX_NUMNODES > 1
 
 /*
@@ -1569,26 +1596,26 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
 {
	int nid;
-
-	if (time_after(mem->next_scan_node_update, jiffies))
+	/*
+	 * numainfo_events > 0 means there were at least NUMAINFO_EVENTS_TARGET
+	 * pagein/pageout changes since the last update.
+	 */
+	if (!atomic_read(&mem->numainfo_events))
+		return;
+	if (atomic_inc_return(&mem->numainfo_updating) > 1)
		return;
-	mem->next_scan_node_update = jiffies + 10*HZ;
 
	/* make a nodemask where this memcg uses memory from */
	mem->scan_nodes = node_states[N_HIGH_MEMORY];
 
	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
-		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
-		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
-			continue;
-
-		if (total_swap_pages &&
-		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
-		     mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
-			continue;
-		node_clear(nid, mem->scan_nodes);
+		if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
+			node_clear(nid, mem->scan_nodes);
	}
+
+	atomic_set(&mem->numainfo_events, 0);
+	atomic_set(&mem->numainfo_updating, 0);
 }
 
 /*
@@ -1626,11 +1653,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
	return node;
 }
 
+/*
+ * Check all nodes whether they contain reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not contain
+ * enough new information. We need to double-check.
+ */
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+	int nid;
+
+	/*
+	 * quick check...making use of scan_nodes.
+	 * We can skip unused nodes.
+	 */
+	if (!nodes_empty(mem->scan_nodes)) {
+		for (nid = first_node(mem->scan_nodes);
+		     nid < MAX_NUMNODES;
+		     nid = next_node(nid, mem->scan_nodes)) {
+
+			if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+				return true;
+		}
+	}
+	/*
+	 * Check the rest of the nodes.
+	 */
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		if (node_isset(nid, mem->scan_nodes))
+			continue;
+		if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
+			return true;
+	}
+	return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
 {
	return 0;
 }
+
+bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
+{
+	return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
+}
 #endif
 
 /*
@@ -1663,15 +1730,21 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
	excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
 
	/* If memsw_is_minimum==1, swap-out is of-no-use. */
-	if (root_mem->memsw_is_minimum)
+	if (!check_soft && root_mem->memsw_is_minimum)
		noswap = true;
	while (1) {
		victim = mem_cgroup_select_victim(root_mem);
		if (victim == root_mem) {
			loop++;
-			if (loop >= 1)
-				drain_all_stock_async();
+			/*
+			 * We are not draining per-cpu cached charges during
+			 * soft limit reclaim because global reclaim doesn't
+			 * care about charges. It tries to free some memory,
+			 * and draining cached charges would not free any.
+			 */
+			if (!check_soft && loop >= 1)
+				drain_all_stock_async(root_mem);
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
@@ -1695,7 +1768,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
				}
			}
		}
-		if (!mem_cgroup_local_usage(victim)) {
+		if (!mem_cgroup_reclaimable(victim, noswap)) {
			/* this cgroup's local usage == 0 */
			css_put(&victim->css);
			continue;
		}
@@ -1934,9 +2007,11 @@ struct memcg_stock_pcp {
	struct mem_cgroup *cached; /* this never be root cgroup */
	unsigned int nr_pages;
	struct work_struct work;
+	unsigned long flags;
+#define FLUSHING_CACHED_CHARGE	(0)
 };
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
-static atomic_t memcg_drain_count;
+static DEFINE_MUTEX(percpu_charge_mutex);
 
 /*
  * Try to consume stocked charge on this cpu. If success, one page is consumed
@@ -1984,6 +2059,7 @@ static void drain_local_stock(struct work_struct *dummy)
 {
	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
	drain_stock(stock);
+	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 }
 
 /*
@@ -2008,26 +2084,45 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
  * expects some charges will be back to res_counter later but cannot wait for
  * it.
  */
-static void drain_all_stock_async(void)
+static void drain_all_stock_async(struct mem_cgroup *root_mem)
 {
-	int cpu;
-	/* This function is for scheduling "drain" in asynchronous way.
-	 * The result of "drain" is not directly handled by callers. Then,
-	 * if someone is calling drain, we don't have to call drain more.
-	 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
-	 * there is a race. We just do loose check here.
+	int cpu, curcpu;
+	/*
+	 * If someone is already draining, avoid adding more kworker runs.
	 */
-	if (atomic_read(&memcg_drain_count))
+	if (!mutex_trylock(&percpu_charge_mutex))
		return;
	/* Notify other cpus that system-wide "drain" is running */
-	atomic_inc(&memcg_drain_count);
	get_online_cpus();
+	/*
+	 * Get a hint for avoiding draining charges on the current cpu,
+	 * which must be exhausted by our charging. It is not required that
+	 * this be a precise check, so we use raw_smp_processor_id() instead of
+	 * get_cpu()/put_cpu().
+	 */
+	curcpu = raw_smp_processor_id();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
-		schedule_work_on(cpu, &stock->work);
+		struct mem_cgroup *mem;
+
+		if (cpu == curcpu)
+			continue;
+
+		mem = stock->cached;
+		if (!mem)
+			continue;
+		if (mem != root_mem) {
+			if (!root_mem->use_hierarchy)
+				continue;
+			/* check whether "mem" is under the tree of "root_mem" */
+			if (!css_is_ancestor(&mem->css, &root_mem->css))
+				continue;
+		}
+		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+			schedule_work_on(cpu, &stock->work);
	}
	put_online_cpus();
-	atomic_dec(&memcg_drain_count);
+	mutex_unlock(&percpu_charge_mutex);
	/* We don't wait for flush_work */
 }
@@ -2035,9 +2130,9 @@ static void drain_all_stock_async(void)
 static void drain_all_stock_sync(void)
 {
	/* called when force_empty is called */
-	atomic_inc(&memcg_drain_count);
+	mutex_lock(&percpu_charge_mutex);
	schedule_on_each_cpu(drain_local_stock);
-	atomic_dec(&memcg_drain_count);
+	mutex_unlock(&percpu_charge_mutex);
 }
 
 /*
@@ -4640,6 +4735,7 @@ static struct cftype mem_cgroup_files[] = {
	{
		.name = "numa_stat",
		.open = mem_control_numa_stat_open,
+		.mode = S_IRUGO,
	},
 #endif
 };
@@ -5414,18 +5510,16 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *old_cont,
				struct task_struct *p)
 {
-	struct mm_struct *mm;
+	struct mm_struct *mm = get_task_mm(p);
 
-	if (!mc.to)
-		/* no need to move charge */
-		return;
-
-	mm = get_task_mm(p);
	if (mm) {
-		mem_cgroup_move_charge(mm);
+		if (mc.to)
+			mem_cgroup_move_charge(mm);
+		put_swap_token(mm);
		mmput(mm);
	}
-	mem_cgroup_clear_mc();
+	if (mc.to)
+		mem_cgroup_clear_mc();
 }
 #else	/* !CONFIG_MMU */
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
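The NUMAINFO hunks above rate-limit an expensive operation (rebuilding scan_nodes) behind a cheap event counter: every page event advances a counter, a refresh is flagged only when NUMAINFO_EVENTS_TARGET (1024) events have accumulated, and a second atomic (numainfo_updating) lets concurrent callers skip the rebuild while one is already in flight. The following is a minimal userspace C11 sketch of that idiom, not kernel code; record_event(), may_update_nodemask(), and the four counters are hypothetical stand-ins for the memcg fields in the diff.

	#include <stdatomic.h>

	#define NUMAINFO_EVENTS_TARGET 1024

	/* Hypothetical stand-ins for the memcg fields touched above. */
	static atomic_ulong events;            /* running page-event count         */
	static atomic_ulong numainfo_target;   /* next threshold to cross          */
	static atomic_ulong numainfo_events;   /* thresholds crossed since rebuild */
	static atomic_int   numainfo_updating; /* "someone is already updating"    */

	/* Called on every page event; cheap except when a target is crossed. */
	static void record_event(void)
	{
		unsigned long val = atomic_fetch_add(&events, 1) + 1;
		unsigned long target = atomic_load(&numainfo_target);

		/* Wrap-safe comparison, mirroring __memcg_event_check(). */
		if ((long)(val - target) < 0)
			return;
		/* Crossed the target: arm the next one, flag a pending update. */
		atomic_store(&numainfo_target, val + NUMAINFO_EVENTS_TARGET);
		atomic_fetch_add(&numainfo_events, 1);
	}

	/* Rebuild the nodemask only when events accumulated; let one caller in. */
	static void may_update_nodemask(void)
	{
		if (atomic_load(&numainfo_events) == 0)
			return;
		if (atomic_fetch_add(&numainfo_updating, 1) > 0)
			return; /* another caller is already rebuilding */

		/* ... recompute the reclaimable-node mask here ... */

		/* The winner resets both counters, as the patch does. */
		atomic_store(&numainfo_events, 0);
		atomic_store(&numainfo_updating, 0);
	}

The point of the two-counter split is that the hot path (record_event) never takes a lock; the rebuild is deferred to whichever later caller notices the pending flag first.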
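The percpu_charge_mutex/FLUSHING_CACHED_CHARGE hunks follow a second idiom: a trylock gates the drain so only one system-wide drain runs at a time, a per-cpu test-and-set bit prevents queueing a second work item on a cpu whose drain is still pending, and the current cpu is skipped because its stock is already exhausted. Below is a hedged userspace analogue under the same caveats: pthread_mutex_trylock stands in for mutex_trylock(), an atomic compare-exchange stands in for test_and_set_bit(), and struct stock, schedule_drain(), drain_all_async(), and drain_local() are invented names (the memcg-hierarchy filter via css_is_ancestor() is omitted).

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	#define NWORKERS 8 /* stands in for the online CPUs */

	/* Hypothetical per-"cpu" stock, mirroring struct memcg_stock_pcp. */
	struct stock {
		atomic_bool flushing; /* plays the FLUSHING_CACHED_CHARGE bit */
		/* ... cached charges would live here ... */
	};

	static struct stock stocks[NWORKERS];
	static pthread_mutex_t charge_mutex = PTHREAD_MUTEX_INITIALIZER;

	static void schedule_drain(int cpu)
	{
		/* In the kernel this is schedule_work_on(); here a stub. */
		(void)cpu;
	}

	/* Ask every other worker to drain, without waiting and without
	 * queueing a second drain on a worker whose drain is pending. */
	static void drain_all_async(int curcpu)
	{
		if (pthread_mutex_trylock(&charge_mutex) != 0)
			return; /* a drain is already in flight */

		for (int cpu = 0; cpu < NWORKERS; cpu++) {
			bool expected = false;

			if (cpu == curcpu)
				continue; /* our own stock is already exhausted */
			/* Only queue work if no flush is pending on this cpu. */
			if (atomic_compare_exchange_strong(&stocks[cpu].flushing,
							   &expected, true))
				schedule_drain(cpu);
		}
		pthread_mutex_unlock(&charge_mutex); /* we don't wait for the work */
	}

	/* The drain worker clears the flag once its stock is empty. */
	static void drain_local(int cpu)
	{
		/* ... return cached charges here ... */
		atomic_store(&stocks[cpu].flushing, false);
	}

This also shows why the patch can replace the old memcg_drain_count atomic with a plain mutex: the async path only trylocks (never blocks), while the sync path (drain_all_stock_sync) can take the same mutex unconditionally and wait.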