From f95ba941d1bee594d536cdcbf879a0865381b903 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 25 Jan 2011 15:07:11 -0800 Subject: mm/pgtable-generic.c: fix CONFIG_SWAP=n build mips (and sparc32): In file included from arch/mips/include/asm/tlb.h:21, from mm/pgtable-generic.c:9: include/asm-generic/tlb.h: In function `tlb_flush_mmu': include/asm-generic/tlb.h:76: error: implicit declaration of function `release_pages' include/asm-generic/tlb.h: In function `tlb_remove_page': include/asm-generic/tlb.h:105: error: implicit declaration of function `page_cache_release' free_pages_and_swap_cache() and free_page_and_swap_cache() are macros which call release_pages() and page_cache_release(). The obvious fix is to include pagemap.h in swap.h, where those macros are defined. But that breaks sparc for weird reasons. So fix it within mm/pgtable-generic.c instead. Reported-by: Yoichi Yuasa Cc: Geert Uytterhoeven Acked-by: Sam Ravnborg Cc: Sergei Shtylyov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pgtable-generic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 0369f5b3ba1b..eb663fb533e0 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -6,6 +6,7 @@ * Copyright (C) 2010 Linus Torvalds */ +#include <linux/pagemap.h> #include <asm/tlb.h> #include <asm-generic/pgtable.h> -- cgit v1.2.3 From f33261d75b88f55a08e6a9648cef73509979bfba Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 25 Jan 2011 15:07:20 -0800 Subject: mm: fix deferred congestion timeout if preferred zone is not allowed Before 0e093d99763e ("writeback: do not sleep on the congestion queue if there are no congested BDIs or if significant congestion is not being encountered in the current zone"), preferred_zone was only used for NUMA statistics, to determine the zoneidx from which to allocate from given the type requested, and whether to utilize memory compaction. wait_iff_congested(), though, uses preferred_zone to determine if the congestion wait should be deferred because its dirty pages are backed by a congested bdi. This incorrectly defers the timeout and busy loops in the page allocator with various cond_resched() calls if preferred_zone is not allowed in the current context, usually consuming 100% of a cpu. This patch ensures preferred_zone is an allowed zone in the fastpath depending on whether current is constrained by its cpuset or nodes in its mempolicy (when the nodemask passed is non-NULL). This is correct since the fastpath allocation always passes ALLOC_CPUSET when trying to allocate memory. In the slowpath, this patch resets preferred_zone to the first zone of the allowed type when the allocation is not constrained by current's cpuset, i.e. it does not pass ALLOC_CPUSET. This patch also ensures preferred_zone is from the set of allowed nodes when called from within direct reclaim since allocations are always constrained by cpusets in this context (it is blockable). Both of these uses of cpuset_current_mems_allowed are protected by get_mems_allowed().
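The hunks that follow rely on GCC's two-operand conditional, nodemask ? : &cpuset_current_mems_allowed, which falls back to the cpuset mask when no nodemask was passed. A minimal user-space sketch of that shorthand (illustrative names only, not kernel code):

#include <stdio.h>

/* pick() stands in for the zonelist lookup: prefer the caller's nodemask,
 * otherwise fall back to the cpuset's allowed set. "a ? : b" is the GNU
 * shorthand for "a ? a : b" with a evaluated only once. */
static const char *pick(const char *nodemask, const char *cpuset_mems)
{
	return nodemask ? : cpuset_mems;
}

int main(void)
{
	printf("%s\n", pick(NULL, "cpuset_current_mems_allowed"));	/* falls back */
	printf("%s\n", pick("caller nodemask", "cpuset_current_mems_allowed"));
	return 0;
}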
Signed-off-by: David Rientjes Cc: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 +++++++++++- mm/vmscan.c | 3 ++- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 90c1439549fd..f4967910c967 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2034,6 +2034,14 @@ restart: */ alloc_flags = gfp_to_alloc_flags(gfp_mask); + /* + * Find the true preferred zone if the allocation is unconstrained by + * cpusets. + */ + if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) + first_zones_zonelist(zonelist, high_zoneidx, NULL, + &preferred_zone); + /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -2192,7 +2200,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, get_mems_allowed(); /* The preferred zone is used for statistics later */ - first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); + first_zones_zonelist(zonelist, high_zoneidx, + nodemask ? : &cpuset_current_mems_allowed, + &preferred_zone); if (!preferred_zone) { put_mems_allowed(); return NULL; diff --git a/mm/vmscan.c b/mm/vmscan.c index f5d90dedebba..148c6e630df2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2083,7 +2083,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zone *preferred_zone; first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), - NULL, &preferred_zone); + &cpuset_current_mems_allowed, + &preferred_zone); wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); } } -- cgit v1.2.3 From 2ff754fa8f416e82327f2d8f1354a033b66286df Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 25 Jan 2011 15:07:23 -0800 Subject: mm: clear pages_scanned only if draining a pcp adds pages to the buddy allocator Commit 0e093d99763e ("writeback: do not sleep on the congestion queue if there are no congested BDIs or if significant congestion is not being encountered in the current zone") uncovered a livelock in the page allocator that resulted in tasks infinitely looping trying to find memory and kswapd running at 100% cpu. The issue occurs because drain_all_pages() is called immediately following direct reclaim when no memory is freed and try_to_free_pages() returns non-zero because all zones in the zonelist do not have their all_unreclaimable flag set. When draining the per-cpu pagesets back to the buddy allocator for each zone, the zone->pages_scanned counter is cleared to avoid erroneously setting zone->all_unreclaimable later. The problem is that no pages may actually be drained and, thus, the unreclaimable logic never fails direct reclaim so the oom killer may be invoked. This apparently only manifested after wait_iff_congested() was introduced and the zone was full of anonymous memory that would not congest the backing store. The page allocator would infinitely loop if there were no other tasks waiting to be scheduled and clear zone->pages_scanned because of drain_all_pages() as the result of this change before kswapd could scan enough pages to trigger the reclaim logic. Additionally, with every loop of the page allocator and in the reclaim path, kswapd would be kicked and would end up running at 100% cpu. 
In this scenario, current and kswapd are all running continuously with kswapd incrementing zone->pages_scanned and current clearing it. The problem is even more pronounced when current swaps some of its memory to swap cache and the reclaimable logic then considers all active anonymous memory in the all_unreclaimable logic, which requires a much higher zone->pages_scanned value for try_to_free_pages() to return zero that is never attainable in this scenario. Before wait_iff_congested(), the page allocator would incur an unconditional timeout and allow kswapd to elevate zone->pages_scanned to a level that the oom killer would be called the next time it loops. The fix is to only attempt to drain pcp pages if there is actually a quantity to be drained. The unconditional clearing of zone->pages_scanned in free_pcppages_bulk() need not be changed since other callers already ensure that draining will occur. This patch ensures that free_pcppages_bulk() will actually free memory before calling into it from drain_all_pages() so zone->pages_scanned is only cleared if appropriate. Signed-off-by: David Rientjes Cc: Mel Gorman Reviewed-by: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f4967910c967..a873e61e312e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1088,8 +1088,10 @@ static void drain_pages(unsigned int cpu) pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - free_pcppages_bulk(zone, pcp->count, pcp); - pcp->count = 0; + if (pcp->count) { + free_pcppages_bulk(zone, pcp->count, pcp); + pcp->count = 0; + } local_irq_restore(flags); } } -- cgit v1.2.3 From 8dba474f034c322d96ada39cb20cac711d80dcb2 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Tue, 25 Jan 2011 15:07:24 -0800 Subject: mm/memcontrol.c: fix uninitialized variable use in mem_cgroup_move_parent() In mm/memcontrol.c::mem_cgroup_move_parent() there's a path that jumps to the 'put_back' label ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, charge); if (ret || !parent) goto put_back; where we'll if (charge > PAGE_SIZE) compound_unlock_irqrestore(page, flags); but, we have not assigned anything to 'flags' at this point, nor have we called 'compound_lock_irqsave()' (which is what sets 'flags'). The 'put_back' label should be moved below the call to compound_unlock_irqrestore() as per this patch. Signed-off-by: Jesper Juhl Cc: Balbir Singh Cc: Daisuke Nishimura Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelianov Cc: "Kirill A. 
Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index db76ef726293..4fcf47a62550 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2292,9 +2292,10 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, ret = mem_cgroup_move_account(pc, child, parent, true, charge); if (ret) mem_cgroup_cancel_charge(parent, charge); -put_back: + if (charge > PAGE_SIZE) compound_unlock_irqrestore(page, flags); +put_back: putback_lru_page(page); put: put_page(page); -- cgit v1.2.3 From 33a938774fdb9933e9c77504b035f4f87c0859df Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 25 Jan 2011 15:07:25 -0800 Subject: mm: compaction: don't depend on HUGETLB_PAGE Commit 5d6892407 ("thp: select CONFIG_COMPACTION if TRANSPARENT_HUGEPAGE enabled") causes this warning during the configuration process: warning: (TRANSPARENT_HUGEPAGE) selects COMPACTION which has unmet direct dependencies (EXPERIMENTAL && HUGETLB_PAGE && MMU) COMPACTION doesn't depend on HUGETLB_PAGE, it doesn't depend on THP either, it is also useful for regular alloc_pages(order > 0) including the very kernel stack during fork (THREAD_ORDER = 1). It's always better to enable COMPACTION. The warning should be an error because we would end up with MIGRATION not selected, and COMPACTION wouldn't work without migration (despite it seems to build with an inline migrate_pages returning -ENOSYS). I'd also like to remove EXPERIMENTAL: compaction has been in the kernel for some releases (for full safety the default remains disabled which I think is enough). Signed-off-by: Andrea Arcangeli Reported-by: Luca Tettamanti Tested-by: Luca Tettamanti Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 3ad483bdf505..e9c0c61f2ddd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS config COMPACTION bool "Allow for memory compaction" select MIGRATION - depends on EXPERIMENTAL && HUGETLB_PAGE && MMU + depends on MMU help Allows the compaction of memory for the allocation of huge pages. -- cgit v1.2.3 From 28bd65781c848d95ba6a7f58b5c4b8265a804ec6 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 25 Jan 2011 15:07:26 -0800 Subject: mm: migration: clarify migrate_pages() comment Callers of migrate_pages should putback_lru_pages to return pages isolated to LRU or free list. Now comment is rather confusing. It says caller always have to call it. It is more clear to point out that the caller has to call it if migrate_pages's return value isn't zero. Signed-off-by: Minchan Kim Cc: Christoph Lameter Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 46fe8cc13d67..9f29a3b7aac2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -888,7 +888,7 @@ out: * are movable anymore because to has become empty * or no retryable pages exist anymore. * Caller should call putback_lru_pages to return pages to the LRU - * or free list. + * or free list only if ret != 0. * * Return: Number of pages not migrated or error code. 
*/ -- cgit v1.2.3 From 01c88e2d6b7330c0cc5867fe2297e7d826e1337d Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 25 Jan 2011 15:07:27 -0800 Subject: memcg: fix account leak at failure of memsw accounting Commit 4b53433468 ("memcg: clean up try_charge main loop") removes a cancel of charge at case: memory charge-> success. mem+swap charge-> failure. This leaks usage of memory. Fix it. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Johannes Weiner Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: [2.6.36+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4fcf47a62550..1eb1a04f874c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1832,6 +1832,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, if (likely(!ret)) return CHARGE_OK; + res_counter_uncharge(&mem->res, csize); mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else -- cgit v1.2.3 From 3d37c4a9199920964ffdfaec6335d93b9dcf9ca5 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 25 Jan 2011 15:07:28 -0800 Subject: memcg: bugfix check mem_cgroup_disabled() at split fixup mem_cgroup_disabled() should be checked at splitting. If disabled, no heavy work is necessary. Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Daisuke Nishimura Reviewed-by: Johannes Weiner Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1eb1a04f874c..8ab1d42664fb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2145,6 +2145,8 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) struct page_cgroup *tail_pc = lookup_page_cgroup(tail); unsigned long flags; + if (mem_cgroup_disabled()) + return; /* * We have no races with charge/uncharge but will have races with * page state accounting. -- cgit v1.2.3 From 52dbb9050936fd33ceb45f10529dbc992507c058 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 25 Jan 2011 15:07:29 -0800 Subject: memcg: fix race at move_parent around compound_order() Fix up mem_cgroup_move_parent(), which uses compound_order() in an asynchronous manner. This compound_order() may return an unknown value because we don't take the lock. Use PageTransHuge() and HPAGE_SIZE instead of it. Also clean up mem_cgroup_move_parent(). - remove unnecessary initialization of local variable. - rename charge_size -> page_size - remove unnecessary (wrong) comment. - added a comment about THP. Note: the current design takes compound_page_lock() in the caller of move_account(). This should be revisited when we implement direct move_task of hugepage without splitting. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Johannes Weiner Acked-by: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8ab1d42664fb..3878cfe399dc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2236,7 +2236,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, { int ret = -EINVAL; unsigned long flags; - + /* + * The page is isolated from LRU. So, collapse function + * will not handle this page. But page splitting can happen.
+ * Do this check under compound_page_lock(). The caller should + * hold it. + */ if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page)) return -EBUSY; @@ -2268,7 +2273,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, struct cgroup *cg = child->css.cgroup; struct cgroup *pcg = cg->parent; struct mem_cgroup *parent; - int charge = PAGE_SIZE; + int page_size = PAGE_SIZE; unsigned long flags; int ret; @@ -2281,22 +2286,24 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, goto out; if (isolate_lru_page(page)) goto put; - /* The page is isolated from LRU and we have no race with splitting */ - charge = PAGE_SIZE << compound_order(page); + + if (PageTransHuge(page)) + page_size = HPAGE_SIZE; parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, charge); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, + &parent, false, page_size); if (ret || !parent) goto put_back; - if (charge > PAGE_SIZE) + if (page_size > PAGE_SIZE) flags = compound_lock_irqsave(page); - ret = mem_cgroup_move_account(pc, child, parent, true, charge); + ret = mem_cgroup_move_account(pc, child, parent, true, page_size); if (ret) - mem_cgroup_cancel_charge(parent, charge); + mem_cgroup_cancel_charge(parent, page_size); - if (charge > PAGE_SIZE) + if (page_size > PAGE_SIZE) compound_unlock_irqrestore(page, flags); put_back: putback_lru_page(page); -- cgit v1.2.3 From 0a08739e81671de2cb690774937fe510c000b27f Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sat, 30 Oct 2010 23:43:05 +0200 Subject: kmemleak: remove memset by using kzalloc We don't need to memset if we just use kzalloc() rather than kmalloc() in kmemleak_test_init(). Signed-off-by: Jesper Juhl Reviewed-by: Minchan Kim Signed-off-by: Catalin Marinas --- mm/kmemleak-test.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index 177a5169bbde..ff0d9779cec8 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void) * after the module is removed. */ for (i = 0; i < 10; i++) { - elem = kmalloc(sizeof(*elem), GFP_KERNEL); - pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem); + elem = kzalloc(sizeof(*elem), GFP_KERNEL); + pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem); if (!elem) return -ENOMEM; - memset(elem, 0, sizeof(*elem)); INIT_LIST_HEAD(&elem->list); - list_add_tail(&elem->list, &test_list); } -- cgit v1.2.3 From 6ae4bd1f0bc479984f30061b5e5116060c24a267 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 27 Jan 2011 10:30:26 +0000 Subject: kmemleak: Allow kmemleak metadata allocations to fail This patch adds __GFP_NORETRY and __GFP_NOMEMALLOC flags to the kmemleak metadata allocations so that it has a smaller effect on the users of the kernel slab allocator. Since kmemleak allocations can now fail more often, this patch also reduces the verbosity by passing __GFP_NOWARN and not dumping the stack trace when a kmemleak allocation fails. 
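As a rough illustration of what gfp_kmemleak_mask() in the hunk below does -- keep only the caller's GFP_KERNEL/GFP_ATOMIC bits and OR in the "fail quietly" modifiers -- here is a small user-space sketch; the flag values are invented for the demo and are not the kernel's:

#include <stdio.h>

#define DEMO_GFP_KERNEL     0x01u
#define DEMO_GFP_ATOMIC     0x02u
#define DEMO_GFP_NORETRY    0x10u
#define DEMO_GFP_NOMEMALLOC 0x20u
#define DEMO_GFP_NOWARN     0x40u

/* Mirror of the masking pattern: restrict to the caller's sleep/atomic
 * subset, then add the modifiers that make the allocation give up early
 * and without warnings. */
#define demo_kmemleak_mask(gfp) \
	(((gfp) & (DEMO_GFP_KERNEL | DEMO_GFP_ATOMIC)) | \
	 DEMO_GFP_NORETRY | DEMO_GFP_NOMEMALLOC | DEMO_GFP_NOWARN)

int main(void)
{
	unsigned int caller = DEMO_GFP_ATOMIC | 0x80u;	/* 0x80: some unrelated caller flag */
	printf("masked: 0x%02x\n", demo_kmemleak_mask(caller));	/* 0x72: atomic bit kept, 0x80 dropped */
	return 0;
}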
Signed-off-by: Catalin Marinas Reported-by: Toralf Förster Acked-by: Pekka Enberg Acked-by: David Rientjes Cc: Ted Ts'o --- mm/kmemleak.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index bd9bc214091b..84225f3b7190 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -113,7 +113,9 @@ #define BYTES_PER_POINTER sizeof(void *) /* GFP bitmask for kmemleak internal allocations */ -#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC) +#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ + __GFP_NORETRY | __GFP_NOMEMALLOC | \ + __GFP_NOWARN) /* scanning area inside a memory block */ struct kmemleak_scan_area { @@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, struct kmemleak_object *object; struct prio_tree_node *node; - object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); + object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { - kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); + pr_warning("Cannot allocate a kmemleak_object structure\n"); + kmemleak_disable(); return NULL; } @@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) return; } - area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); + area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); if (!area) { - kmemleak_warn("Cannot allocate a scan area\n"); + pr_warning("Cannot allocate a scan area\n"); goto out; } -- cgit v1.2.3 From fdf4c587a793ba87935e38e7f25a9540bc9a7b95 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 31 Jan 2011 17:03:41 -0800 Subject: mlock: operate on any regions with protection != PROT_NONE As Tao Ma noticed, change 5ecfda0 breaks blktrace. This is because blktrace mmaps a file with PROT_WRITE permissions but without PROT_READ, so my attempt to not unnecessarity break COW during mlock ended up causing mlock to fail with a permission problem. I am proposing to let mlock ignore vma protection in all cases except PROT_NONE. In particular, mlock should not fail for PROT_WRITE regions (as in the blktrace case, which broke at 5ecfda0) or for PROT_EXEC regions (which seem to me like they were always broken). Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- mm/mlock.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 13e81ee8be9d..c3924c7f00be 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -178,6 +178,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; + /* + * We want mlock to succeed for regions that have any permissions + * other than PROT_NONE. + */ + if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) + gup_flags |= FOLL_FORCE; + if (vma->vm_flags & VM_LOCKED) gup_flags |= FOLL_MLOCK; -- cgit v1.2.3 From fceda1bf498677501befc7da72fd2e4de7f18466 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 1 Feb 2011 15:52:30 -0800 Subject: memsw: handle swapaccount kernel parameter correctly __setup based kernel command line parameters handlers which are handled in obsolete_checksetup are provided with the parameter value including = (more precisely everything right after the parameter name). This means that the current implementation of swapaccount[=1|0] doesn't work at all because if there is a value for the parameter then we are testing for "0" resp. 
"1" but we are getting "=0" resp. "=1" and if there is no parameter value we are getting an empty string rather than NULL. The original noswapccount parameter, which doesn't care about the value, works correctly. Signed-off-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3878cfe399dc..44f9f9c89f0c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5024,9 +5024,9 @@ struct cgroup_subsys mem_cgroup_subsys = { static int __init enable_swap_account(char *s) { /* consider enabled if no parameter or 1 is given */ - if (!s || !strcmp(s, "1")) + if (!(*s) || !strcmp(s, "=1")) really_do_swap_account = 1; - else if (!strcmp(s, "0")) + else if (!strcmp(s, "=0")) really_do_swap_account = 0; return 1; } @@ -5034,7 +5034,7 @@ __setup("swapaccount", enable_swap_account); static int __init disable_swap_account(char *s) { - enable_swap_account("0"); + enable_swap_account("=0"); return 1; } __setup("noswapaccount", disable_swap_account); -- cgit v1.2.3 From 552b372ba9db85751e7db2998f07cca2e51f5865 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 1 Feb 2011 15:52:31 -0800 Subject: memsw: deprecate noswapaccount kernel parameter and schedule it for removal noswapaccount couldn't be used to control memsw for both on/off cases so we have added swapaccount[=0|1] parameter. This way we can turn the feature in two ways noswapaccount resp. swapaccount=0. We have kept the original noswapaccount but I think we should remove it after some time as it just makes more command line parameters without any advantages and also the code to handle parameters is uglier if we want both parameters. Signed-off-by: Michal Hocko Requested-by: KAMEZAWA Hiroyuki Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/feature-removal-schedule.txt | 16 ++++++++++++++++ mm/memcontrol.c | 1 + 2 files changed, 17 insertions(+) (limited to 'mm') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index b959659c5df4..b3f35e5f9c95 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -603,3 +603,19 @@ Why: The adm9240, w83792d and w83793 hardware monitoring drivers have Who: Jean Delvare ---------------------------- + +What: noswapaccount kernel command line parameter +When: 2.6.40 +Why: The original implementation of memsw feature enabled by + CONFIG_CGROUP_MEM_RES_CTLR_SWAP could be disabled by the noswapaccount + kernel parameter (introduced in 2.6.29-rc1). Later on, this decision + turned out to be not ideal because we cannot have the feature compiled + in and disabled by default and let only interested to enable it + (e.g. general distribution kernels might need it). Therefore we have + added swapaccount[=0|1] parameter (introduced in 2.6.37) which provides + the both possibilities. If we remove noswapaccount we will have + less command line parameters with the same functionality and we + can also cleanup the parameter handling a bit (). 
+Who: Michal Hocko + +---------------------------- diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 44f9f9c89f0c..79abb1fd39d2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5034,6 +5034,7 @@ __setup("swapaccount", enable_swap_account); static int __init disable_swap_account(char *s) { + printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n"); enable_swap_account("=0"); return 1; } -- cgit v1.2.3 From 57fc4a5ee322cde96c33f101d3c2d3b79011c05c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 1 Feb 2011 15:52:32 -0800 Subject: mm: when migrate_pages returns 0, all pages must have been released In some cases migrate_pages could return zero while still leaving a few pages in the pagelist (and some caller wouldn't notice it has to call putback_lru_pages after commit cf608ac19c9 ("mm: compaction: fix COMPACTPAGEFAILED counting")). Add one missing putback_lru_pages not added by commit cf608ac19c95 ("mm: compaction: fix COMPACTPAGEFAILED counting"). Signed-off-by: Andrea Arcangeli Signed-off-by: Minchan Kim Reviewed-by: Minchan Kim Cc: Christoph Lameter Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 1 + mm/migrate.c | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 548fbd70f026..75398b0bfede 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1419,6 +1419,7 @@ int soft_offline_page(struct page *page, int flags) ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, true); if (ret) { + putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) diff --git a/mm/migrate.c b/mm/migrate.c index 9f29a3b7aac2..155a2e9a8059 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -772,6 +772,7 @@ uncharge: unlock: unlock_page(page); +move_newpage: if (rc != -EAGAIN) { /* * A page that has been migrated has all references * to it released. Wait until the other callers are done. ... putback_lru_page(page); } -move_newpage: - /* * Move the new page to the LRU. If migration was not successful * then this will free the page. -- cgit v1.2.3 From 48db54ee2f41e8ae2faf330b55db34a9fffb5b3c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 1 Feb 2011 15:52:33 -0800 Subject: mm/migration: fix page corruption during hugepage migration If migrate_huge_page by memory-failure fails, it calls put_page itself to decrease the page reference, and the caller of migrate_huge_page also calls putback_lru_pages. This can double free the page and cause page corruption for the page holder. In addition, cleaning up the pages in the caller is consistent behavior with migrate_pages after cf608ac19c ("mm: compaction: fix COMPACTPAGEFAILED counting").
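The replacement cleanup in soft_offline_huge_page() below walks the leftover list with list_for_each_entry_safe() and drops each reference exactly once, in the caller. A minimal user-space analogue of that walk-and-release pattern (plain C list, not the kernel's <linux/list.h>; names are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct node { int pfn; struct node *next; };

/* Release every entry left on the list. The successor is saved before
 * the entry is freed, which is exactly why the kernel walk uses the
 * "_safe" iterator; free() stands in for put_page(). */
static void release_all(struct node *head)
{
	struct node *cur = head, *next;

	while (cur) {
		next = cur->next;
		printf("releasing pfn %d\n", cur->pfn);
		free(cur);
		cur = next;
	}
}

int main(void)
{
	struct node *head = NULL;

	for (int pfn = 3; pfn > 0; pfn--) {
		struct node *n = malloc(sizeof(*n));
		if (!n)
			return 1;
		n->pfn = pfn;
		n->next = head;
		head = n;
	}
	release_all(head);
	return 0;
}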
Signed-off-by: Minchan Kim Cc: Andrea Arcangeli Cc: Christoph Lameter Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 5 ++++- mm/migrate.c | 4 ---- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 75398b0bfede..237aaa488f4e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1295,7 +1295,10 @@ static int soft_offline_huge_page(struct page *page, int flags) ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, true); if (ret) { - putback_lru_pages(&pagelist); + struct page *page1, *page2; + list_for_each_entry_safe(page1, page2, &pagelist, lru) + put_page(page1); + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) diff --git a/mm/migrate.c b/mm/migrate.c index 155a2e9a8059..766115253807 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -980,10 +980,6 @@ int migrate_huge_pages(struct list_head *from, } rc = 0; out: - - list_for_each_entry_safe(page, page2, from, lru) - put_page(page); - if (rc) return rc; -- cgit v1.2.3 From efeda7a41e09efce506a68c3549b60b16dd7dedd Mon Sep 17 00:00:00 2001 From: Jin Dongming Date: Tue, 1 Feb 2011 15:52:39 -0800 Subject: thp: fix splitting of hwpoisoned hugepages The poisoned THP is now split with split_huge_page() in collect_procs_anon(). If kmalloc() is failed in collect_procs(), split_huge_page() could not be called. And the work after split_huge_page() for collecting the processes using poisoned page will not be done, too. So the processes using the poisoned page could not be killed. The condition becomes worse when CONFIG_DEBUG_VM == "Y". Because the poisoned THP could not be split, system panic will be caused by VM_BUG_ON(PageTransHuge(page)) in try_to_unmap(). This patch does: 1. move split_huge_page() to the place before collect_procs(). This can be sure the failure of splitting THP is caused by itself. 2. when splitting THP is failed, stop the operations after it. This can avoid unexpected system panic or non sense works. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jin Dongming Reviewed-by: Hidetoshi Seto Cc: Andrea Arcangeli Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 237aaa488f4e..1e9c30b241c3 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -386,8 +386,6 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct task_struct *tsk; struct anon_vma *av; - if (!PageHuge(page) && unlikely(split_huge_page(page))) - return; read_lock(&tasklist_lock); av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ @@ -896,6 +894,34 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } } + if (PageTransHuge(hpage)) { + /* + * Verify that this isn't a hugetlbfs head page, the check for + * PageAnon is just for avoid tripping a split_huge_page + * internal debug check, as split_huge_page refuses to deal with + * anything that isn't an anon page. PageAnon can't go away fro + * under us because we hold a refcount on the hpage, without a + * refcount on the hpage. split_huge_page can't be safely called + * in the first place, having a refcount on the tail isn't + * enough * to be safe. 
+ */ + if (!PageHuge(hpage) && PageAnon(hpage)) { + if (unlikely(split_huge_page(hpage))) { + /* + * FIXME: if splitting THP is failed, it is + * better to stop the following operation rather + * than causing panic by unmapping. System might + * survive if the page is freed later. + */ + printk(KERN_INFO + "MCE %#lx: failed to split THP\n", pfn); + + BUG_ON(!PageHWPoison(p)); + return SWAP_FAIL; + } + } + } + /* * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, -- cgit v1.2.3 From a6d30dddae4648837be5a0c0cb2c0ae9ad0377db Mon Sep 17 00:00:00 2001 From: Jin Dongming Date: Tue, 1 Feb 2011 15:52:40 -0800 Subject: thp: fix the wrong reported address of hwpoisoned hugepages When the tail page of THP is poisoned, the head page will be poisoned too. And the wrong address, address of head page, will be sent with sigbus always. So when the poisoned page is used by Guest OS which is running on KVM, after the address changing(hva->gpa) by qemu, the unexpected process on Guest OS will be killed by sigbus. What we expected is that the process using the poisoned tail page could be killed on Guest OS, but not that the process using the healthy head page is killed. Since it is not good to poison the healthy page, avoid poisoning other than the page which is really poisoned. (While we poison all pages in a huge page in case of hugetlb, we can do this for THP thanks to split_huge_page().) Here we fix two parts: 1. Isolate the poisoned page only to make sure the reported address is the address of poisoned page. 2. make the poisoned page work as the poisoned regular page. [akpm@linux-foundation.org: fix spello in comment] Signed-off-by: Jin Dongming Reviewed-by: Hidetoshi Seto Cc: Andrea Arcangeli Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 7 ++++++- mm/memory-failure.c | 27 ++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e187454d82f6..b6c1ce3c53b5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1162,7 +1162,12 @@ static void __split_huge_page_refcount(struct page *page) /* after clearing PageTail the gup refcount can be released */ smp_mb(); - page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + /* + * retain hwpoison flag of the poisoned tail page: + * fix for the unsuitable process killed on Guest Machine(KVM) + * by the memory-failure. + */ + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; page_tail->flags |= (page->flags & ((1L << PG_referenced) | (1L << PG_swapbacked) | diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1e9c30b241c3..04158d6f44d4 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -854,6 +854,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int ret; int kill = 1; struct page *hpage = compound_head(p); + struct page *ppage; if (PageReserved(p) || PageSlab(p)) return SWAP_SUCCESS; @@ -894,6 +895,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } } + /* + * ppage: poisoned page + * if p is regular page(4k page) + * ppage == real poisoned page; + * else p is hugetlb or THP, ppage == head page. 
+ */ + ppage = hpage; + if (PageTransHuge(hpage)) { /* * Verify that this isn't a hugetlbfs head page, the check for @@ -919,6 +928,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, BUG_ON(!PageHWPoison(p)); return SWAP_FAIL; } + /* THP is split, so ppage should be the real poisoned page. */ + ppage = p; } } @@ -931,12 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(hpage, &tokill); + collect_procs(ppage, &tokill); - ret = try_to_unmap(hpage, ttu); + if (hpage != ppage) + lock_page_nosync(ppage); + + ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(hpage)); + pfn, page_mapcount(ppage)); + + if (hpage != ppage) + unlock_page(ppage); /* * Now that the dirty bit has been propagated to the @@ -947,7 +964,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, + kill_procs_ao(&tokill, !!PageDirty(ppage), trapno, ret != SWAP_SUCCESS, p, pfn); return ret; @@ -1090,7 +1107,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * For error on the tail page, we should set PG_hwpoison * on the head page to show that the hugepage is hwpoisoned */ - if (PageTail(p) && TestSetPageHWPoison(hpage)) { + if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { action_result(pfn, "hugepage already hardware poisoned", IGNORED); unlock_page(hpage); -- cgit v1.2.3 From af241a083404acda7ba3690e5b7697949d729fcc Mon Sep 17 00:00:00 2001 From: Jin Dongming Date: Tue, 1 Feb 2011 15:52:41 -0800 Subject: thp: fix unsuitable behavior for hwpoisoned tail page When a tail page of THP is poisoned, memory-failure will do nothing except setting PG_hwpoison, while the expected behavior is that the process, who is using the poisoned tail page, should be killed. The above problem is caused by lru check of the poisoned tail page of THP. Because PG_lru flag is only set on the head page of THP, the check always consider the poisoned tail page as NON lru page. So the lru check for the tail page of THP should be avoided, as like as hugetlb. This patch adds !PageTransCompound() before lru check for THP, because of the check (!PageHuge() && !PageTransCompound()) the whole branch could be optimized away at build time when both hugetlbfs and THP are set with "N" (or in archs not supporting either of those). [akpm@linux-foundation.org: fix unrelated typo in shake_page() comment] Signed-off-by: Jin Dongming Reviewed-by: Hidetoshi Seto Cc: Andrea Arcangeli Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 04158d6f44d4..0207c2f6f8bd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -233,8 +233,8 @@ void shake_page(struct page *p, int access) } /* - * Only all shrink_slab here (which would also - * shrink other caches) if access is not potentially fatal. + * Only call shrink_slab here (which would also shrink other caches) if + * access is not potentially fatal. 
*/ if (access) { int nr; @@ -1065,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - if (!PageLRU(p) && !PageHuge(p)) - shake_page(p, 0); - if (!PageLRU(p) && !PageHuge(p)) { - /* - * shake_page could have turned it free. - */ - if (is_free_buddy_page(p)) { - action_result(pfn, "free buddy, 2nd try", DELAYED); - return 0; + if (!PageHuge(p) && !PageTransCompound(p)) { + if (!PageLRU(p)) + shake_page(p, 0); + if (!PageLRU(p)) { + /* + * shake_page could have turned it free. + */ + if (is_free_buddy_page(p)) { + action_result(pfn, "free buddy, 2nd try", + DELAYED); + return 0; + } + action_result(pfn, "non LRU", IGNORED); + put_page(p); + return -EBUSY; } - action_result(pfn, "non LRU", IGNORED); - put_page(p); - return -EBUSY; } /* -- cgit v1.2.3 From 9221edb7120e2dc3ae90f1c58514979f7ba40e46 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 1 Feb 2011 15:52:42 -0800 Subject: memcg: prevent endless loop when charging huge pages The charging code can encounter a charge size that is bigger than a regular page in two situations: one is a batched charge to fill the per-cpu stocks, the other is a huge page charge. This code is distributed over two functions, however, and only the outer one is aware of huge pages. In case the charging fails, the inner function will tell the outer function to retry if the charge size is bigger than regular pages--assuming batched charging is the only case. And the outer function will retry forever charging a huge page. This patch makes sure the inner function can distinguish between batch charging and a single huge page charge. It will only signal another attempt if batch charging failed, and go into regular reclaim when it is called on behalf of a huge page. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 79abb1fd39d2..50eb50e100fd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1837,8 +1837,15 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); - - if (csize > PAGE_SIZE) /* change csize and retry */ + /* + * csize can be either a huge page (HPAGE_SIZE), a batch of + * regular pages (CHARGE_SIZE), or a single regular page + * (PAGE_SIZE). + * + * Never reclaim on behalf of optional batching, retry with a + * single page instead. + */ + if (csize == CHARGE_SIZE) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) -- cgit v1.2.3 From 19942822df65ee4a47c2e6d6d70cace1b7f01710 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 1 Feb 2011 15:52:43 -0800 Subject: memcg: prevent endless loop when charging huge pages to near-limit group If reclaim after a failed charging was unsuccessful, the limits are checked again, just in case they settled by means of other tasks. This is all fine as long as every charge is of size PAGE_SIZE, because in that case, being below the limit means having at least PAGE_SIZE bytes available. 
But with transparent huge pages, we may end up in an endless loop where charging and reclaim fail, but we keep going because the limits are not yet exceeded, although not allowing for a huge page. Fix this up by explicitely checking for enough room, not just whether we are within limits. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/res_counter.h | 20 ++++++++++++++++++++ mm/memcontrol.c | 35 ++++++++++++++++++++++++++++------- 2 files changed, 48 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index fcb9884df618..a5930cb66145 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -182,6 +182,26 @@ static inline bool res_counter_check_under_limit(struct res_counter *cnt) return ret; } +/** + * res_counter_check_margin - check if the counter allows charging + * @cnt: the resource counter to check + * @bytes: the number of bytes to check the remaining space against + * + * Returns a boolean value on whether the counter can be charged + * @bytes or whether this would exceed the limit. + */ +static inline bool res_counter_check_margin(struct res_counter *cnt, + unsigned long bytes) +{ + bool ret; + unsigned long flags; + + spin_lock_irqsave(&cnt->lock, flags); + ret = cnt->limit - cnt->usage >= bytes; + spin_unlock_irqrestore(&cnt->lock, flags); + return ret; +} + static inline bool res_counter_check_under_soft_limit(struct res_counter *cnt) { bool ret; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 50eb50e100fd..0e81eb5f0aea 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1111,6 +1111,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) return false; } +/** + * mem_cgroup_check_margin - check if the memory cgroup allows charging + * @mem: memory cgroup to check + * @bytes: the number of bytes the caller intends to charge + * + * Returns a boolean value on whether @mem can be charged @bytes or + * whether this would exceed the limit. + */ +static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes) +{ + if (!res_counter_check_margin(&mem->res, bytes)) + return false; + if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes)) + return false; + return true; +} + static unsigned int get_swappiness(struct mem_cgroup *memcg) { struct cgroup *cgrp = memcg->css.cgroup; @@ -1852,15 +1869,19 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, return CHARGE_WOULDBLOCK; ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags); + gfp_mask, flags); + if (mem_cgroup_check_margin(mem_over_limit, csize)) + return CHARGE_RETRY; /* - * try_to_free_mem_cgroup_pages() might not give us a full - * picture of reclaim. Some pages are reclaimed and might be - * moved to swap cache or just unmapped from the cgroup. - * Check the limit again to see if the reclaim reduced the - * current usage of the cgroup before giving up + * Even though the limit is exceeded at this point, reclaim + * may have been able to free some pages. Retry the charge + * before killing the task. + * + * Only for regular pages, though: huge pages are rather + * unlikely to succeed so close to the limit, and we fall back + * to regular pages anyway in case of failure. 
*/ - if (ret || mem_cgroup_check_under_limit(mem_over_limit)) + if (csize == PAGE_SIZE && ret) return CHARGE_RETRY; /* -- cgit v1.2.3 From 8493ae439f7038b502df1d687e61dde54c27ca92 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 1 Feb 2011 15:52:44 -0800 Subject: memcg: never OOM when charging huge pages Huge page coverage should obviously have less priority than the continued execution of a process. Never kill a process when charging it a huge page fails. Instead, give up after the first failed reclaim attempt and fall back to regular pages. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0e81eb5f0aea..fc75f34ba609 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2351,13 +2351,19 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, enum charge_type ctype) { struct mem_cgroup *mem = NULL; + int page_size = PAGE_SIZE; struct page_cgroup *pc; + bool oom = true; int ret; - int page_size = PAGE_SIZE; if (PageTransHuge(page)) { page_size <<= compound_order(page); VM_BUG_ON(!PageTransHuge(page)); + /* + * Never OOM-kill a process for a huge page. The + * fault handler will fall back to regular pages. + */ + oom = false; } pc = lookup_page_cgroup(page); @@ -2366,7 +2372,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, return 0; prefetchw(pc); - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size); if (ret || !mem) return ret; -- cgit v1.2.3 From 3751d60430fe4c26460a5ca8ad8672d32f93bcb1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 1 Feb 2011 15:52:45 -0800 Subject: memcg: fix event counting breakage from recent THP update Changes in e401f1761 ("memcg: modify accounting function for supporting THP better") adds nr_pages to support multiple page size in memory_cgroup_charge_statistics. But counting the number of event nees abs(nr_pages) for increasing counters. This patch fixes event counting. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Johannes Weiner Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fc75f34ba609..da53a252b259 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -612,8 +612,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); - else + else { __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); + nr_pages = -nr_pages; /* for event */ + } __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); -- cgit v1.2.3 From e6d2e2b2b1e1455df16d68a78f4a3874c7b3ad20 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 10 Feb 2011 15:01:30 -0800 Subject: memblock: don't adjust size in memblock_find_base() While applying patch to use memblock to find aperture for 64bit x86. Ingo found system with 1g + force_iommu > No AGP bridge found > Node 0: aperture @ 38000000 size 32 MB > Aperture pointing to e820 RAM. Ignoring. 
> Your BIOS doesn't leave a aperture memory hole > Please enable the IOMMU option in the BIOS setup > This costs you 64 MB of RAM > Cannot allocate aperture memory hole (0,65536K) the corresponding code: addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20); if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) { printk(KERN_ERR "Cannot allocate aperture memory hole (%lx,%uK)\n", addr, aper_size>>10); return 0; } memblock_x86_reserve_range(addr, addr + aper_size, "aperture64") fails because the memblock core code aligns the size to 512M. That could make the size way too big. So don't align the size in that case. Actually __memblock_alloc_base, the other caller, already aligns it before calling that function. BTW, x86 does not use __memblock_alloc_base... Signed-off-by: Yinghai Lu Cc: Ingo Molnar Cc: David Miller Cc: "H. Peter Anvin" Cc: Benjamin Herrenschmidt Cc: Dave Airlie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index bdba245d8afd..4618fda975a0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -137,8 +137,6 @@ static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size, BUG_ON(0 == size); - size = memblock_align_up(size, align); - /* Pump up max_addr */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE) end = memblock.current_limit; -- cgit v1.2.3 From e15f8c01af924e611bc7be1e45449c4a74e5dfdd Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 10 Feb 2011 15:01:32 -0800 Subject: mlock: fix race when munlocking pages in do_wp_page() vmscan can lazily find pages that are mapped within VM_LOCKED vmas, and set the PageMlocked bit on these pages, transferring them onto the unevictable list. When do_wp_page() breaks COW within a VM_LOCKED vma, it may need to clear PageMlocked on the old page and set it on the new page instead. This change fixes an issue where do_wp_page() was clearing PageMlocked on the old page while the pte was still pointing to it (as well as rmap). Therefore, we were not protected against vmscan immediately transferring the old page back onto the unevictable list. This could cause pages to get stranded there forever. I propose to move the corresponding code to the end of do_wp_page(), after the pte (and rmap) have been pointed to the new page. Additionally, we can use munlock_vma_page() instead of clear_page_mlock(), so that the old page stays mlocked if there are still other VM_LOCKED vmas mapping it. Signed-off-by: Michel Lespinasse Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Hugh Dickins Cc: Rik van Riel Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 31250faff390..32df03cf13a5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2219,7 +2219,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); - page_cache_release(old_page); goto unlock; } page_cache_release(old_page); @@ -2289,7 +2288,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); - page_cache_release(old_page); goto unlock; } @@ -2367,16 +2365,6 @@ gotten: } __SetPageUptodate(new_page); - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page.
- */ - if ((vma->vm_flags & VM_LOCKED) && old_page) { - lock_page(old_page); /* for LRU manipulation */ - clear_page_mlock(old_page); - unlock_page(old_page); - } - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; @@ -2444,10 +2432,20 @@ gotten: if (new_page) page_cache_release(new_page); - if (old_page) - page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); + if (old_page) { + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { + lock_page(old_page); /* LRU manipulation */ + munlock_vma_page(old_page); + unlock_page(old_page); + } + page_cache_release(old_page); + } return ret; oom_free_new: page_cache_release(new_page); -- cgit v1.2.3 From 419d8c96dbfa558f00e623023917d0a5afc46129 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 10 Feb 2011 15:01:33 -0800 Subject: mlock: do not munlock pages in __do_fault() If the page is going to be written to, __do_fault needs to break COW. However, the old page (before breaking COW) was never mapped into the current pte (__do_fault is only called when the pte is not present), so vmscan can't have marked the old page as PageMlocked due to being mapped in __do_fault's VMA. Therefore, __do_fault() does not need to worry about clearing PageMlocked() on the old page. Signed-off-by: Michel Lespinasse Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Hugh Dickins Cc: Rik van Riel Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 32df03cf13a5..8e8c18324863 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3051,12 +3051,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out; } charged = 1; - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if (vma->vm_flags & VM_LOCKED) - clear_page_mlock(vmf.page); copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { -- cgit v1.2.3 From f0fdc5e8e6f579310458aef43d1610a0bb5e81a4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 10 Feb 2011 15:01:34 -0800 Subject: vmscan: fix zone shrinking exit when scan work is done Commit 3e7d34497067 ("mm: vmscan: reclaim order-0 and use compaction instead of lumpy reclaim") introduced an indefinite loop in shrink_zone(). It meant to break out of this loop when no pages had been reclaimed and not a single page was even scanned. The way it would detect the latter is by taking a snapshot of sc->nr_scanned at the beginning of the function and comparing it against the new sc->nr_scanned after the scan loop. But it would re-iterate without updating that snapshot, looping forever if sc->nr_scanned changed at least once since shrink_zone() was invoked. This is not the sole condition that would exit that loop, but it requires other processes to change the zone state, as the reclaimer that is stuck obviously cannot anymore. This is only happening for higher-order allocations, where reclaim is run back to back with compaction.
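The point of the one-line move in the hunk below is that the nr_scanned snapshot must be retaken on every pass through the restart: label, so the "nothing reclaimed and nothing scanned" exit can ever become true. A compact user-space sketch of that difference (the counters, page numbers, and the pass cap are invented for the demo):

#include <stdbool.h>
#include <stdio.h>

struct scan_control { unsigned long nr_scanned; };

static int shrink_zone_demo(bool fresh_snapshot)
{
	struct scan_control sc = { .nr_scanned = 0 };
	unsigned long zone_pages = 96;			/* pages left to look at */
	unsigned long nr_reclaimed, nr_scanned = sc.nr_scanned;
	int passes = 0;

restart:
	nr_reclaimed = 0;
	if (fresh_snapshot)
		nr_scanned = sc.nr_scanned;		/* the fix: snapshot per restart */

	if (zone_pages) {				/* one pass: scan 32 pages, reclaim none */
		sc.nr_scanned += 32;
		zone_pages -= 32;
	}
	passes++;

	/* Keep going unless this pass neither reclaimed nor scanned anything. */
	if ((nr_reclaimed || sc.nr_scanned - nr_scanned) && passes < 10)
		goto restart;
	return passes;					/* the stale variant only stops at the demo cap */
}

int main(void)
{
	printf("stale snapshot: %d passes (would spin without the cap)\n", shrink_zone_demo(false));
	printf("fresh snapshot: %d passes\n", shrink_zone_demo(true));
	return 0;
}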
Signed-off-by: Johannes Weiner Reported-by: Michal Hocko Tested-by: Kent Overstreet Reported-by: Kent Overstreet Acked-by: Mel Gorman Cc: Andrea Arcangeli Cc: Rik van Riel Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 148c6e630df2..17497d0cd8b9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1882,12 +1882,12 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list l; - unsigned long nr_reclaimed; + unsigned long nr_reclaimed, nr_scanned; unsigned long nr_to_reclaim = sc->nr_to_reclaim; - unsigned long nr_scanned = sc->nr_scanned; restart: nr_reclaimed = 0; + nr_scanned = sc->nr_scanned; get_scan_count(zone, sc, nr, priority); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || -- cgit v1.2.3 From 678ff896a37afdbca292c7846ec895463aed35a5 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 10 Feb 2011 15:01:36 -0800 Subject: memcg: fix leak of accounting at failure path of hugepage collapsing mem_cgroup_uncharge_page() should be called in all failure cases after mem_cgroup_charge_newpage() is called in huge_memory.c::collapse_huge_page() [ 4209.076861] BUG: Bad page state in process khugepaged pfn:1e9800 [ 4209.077601] page:ffffea0006b14000 count:0 mapcount:0 mapping: (null) index:0x2800 [ 4209.078674] page flags: 0x40000000004000(head) [ 4209.079294] pc:ffff880214a30000 pc->flags:2146246697418756 pc->mem_cgroup:ffffc9000177a000 [ 4209.082177] (/A) [ 4209.082500] Pid: 31, comm: khugepaged Not tainted 2.6.38-rc3-mm1 #1 [ 4209.083412] Call Trace: [ 4209.083678] [] ? bad_page+0xe4/0x140 [ 4209.084240] [] ? free_pages_prepare+0xd6/0x120 [ 4209.084837] [] ? rwsem_down_failed_common+0xbd/0x150 [ 4209.085509] [] ? __free_pages_ok+0x32/0xe0 [ 4209.086110] [] ? free_compound_page+0x1b/0x20 [ 4209.086699] [] ? __put_compound_page+0x1c/0x30 [ 4209.087333] [] ? put_compound_page+0x4d/0x200 [ 4209.087935] [] ? put_page+0x45/0x50 [ 4209.097361] [] ? khugepaged+0x9e9/0x1430 [ 4209.098364] [] ? autoremove_wake_function+0x0/0x40 [ 4209.099121] [] ? khugepaged+0x0/0x1430 [ 4209.099780] [] ? kthread+0x96/0xa0 [ 4209.100452] [] ? kernel_thread_helper+0x4/0x10 [ 4209.101214] [] ? kthread+0x0/0xa0 [ 4209.101842] [] ? kernel_thread_helper+0x0/0x10 Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Daisuke Nishimura Reviewed-by: Johannes Weiner Cc: Andrea Arcangeli Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b6c1ce3c53b5..e62ddb8f24b6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1852,7 +1852,6 @@ static void collapse_huge_page(struct mm_struct *mm, set_pmd_at(mm, address, pmd, _pmd); spin_unlock(&mm->page_table_lock); anon_vma_unlock(vma->anon_vma); - mem_cgroup_uncharge_page(new_page); goto out; } @@ -1898,6 +1897,7 @@ out_up_write: return; out: + mem_cgroup_uncharge_page(new_page); #ifdef CONFIG_NUMA put_page(new_page); #endif -- cgit v1.2.3 From a7d6e4ecdb7648478ddec76d30d87d03d6e22b31 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 15 Feb 2011 19:02:45 +0100 Subject: thp: prevent hugepages during args/env copying into the user stack Transparent hugepages can only be created if rmap is fully functional. 
So we must prevent hugepages from being created while is_vma_temporary_stack() is true. This also optimizes away some harmless but unnecessary setting of khugepaged_scan.address and it switches some BUG_ON to VM_BUG_ON. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 3 ++- mm/huge_memory.c | 35 ++++++++++++++++------------------- 2 files changed, 18 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8e6c8c42bc3c..df29c8fde36b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -57,7 +57,8 @@ extern pmd_t *page_check_address_pmd(struct page *page, (transparent_hugepage_flags & \ (1<vm_flags & VM_HUGEPAGE))) && \ - !((__vma)->vm_flags & VM_NOHUGEPAGE) + !((__vma)->vm_flags & VM_NOHUGEPAGE) && \ + !is_vma_temporary_stack(__vma)) #define transparent_hugepage_defrag(__vma) \ ((transparent_hugepage_flags & \ (1<anon_vma || vma->vm_ops || vma->vm_file) goto out; + if (is_vma_temporary_stack(vma)) + goto out; VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); pgd = pgd_offset(mm, address); @@ -2032,32 +2034,27 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || (vma->vm_flags & VM_NOHUGEPAGE)) { + skip: progress++; continue; } - /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ - if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { - khugepaged_scan.address = vma->vm_end; - progress++; - continue; - } + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + goto skip; + if (is_vma_temporary_stack(vma)) + goto skip; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; - if (hstart >= hend) { - progress++; - continue; - } + if (hstart >= hend) + goto skip; + if (khugepaged_scan.address > hend) + goto skip; if (khugepaged_scan.address < hstart) khugepaged_scan.address = hstart; - if (khugepaged_scan.address > hend) { - khugepaged_scan.address = hend + HPAGE_PMD_SIZE; - progress++; - continue; - } - BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { int ret; @@ -2086,7 +2083,7 @@ breakouterloop: breakouterloop_mmap_sem: spin_lock(&khugepaged_mm_lock); - BUG_ON(khugepaged_scan.mm_slot != mm_slot); + VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); /* * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. @@ -2241,9 +2238,9 @@ static int khugepaged(void *none) for (;;) { mutex_unlock(&khugepaged_mutex); - BUG_ON(khugepaged_thread != current); + VM_BUG_ON(khugepaged_thread != current); khugepaged_loop(); - BUG_ON(khugepaged_thread != current); + VM_BUG_ON(khugepaged_thread != current); mutex_lock(&khugepaged_mutex); if (!khugepaged_enabled()) -- cgit v1.2.3 From 2aa15890f3c191326678f1bd68af61ec6b8753ec Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 23 Feb 2011 13:49:47 +0100 Subject: mm: prevent concurrent unmap_mapping_range() on the same inode Michael Leun reported that running parallel opens on a fuse filesystem can trigger a "kernel BUG at mm/truncate.c:475" Gurudas Pai reported the same bug on NFS. The reason is, unmap_mapping_range() is not prepared for more than one concurrent invocation per inode. 
For example: thread1: going through a big range, stops in the middle of a vma and stores the restart address in vm_truncate_count. thread2: comes in with a small (e.g. single page) unmap request on the same vma, somewhere before restart_address, finds that the vma was already unmapped up to the restart address and happily returns without doing anything. Another scenario would be two big unmap requests, both having to restart the unmapping and each one setting vm_truncate_count to its own value. This could go on forever without any of them being able to finish. Truncate and hole punching already serialize with i_mutex. Other callers of unmap_mapping_range() do not, and it's difficult to get i_mutex protection for all callers. In particular ->d_revalidate(), which calls invalidate_inode_pages2_range() in fuse, may be called with or without i_mutex. This patch adds a new mutex to 'struct address_space' to prevent running multiple concurrent unmap_mapping_range() on the same mapping. [ We'll hopefully get rid of all this with the upcoming mm preemptibility series by Peter Zijlstra, the "mm: Remove i_mmap_mutex lockbreak" patch in particular. But that is for 2.6.39 ] Signed-off-by: Miklos Szeredi Reported-by: Michael Leun Reported-by: Gurudas Pai Tested-by: Gurudas Pai Acked-by: Hugh Dickins Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/gfs2/main.c | 9 +-------- fs/inode.c | 22 +++++++++++++++------- fs/nilfs2/btnode.c | 5 ----- fs/nilfs2/btnode.h | 1 - fs/nilfs2/mdt.c | 4 ++-- fs/nilfs2/page.c | 13 ------------- fs/nilfs2/page.h | 1 - fs/nilfs2/super.c | 2 +- include/linux/fs.h | 2 ++ mm/memory.c | 2 ++ 10 files changed, 23 insertions(+), 38 deletions(-) (limited to 'mm') diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 85ba027d1c4d..72c31a315d96 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -59,14 +59,7 @@ static void gfs2_init_gl_aspace_once(void *foo) struct address_space *mapping = (struct address_space *)(gl + 1); gfs2_init_glock_once(gl); - memset(mapping, 0, sizeof(*mapping)); - INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); - spin_lock_init(&mapping->tree_lock); - spin_lock_init(&mapping->i_mmap_lock); - INIT_LIST_HEAD(&mapping->private_list); - spin_lock_init(&mapping->private_lock); - INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); - INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); + address_space_init_once(mapping); } /** diff --git a/fs/inode.c b/fs/inode.c index da85e56378f3..9c2b795ccc93 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -295,6 +295,20 @@ static void destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, i_callback); } +void address_space_init_once(struct address_space *mapping) +{ + memset(mapping, 0, sizeof(*mapping)); + INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); + spin_lock_init(&mapping->tree_lock); + spin_lock_init(&mapping->i_mmap_lock); + INIT_LIST_HEAD(&mapping->private_list); + spin_lock_init(&mapping->private_lock); + INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); + INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); + mutex_init(&mapping->unmap_mutex); +} +EXPORT_SYMBOL(address_space_init_once); + /* * These are initializations that only need to be done * once, because the fields are idempotent across use @@ -308,13 +322,7 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); - INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - spin_lock_init(&inode->i_data.tree_lock); - spin_lock_init(&inode->i_data.i_mmap_lock); - 
INIT_LIST_HEAD(&inode->i_data.private_list); - spin_lock_init(&inode->i_data.private_lock); - INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap); - INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear); + address_space_init_once(&inode->i_data); i_size_ordered_init(inode); #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&inode->i_fsnotify_marks); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 388e9e8f5286..85f7baa15f5d 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -35,11 +35,6 @@ #include "btnode.h" -void nilfs_btnode_cache_init_once(struct address_space *btnc) -{ - nilfs_mapping_init_once(btnc); -} - static const struct address_space_operations def_btnode_aops = { .sync_page = block_sync_page, }; diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 79037494f1e0..1b8ebd888c28 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt { struct buffer_head *newbh; }; -void nilfs_btnode_cache_init_once(struct address_space *); void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *); void nilfs_btnode_cache_clear(struct address_space *); struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 6a0e2a189f60..a0babd2bff6a 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -454,9 +454,9 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode, struct backing_dev_info *bdi = inode->i_sb->s_bdi; INIT_LIST_HEAD(&shadow->frozen_buffers); - nilfs_mapping_init_once(&shadow->frozen_data); + address_space_init_once(&shadow->frozen_data); nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops); - nilfs_mapping_init_once(&shadow->frozen_btnodes); + address_space_init_once(&shadow->frozen_btnodes); nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops); mi->mi_shadow = shadow; return 0; diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 0c432416cfef..a585b35fd6bc 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -492,19 +492,6 @@ unsigned nilfs_page_count_clean_buffers(struct page *page, return nc; } -void nilfs_mapping_init_once(struct address_space *mapping) -{ - memset(mapping, 0, sizeof(*mapping)); - INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); - spin_lock_init(&mapping->tree_lock); - INIT_LIST_HEAD(&mapping->private_list); - spin_lock_init(&mapping->private_lock); - - spin_lock_init(&mapping->i_mmap_lock); - INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); - INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); -} - void nilfs_mapping_init(struct address_space *mapping, struct backing_dev_info *bdi, const struct address_space_operations *aops) diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 622df27cd891..2a00953ebd5f 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -61,7 +61,6 @@ void nilfs_free_private_page(struct page *); int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_dirty_pages(struct address_space *); -void nilfs_mapping_init_once(struct address_space *mapping); void nilfs_mapping_init(struct address_space *mapping, struct backing_dev_info *bdi, const struct address_space_operations *aops); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 58fd707174e1..1673b3d99842 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1279,7 +1279,7 @@ static void nilfs_inode_init_once(void *obj) #ifdef CONFIG_NILFS_XATTR init_rwsem(&ii->xattr_sem); #endif - 
nilfs_btnode_cache_init_once(&ii->i_btnode_cache); + address_space_init_once(&ii->i_btnode_cache); ii->i_bmap = &ii->i_bmap_data; inode_init_once(&ii->vfs_inode); } diff --git a/include/linux/fs.h b/include/linux/fs.h index bd3215940c37..97d08d8a7de8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -649,6 +649,7 @@ struct address_space { spinlock_t private_lock; /* for use by the address_space */ struct list_head private_list; /* ditto */ struct address_space *assoc_mapping; /* ditto */ + struct mutex unmap_mutex; /* to protect unmapping */ } __attribute__((aligned(sizeof(long)))); /* * On most architectures that alignment is already the case; but @@ -2225,6 +2226,7 @@ extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); extern int inode_init_always(struct super_block *, struct inode *); extern void inode_init_once(struct inode *); +extern void address_space_init_once(struct address_space *mapping); extern void ihold(struct inode * inode); extern void iput(struct inode *); extern struct inode * igrab(struct inode *); diff --git a/mm/memory.c b/mm/memory.c index 8e8c18324863..5823698c2b71 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2648,6 +2648,7 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; details.i_mmap_lock = &mapping->i_mmap_lock; + mutex_lock(&mapping->unmap_mutex); spin_lock(&mapping->i_mmap_lock); /* Protect against endless unmapping loops */ @@ -2664,6 +2665,7 @@ void unmap_mapping_range(struct address_space *mapping, if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->unmap_mutex); } EXPORT_SYMBOL(unmap_mapping_range); -- cgit v1.2.3 From a3e8cc643d22d2c8ed36b9be7d9c9ca21efcf7f7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 23 Feb 2011 21:39:49 -0800 Subject: mm: fix possible cause of a page_mapped BUG Robert Swiecki reported a BUG_ON(page_mapped) from a fuzzer, punching a hole with madvise(,, MADV_REMOVE). That path is under mutex, and cannot be explained by lack of serialization in unmap_mapping_range(). Reviewing the code, I found one place where vm_truncate_count handling should have been updated, when I switched at the last minute from one way of managing the restart_addr to another: mremap move changes the virtual addresses, so it ought to adjust the restart_addr. But rather than exporting the notion of restart_addr from memory.c, or converting to restart_pgoff throughout, simply reset vm_truncate_count to 0 to force a rescan if mremap move races with preempted truncation. We have no confirmation that this fixes Robert's BUG, but it is a fix that's worth making anyway. 
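As a rough userspace illustration of that reasoning (struct area, move_area() and the addresses are made-up stand-ins, not kernel structures): a saved restart cursor is only meaningful in the address range it was recorded against, so the simplest safe response to a move is to zero it and let the next truncation pass rescan from the start.

#include <stdio.h>

struct area {
	unsigned long start, end;	/* stand-in for a mapping's virtual range */
	unsigned long restart;		/* saved progress of an interrupted scan, 0 = none */
};

/* Moving the mapping invalidates any cursor recorded against the old range. */
static void move_area(struct area *a, unsigned long new_start)
{
	unsigned long len = a->end - a->start;

	a->start = new_start;
	a->end = new_start + len;
	a->restart = 0;			/* force a full rescan rather than translating it */
}

int main(void)
{
	struct area a = { .start = 0x1000, .end = 0x5000, .restart = 0x1800 };

	move_area(&a, 0x9000);
	printf("rescan %#lx-%#lx from %#lx\n", a.start, a.end,
	       a.restart ? a.restart : a.start);
	return 0;
}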
Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/mremap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mremap.c b/mm/mremap.c index 9925b6391b80..1de98d492ddc 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -94,9 +94,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, */ mapping = vma->vm_file->f_mapping; spin_lock(&mapping->i_mmap_lock); - if (new_vma->vm_truncate_count && - new_vma->vm_truncate_count != vma->vm_truncate_count) - new_vma->vm_truncate_count = 0; + new_vma->vm_truncate_count = 0; } /* -- cgit v1.2.3 From 8074b26f67165bf045d92e778c9c10dc5e207fc6 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 24 Feb 2011 15:49:53 +0100 Subject: mm: fix refcounting in swapon Grab a reference to bdev before calling blkdev_get(), which expects the refcount to be already incremented and either returns success or decrements the refcount and returns an error. The bug was introduced by e525fd89 (block: make blkdev_get/put() handle exclusive access), which didn't take into account this behavior of blkdev_get(). Acked-by: Tejun Heo Signed-off-by: Miklos Szeredi Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 07a458d72fa8..0341c5700e34 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1940,7 +1940,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -EINVAL; if (S_ISBLK(inode->i_mode)) { - bdev = I_BDEV(inode); + bdev = bdgrab(I_BDEV(inode)); error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, sys_swapon); if (error < 0) { -- cgit v1.2.3 From a879bf582dfb3a79d30d76ca3af2ae8a0f39010c Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 25 Feb 2011 14:44:13 -0800 Subject: mm: grab rcu read lock in move_pages() The move_pages() usage of find_task_by_vpid() requires rcu_read_lock() to prevent free_pid() from reclaiming the pid. Without this patch, RCU warnings are printed in v2.6.38-rc4 move_pages() with: CONFIG_LOCKUP_DETECTOR=y CONFIG_PREEMPT=y CONFIG_LOCKDEP=y CONFIG_PROVE_LOCKING=y CONFIG_PROVE_RCU=y Previously, migrate_pages() went through a similar transformation replacing usage of tasklist_lock with rcu read lock: commit 55cfaa3cbdd29c4919ecb5fb8965c310f357e48c Author: Zeng Zhaoming Date: Thu Dec 2 14:31:13 2010 -0800 mm/mempolicy.c: add rcu read lock to protect pid structure commit 1e50df39f6e2c3a4a3394df62baa8a213df16c54 Author: KOSAKI Motohiro Date: Thu Jan 13 15:46:14 2011 -0800 mempolicy: remove tasklist_lock from migrate_pages Signed-off-by: Greg Thelen Cc: Mel Gorman Cc: Minchan Kim Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Cc: "Paul E. McKenney" Cc: Tetsuo Handa Cc: Sergey Senozhatsky Cc: Oleg Nesterov Cc: Zeng Zhaoming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 766115253807..352de555626c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1287,14 +1287,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, return -EPERM; /* Find the mm_struct */ - read_lock(&tasklist_lock); + rcu_read_lock(); task = pid ? 
find_task_by_vpid(pid) : current; if (!task) { - read_unlock(&tasklist_lock); + rcu_read_unlock(); return -ESRCH; } mm = get_task_mm(task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!mm) return -EINVAL; -- cgit v1.2.3 From 2876592f231d436c295b67726313f6f3cfb6e243 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 25 Feb 2011 14:44:20 -0800 Subject: mm: vmscan: stop reclaim/compaction earlier due to insufficient progress if !__GFP_REPEAT should_continue_reclaim() for reclaim/compaction allows scanning to continue even if pages are not being reclaimed until the full list is scanned. In terms of allocation success, this makes sense but potentially it introduces unwanted latency for high-order allocations such as transparent hugepages and network jumbo frames that would prefer to fail the allocation attempt and fallback to order-0 pages. Worse, there is a potential that the full LRU scan will clear all the young bits, distort page aging information and potentially push pages into swap that would have otherwise remained resident. This patch will stop reclaim/compaction if no pages were reclaimed in the last SWAP_CLUSTER_MAX pages that were considered. For allocations such as hugetlbfs that use __GFP_REPEAT and have fewer fallback options, the full LRU list may still be scanned. Order-0 allocation should not be affected because RECLAIM_MODE_COMPACTION is not set so the following avoids the gfp_mask being examined: if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) return false; A tool was developed based on ftrace that tracked the latency of high-order allocations while transparent hugepage support was enabled and three benchmarks were run. The "fix-infinite" figures are 2.6.38-rc4 with Johannes's patch "vmscan: fix zone shrinking exit when scan work is done" applied. STREAM Highorder Allocation Latency Statistics fix-infinite break-early 1 :: Count 10298 10229 1 :: Min 0.4560 0.4640 1 :: Mean 1.0589 1.0183 1 :: Max 14.5990 11.7510 1 :: Stddev 0.5208 0.4719 2 :: Count 2 1 2 :: Min 1.8610 3.7240 2 :: Mean 3.4325 3.7240 2 :: Max 5.0040 3.7240 2 :: Stddev 1.5715 0.0000 9 :: Count 111696 111694 9 :: Min 0.5230 0.4110 9 :: Mean 10.5831 10.5718 9 :: Max 38.4480 43.2900 9 :: Stddev 1.1147 1.1325 Mean time for order-1 allocations is reduced. order-2 looks increased but with so few allocations, it's not particularly significant. THP mean allocation latency is also reduced. That said, allocation time varies so significantly that the reductions are within noise. Max allocation time is reduced by a significant amount for low-order allocations but reduced for THP allocations which presumably are now breaking before reclaim has done enough work. SysBench Highorder Allocation Latency Statistics fix-infinite break-early 1 :: Count 15745 15677 1 :: Min 0.4250 0.4550 1 :: Mean 1.1023 1.0810 1 :: Max 14.4590 10.8220 1 :: Stddev 0.5117 0.5100 2 :: Count 1 1 2 :: Min 3.0040 2.1530 2 :: Mean 3.0040 2.1530 2 :: Max 3.0040 2.1530 2 :: Stddev 0.0000 0.0000 9 :: Count 2017 1931 9 :: Min 0.4980 0.7480 9 :: Mean 10.4717 10.3840 9 :: Max 24.9460 26.2500 9 :: Stddev 1.1726 1.1966 Again, mean time for order-1 allocations is reduced while order-2 allocations are too few to draw conclusions from. The mean time for THP allocations is also slightly reduced albeit the reductions are within varianes. Once again, our maximum allocation time is significantly reduced for low-order allocations and slightly increased for THP allocations. 
Anon stream mmap reference Highorder Allocation Latency Statistics 1 :: Count 1376 1790 1 :: Min 0.4940 0.5010 1 :: Mean 1.0289 0.9732 1 :: Max 6.2670 4.2540 1 :: Stddev 0.4142 0.2785 2 :: Count 1 - 2 :: Min 1.9060 - 2 :: Mean 1.9060 - 2 :: Max 1.9060 - 2 :: Stddev 0.0000 - 9 :: Count 11266 11257 9 :: Min 0.4990 0.4940 9 :: Mean 27250.4669 24256.1919 9 :: Max 11439211.0000 6008885.0000 9 :: Stddev 226427.4624 186298.1430 This benchmark creates one thread per CPU which references an amount of anonymous memory 1.5 times the size of physical RAM. This pounds swap quite heavily and is intended to exercise THP a bit. Mean allocation time for order-1 is reduced as before. It's also reduced for THP allocations but the variations here are pretty massive due to swap. As before, maximum allocation times are significantly reduced. Overall, the patch reduces the mean and maximum allocation latencies for the smaller high-order allocations. This was with Slab configured so it would be expected to be more significant with Slub which uses these size allocations more aggressively. The mean allocation times for THP allocations are also slightly reduced. The maximum latency was slightly increased as predicted by the comments due to reclaim/compaction breaking early. However, workloads care more about the latency of lower-order allocations than THP so it's an acceptable trade-off. Signed-off-by: Mel Gorman Acked-by: Andrea Arcangeli Acked-by: Johannes Weiner Reviewed-by: Minchan Kim Acked-by: Andrea Arcangeli Acked-by: Rik van Riel Cc: Michal Hocko Cc: Kent Overstreet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 17497d0cd8b9..6771ea70bfe7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1841,16 +1841,28 @@ static inline bool should_continue_reclaim(struct zone *zone, if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) return false; - /* - * If we failed to reclaim and have scanned the full list, stop. - * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far - * faster but obviously would be less likely to succeed - * allocation. If this is desirable, use GFP_REPEAT to decide - * if both reclaimed and scanned should be checked or just - * reclaimed - */ - if (!nr_reclaimed && !nr_scanned) - return false; + /* Consider stopping depending on scan and reclaim activity */ + if (sc->gfp_mask & __GFP_REPEAT) { + /* + * For __GFP_REPEAT allocations, stop reclaiming if the + * full LRU list has been scanned and we are still failing + * to reclaim pages. This full LRU scan is potentially + * expensive but a __GFP_REPEAT caller really wants to succeed + */ + if (!nr_reclaimed && !nr_scanned) + return false; + } else { + /* + * For non-__GFP_REPEAT allocations which can presumably + * fail without consequence, stop if we failed to reclaim + * any pages from the last SWAP_CLUSTER_MAX number of + * pages that were scanned. This will return to the + * caller faster at the risk reclaim/compaction and + * the resulting allocation attempt fails + */ + if (!nr_reclaimed) + return false; + } /* * If we have not reclaimed enough pages for compaction and the -- cgit v1.2.3 From 29723fccc837d20039078f7a571e8d457eb0d6c6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 25 Feb 2011 14:44:25 -0800 Subject: mm: fix dubious code in __count_immobile_pages() When pfn_valid_within() failed 'iter' was incremented twice. 
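The off-by-one is easy to reproduce in plain C. The sketch below is a generic loop, not the page-allocator code; valid() and the element counts are invented. Because continue already jumps to the for-loop increment, the extra iter++ in the body silently skips the element after every invalid one.

#include <stdio.h>

static int valid(int i)
{
	return i % 3 != 0;	/* pretend every third entry is invalid */
}

int main(void)
{
	int checked_buggy = 0, checked_fixed = 0;
	int iter;

	for (iter = 0; iter < 12; iter++) {
		if (!valid(iter)) {
			iter++;		/* buggy: skips the next element as well */
			continue;
		}
		checked_buggy++;
	}

	for (iter = 0; iter < 12; iter++) {
		if (!valid(iter))
			continue;	/* fixed: only the invalid element is skipped */
		checked_fixed++;
	}

	printf("buggy checked %d, fixed checked %d of 8 valid entries\n",
	       checked_buggy, checked_fixed);
	return 0;
}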
Signed-off-by: Namhyung Kim Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a873e61e312e..cdef1d4b4e47 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5376,10 +5376,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { unsigned long check = pfn + iter; - if (!pfn_valid_within(check)) { - iter++; + if (!pfn_valid_within(check)) continue; - } + page = pfn_to_page(check); if (!page_count(page)) { if (PageBuddy(page)) -- cgit v1.2.3 From 8eac563c1c3a2047083022357ae63722b19e4e08 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 25 Feb 2011 14:44:28 -0800 Subject: thp: fix interleaving for transparent hugepages The THP code didn't pass the correct interleaving shift to the memory policy code. Fix this here by adjusting for the order. Signed-off-by: Andi Kleen Reviewed-by: Christoph Lameter Acked-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 368fc9d23610..49355a970be2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1830,7 +1830,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); put_mems_allowed(); -- cgit v1.2.3 From e5598f8bf5449bc09e4005600ead32e6f2a3e79b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 25 Feb 2011 14:44:29 -0800 Subject: memcg: more mem_cgroup_uncharge() batching It seems odd that truncate_inode_pages_range(), called not only when truncating but also when evicting inodes, has mem_cgroup_uncharge_start and _end() batching in its second loop to clear up a few leftovers, but not in its first loop that does almost all the work: add them there too. Signed-off-by: Hugh Dickins Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Acked-by: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index 49feb46e77b8..d64296be00d3 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -225,6 +225,7 @@ void truncate_inode_pages_range(struct address_space *mapping, next = start; while (next <= end && pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; pgoff_t page_index = page->index; @@ -247,6 +248,7 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); } pagevec_release(&pvec); + mem_cgroup_uncharge_end(); cond_resched(); } -- cgit v1.2.3 From 2f5f9486f8c12e3aa40fe3775a18cb14efc5cea2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 4 Mar 2011 17:36:29 -0800 Subject: mm: change alloc_pages_vma to pass down the policy node for local policy Currently alloc_pages_vma() always uses the local node as policy node for the LOCAL policy. Pass this node down as an argument instead. No behaviour change from this patch, but will be needed for followons. 
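The shape of the refactoring can be sketched in userspace C; pick_node_old(), pick_node_new() and local_node_id() are hypothetical stand-ins for the mempolicy helpers, with local_node_id() playing the role of numa_node_id(). The node decision moves from inside the callee to its callers, and existing callers pass the previous default so behaviour is unchanged.

#include <stdio.h>

static int local_node_id(void)
{
	return 0;	/* pretend we always run on node 0 */
}

/* before: the policy node was hard-wired to the local node */
static int pick_node_old(void)
{
	return local_node_id();
}

/* after: the decision is made by the caller and threaded through */
static int pick_node_new(int node)
{
	return node;
}

int main(void)
{
	/* existing call sites keep the old behaviour... */
	printf("default caller: %d vs %d\n", pick_node_old(),
	       pick_node_new(local_node_id()));
	/* ...while new callers (e.g. a background daemon) can pin a node */
	printf("pinned caller : %d\n", pick_node_new(1));
	return 0;
}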
Acked-by: Andrea Arcangeli Signed-off-by: Andi Kleen Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 9 +++++---- mm/huge_memory.c | 2 +- mm/mempolicy.c | 11 +++++------ 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0b84c61607e8..37b8af5db091 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -332,16 +332,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) return alloc_pages_current(gfp_mask, order); } extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, - struct vm_area_struct *vma, unsigned long addr); + struct vm_area_struct *vma, unsigned long addr, + int node); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_pages_vma(gfp_mask, order, vma, addr) \ +#define alloc_pages_vma(gfp_mask, order, vma, addr, node) \ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) -#define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr) +#define alloc_page_vma(gfp_mask, vma, addr) \ + alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3e29781ee762..c7c2cd925599 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -653,7 +653,7 @@ static inline struct page *alloc_hugepage_vma(int defrag, unsigned long haddr) { return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), - HPAGE_PMD_ORDER, vma, haddr); + HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); } #ifndef CONFIG_NUMA diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 49355a970be2..25a5a9146619 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) } /* Return a zonelist indicated by gfp for node representing a mempolicy */ -static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) +static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, + int nd) { - int nd = numa_node_id(); - switch (policy->mode) { case MPOL_PREFERRED: if (!(policy->flags & MPOL_F_LOCAL)) @@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, zl = node_zonelist(interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))), gfp_flags); } else { - zl = policy_zonelist(gfp_flags, *mpol); + zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); if ((*mpol)->mode == MPOL_BIND) *nodemask = &(*mpol)->v.nodes; } @@ -1820,7 +1819,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, int node) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; @@ -1836,7 +1835,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, put_mems_allowed(); return page; } - zl = policy_zonelist(gfp, pol); + zl = policy_zonelist(gfp, pol, node); if (unlikely(mpol_needs_cond_ref(pol))) { /* * slow path: ref counted shared policy -- cgit v1.2.3 From 19ee151e140daa5183c4984981801e542e0544fb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 4 Mar 2011 17:36:31 -0800 Subject: mm: preserve original node for transparent huge page copies This makes a difference for LOCAL 
policy, where the node cannot be determined from the policy itself, but has to be gotten from the original page. Acked-by: Andrea Arcangeli Signed-off-by: Andi Kleen Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c7c2cd925599..1802db819e28 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -799,8 +799,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, } for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, - vma, address); + pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, + vma, address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_newpage_charge(pages[i], mm, GFP_KERNEL))) { -- cgit v1.2.3 From 5c4b4be3b6b937256103a5ae49177e0c3a17cb8f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 4 Mar 2011 17:36:32 -0800 Subject: mm: use correct numa policy node for transparent hugepages Pass down the correct node for a transparent hugepage allocation. Most callers continue to use the current node, however the hugepaged daemon now uses the previous node of the first to be collapsed page instead. This ensures that khugepaged does not mess up local memory for an existing process which uses local policy. The choice of node is somewhat primitive currently: it just uses the node of the first page in the pmd range. An alternative would be to look at multiple pages and use the most popular node. I used the simplest variant for now which should work well enough for the case of all pages being on the same node. [akpm@linux-foundation.org: coding-style fixes] Acked-by: Andrea Arcangeli Signed-off-by: Andi Kleen Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 24 +++++++++++++++++------- mm/mempolicy.c | 3 ++- 2 files changed, 19 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1802db819e28..dbe99a5f2073 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -650,10 +650,10 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag) static inline struct page *alloc_hugepage_vma(int defrag, struct vm_area_struct *vma, - unsigned long haddr) + unsigned long haddr, int nd) { return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), - HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); + HPAGE_PMD_ORDER, vma, haddr, nd); } #ifndef CONFIG_NUMA @@ -678,7 +678,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(khugepaged_enter(vma))) return VM_FAULT_OOM; page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr); + vma, haddr, numa_node_id()); if (unlikely(!page)) goto out; if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { @@ -902,7 +902,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr); + vma, haddr, numa_node_id()); else new_page = NULL; @@ -1745,7 +1745,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + int node) { pgd_t *pgd; pud_t *pud; @@ -1773,7 +1774,8 @@ static void collapse_huge_page(struct mm_struct *mm, 
* mmap_sem in read mode is good idea also to allow greater * scalability. */ - new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, + node); if (unlikely(!new_page)) { up_read(&mm->mmap_sem); *hpage = ERR_PTR(-ENOMEM); @@ -1919,6 +1921,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct page *page; unsigned long _address; spinlock_t *ptl; + int node = -1; VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -1949,6 +1952,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, page = vm_normal_page(vma, _address, pteval); if (unlikely(!page)) goto out_unmap; + /* + * Chose the node of the first page. This could + * be more sophisticated and look at more pages, + * but isn't for now. + */ + if (node == -1) + node = page_to_nid(page); VM_BUG_ON(PageCompound(page)); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) goto out_unmap; @@ -1965,7 +1975,7 @@ out_unmap: pte_unmap_unlock(pte, ptl); if (ret) /* collapse_huge_page will return with the mmap_sem released */ - collapse_huge_page(mm, address, hpage, vma); + collapse_huge_page(mm, address, hpage, vma, node); out: return ret; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 25a5a9146619..b53ec99f1428 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1891,7 +1891,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); + policy_zonelist(gfp, pol, numa_node_id()), + policy_nodemask(gfp, pol)); put_mems_allowed(); return page; } -- cgit v1.2.3 From ab9a0f196f2f4f080df54402493ea3dc31b5243e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 10 Mar 2011 15:21:48 +0800 Subject: slub: automatically reserve bytes at the end of slab There is no "struct" for slub's slab, it shares with struct page. But struct page is very small, it is insufficient when we need to add some metadata for slab. So we add a field "reserved" to struct kmem_cache, when a slab is allocated, kmem_cache->reserved bytes are automatically reserved at the end of the slab for slab's metadata. Changed from v1: Export the reserved field via sysfs Acked-by: Christoph Lameter Signed-off-by: Lai Jiangshan Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 1 + mm/slub.c | 47 ++++++++++++++++++++++++++++++----------------- 2 files changed, 31 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 8b6e8ae5d5ca..ae0093cc5189 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -83,6 +83,7 @@ struct kmem_cache { void (*ctor)(void *); int inuse; /* Offset to metadata */ int align; /* Alignment */ + int reserved; /* Reserved bytes at the end of slabs */ unsigned long min_partial; const char *name; /* Name (only for display!) 
*/ struct list_head list; /* List of slab caches */ diff --git a/mm/slub.c b/mm/slub.c index e15aa7f193c9..d3d17677bab5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -281,11 +281,16 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } +static inline int order_objects(int order, unsigned long size, int reserved) +{ + return ((PAGE_SIZE << order) - reserved) / size; +} + static inline struct kmem_cache_order_objects oo_make(int order, - unsigned long size) + unsigned long size, int reserved) { struct kmem_cache_order_objects x = { - (order << OO_SHIFT) + (PAGE_SIZE << order) / size + (order << OO_SHIFT) + order_objects(order, size, reserved) }; return x; @@ -617,7 +622,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - length = (PAGE_SIZE << compound_order(page)); + length = (PAGE_SIZE << compound_order(page)) - s->reserved; end = start + length; remainder = length % s->size; if (!remainder) @@ -698,7 +703,7 @@ static int check_slab(struct kmem_cache *s, struct page *page) return 0; } - maxobj = (PAGE_SIZE << compound_order(page)) / s->size; + maxobj = order_objects(compound_order(page), s->size, s->reserved); if (page->objects > maxobj) { slab_err(s, page, "objects %u > max %u", s->name, page->objects, maxobj); @@ -748,7 +753,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - max_objects = (PAGE_SIZE << compound_order(page)) / s->size; + max_objects = order_objects(compound_order(page), s->size, s->reserved); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -1988,13 +1993,13 @@ static int slub_nomerge; * the smallest order which will fit the object. */ static inline int slab_order(int size, int min_objects, - int max_order, int fract_leftover) + int max_order, int fract_leftover, int reserved) { int order; int rem; int min_order = slub_min_order; - if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) + if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; for (order = max(min_order, @@ -2003,10 +2008,10 @@ static inline int slab_order(int size, int min_objects, unsigned long slab_size = PAGE_SIZE << order; - if (slab_size < min_objects * size) + if (slab_size < min_objects * size + reserved) continue; - rem = slab_size % size; + rem = (slab_size - reserved) % size; if (rem <= slab_size / fract_leftover) break; @@ -2016,7 +2021,7 @@ static inline int slab_order(int size, int min_objects, return order; } -static inline int calculate_order(int size) +static inline int calculate_order(int size, int reserved) { int order; int min_objects; @@ -2034,14 +2039,14 @@ static inline int calculate_order(int size) min_objects = slub_min_objects; if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); - max_objects = (PAGE_SIZE << slub_max_order)/size; + max_objects = order_objects(slub_max_order, size, reserved); min_objects = min(min_objects, max_objects); while (min_objects > 1) { fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, - slub_max_order, fraction); + slub_max_order, fraction, reserved); if (order <= slub_max_order) return order; fraction /= 2; @@ -2053,14 +2058,14 @@ static inline int calculate_order(int size) * We were unable to place multiple objects in a slab. Now * lets see if we can place a single object there. 
*/ - order = slab_order(size, 1, slub_max_order, 1); + order = slab_order(size, 1, slub_max_order, 1, reserved); if (order <= slub_max_order) return order; /* * Doh this slab cannot be placed using slub_max_order. */ - order = slab_order(size, 1, MAX_ORDER, 1); + order = slab_order(size, 1, MAX_ORDER, 1, reserved); if (order < MAX_ORDER) return order; return -ENOSYS; @@ -2311,7 +2316,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) if (forced_order >= 0) order = forced_order; else - order = calculate_order(size); + order = calculate_order(size, s->reserved); if (order < 0) return 0; @@ -2329,8 +2334,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) /* * Determine the number of objects per slab */ - s->oo = oo_make(order, size); - s->min = oo_make(get_order(size), size); + s->oo = oo_make(order, size, s->reserved); + s->min = oo_make(get_order(size), size, s->reserved); if (oo_objects(s->oo) > oo_objects(s->max)) s->max = s->oo; @@ -2349,6 +2354,7 @@ static int kmem_cache_open(struct kmem_cache *s, s->objsize = size; s->align = align; s->flags = kmem_cache_flags(size, flags, name, ctor); + s->reserved = 0; if (!calculate_sizes(s, -1)) goto error; @@ -4017,6 +4023,12 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(destroy_by_rcu); +static ssize_t reserved_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->reserved); +} +SLAB_ATTR_RO(reserved); + #ifdef CONFIG_SLUB_DEBUG static ssize_t slabs_show(struct kmem_cache *s, char *buf) { @@ -4303,6 +4315,7 @@ static struct attribute *slab_attrs[] = { &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, &shrink_attr.attr, + &reserved_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, &slabs_attr.attr, -- cgit v1.2.3 From da9a638c6f8fc0633fa94a334f1c053f5e307177 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 10 Mar 2011 15:22:00 +0800 Subject: slub,rcu: don't assume the size of struct rcu_head The size of struct rcu_head may be changed. When it becomes larger, it will pollute the page array. We reserve some some bytes for struct rcu_head when a slab is allocated in this situation. 
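A rough userspace sketch of the two mechanisms involved follows; the struct layouts and names are invented, and only objects_per_slab() mirrors the order_objects() arithmetic from the patch above. If the callback head no longer fits in the field it used to overlay, its size is carved off the end of the slab before computing how many objects fit.

#include <stdio.h>
#include <stddef.h>

struct fake_list_head { void *next, *prev; };
struct fake_rcu_head { void *next; void (*func)(struct fake_rcu_head *); void *pad; };

#define PAGE_SZ 4096UL

/* mirrors order_objects(): usable bytes divided by object size */
static unsigned long objects_per_slab(int order, size_t size, size_t reserved)
{
	return ((PAGE_SZ << order) - reserved) / size;
}

int main(void)
{
	size_t reserved = 0;

	if (sizeof(struct fake_list_head) < sizeof(struct fake_rcu_head))
		reserved = sizeof(struct fake_rcu_head);	/* can't overlay: reserve tail bytes */

	printf("reserved %zu bytes, %lu vs %lu objects per order-0 slab\n",
	       reserved,
	       objects_per_slab(0, 64, 0),
	       objects_per_slab(0, 64, reserved));
	return 0;
}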
Changed from V1: use VM_BUG_ON instead BUG_ON Acked-by: Christoph Lameter Signed-off-by: Lai Jiangshan Signed-off-by: Pekka Enberg --- mm/slub.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index d3d17677bab5..ebba3eb19369 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1254,21 +1254,38 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __free_pages(page, order); } +#define need_reserve_slab_rcu \ + (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) + static void rcu_free_slab(struct rcu_head *h) { struct page *page; - page = container_of((struct list_head *)h, struct page, lru); + if (need_reserve_slab_rcu) + page = virt_to_head_page(h); + else + page = container_of((struct list_head *)h, struct page, lru); + __free_slab(page->slab, page); } static void free_slab(struct kmem_cache *s, struct page *page) { if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { - /* - * RCU free overloads the RCU head over the LRU - */ - struct rcu_head *head = (void *)&page->lru; + struct rcu_head *head; + + if (need_reserve_slab_rcu) { + int order = compound_order(page); + int offset = (PAGE_SIZE << order) - s->reserved; + + VM_BUG_ON(s->reserved != sizeof(*head)); + head = page_address(page) + offset; + } else { + /* + * RCU free overloads the RCU head over the LRU + */ + head = (void *)&page->lru; + } call_rcu(head, rcu_free_slab); } else @@ -2356,6 +2373,9 @@ static int kmem_cache_open(struct kmem_cache *s, s->flags = kmem_cache_flags(size, flags, name, ctor); s->reserved = 0; + if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) + s->reserved = sizeof(struct rcu_head); + if (!calculate_sizes(s, -1)) goto error; if (disable_higher_order_debug) { -- cgit v1.2.3 From 5bfe53a77e8a3ffce4a10003c75f464a138e272d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 10 Mar 2011 15:22:24 +0800 Subject: slab,rcu: don't assume the size of struct rcu_head The size of struct rcu_head may be changed. When it becomes larger, it may pollute the data after struct slab. Acked-by: Christoph Lameter Signed-off-by: Lai Jiangshan Signed-off-by: Pekka Enberg --- mm/slab.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 37961d1f584f..52cf0b4634d4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -190,22 +190,6 @@ typedef unsigned int kmem_bufctl_t; #define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) -/* - * struct slab - * - * Manages the objs in a slab. Placed either at the beginning of mem allocated - * for a slab, or allocated from an general cache. - * Slabs are chained into three list: fully used, partial, fully free slabs. - */ -struct slab { - struct list_head list; - unsigned long colouroff; - void *s_mem; /* including colour offset */ - unsigned int inuse; /* num of objs active in slab */ - kmem_bufctl_t free; - unsigned short nodeid; -}; - /* * struct slab_rcu * @@ -219,8 +203,6 @@ struct slab { * * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. - * - * We assume struct slab_rcu can overlay struct slab when destroying. */ struct slab_rcu { struct rcu_head head; @@ -228,6 +210,27 @@ struct slab_rcu { void *addr; }; +/* + * struct slab + * + * Manages the objs in a slab. Placed either at the beginning of mem allocated + * for a slab, or allocated from an general cache. 
+ * Slabs are chained into three list: fully used, partial, fully free slabs. + */ +struct slab { + union { + struct { + struct list_head list; + unsigned long colouroff; + void *s_mem; /* including colour offset */ + unsigned int inuse; /* num of objs active in slab */ + kmem_bufctl_t free; + unsigned short nodeid; + }; + struct slab_rcu __slab_cover_slab_rcu; + }; +}; + /* * struct array_cache * -- cgit v1.2.3
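The same idea reduced to a userspace sketch; demo_slab and demo_rcu are stand-ins, not the kernel structures. Putting both layouts in one union removes the size assumption entirely, because the compiler sizes the container for whichever member is larger.

#include <stdio.h>

struct demo_rcu {
	void *next;
	void (*func)(struct demo_rcu *);
};

struct demo_slab {
	union {
		struct {			/* anonymous struct member (C11) */
			void *mem;
			unsigned int inuse;
			unsigned short nodeid;
		};
		struct demo_rcu rcu;		/* may freely grow; the union absorbs it */
	};
};

int main(void)
{
	printf("slab management struct is %zu bytes (rcu part %zu)\n",
	       sizeof(struct demo_slab), sizeof(struct demo_rcu));
	return 0;
}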