From f95ba941d1bee594d536cdcbf879a0865381b903 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 25 Jan 2011 15:07:11 -0800 Subject: mm/pgtable-generic.c: fix CONFIG_SWAP=n build mips (and sparc32): In file included from arch/mips/include/asm/tlb.h:21, from mm/pgtable-generic.c:9: include/asm-generic/tlb.h: In function `tlb_flush_mmu': include/asm-generic/tlb.h:76: error: implicit declaration of function `release_pages' include/asm-generic/tlb.h: In function `tlb_remove_page': include/asm-generic/tlb.h:105: error: implicit declaration of function `page_cache_release' free_pages_and_swap_cache() and free_page_and_swap_cache() are macros which call release_pages() and page_cache_release(). The obvious fix is to include pagemap.h in swap.h, where those macros are defined. But that breaks sparc for weird reasons. So fix it within mm/pgtable-generic.c instead. Reported-by: Yoichi Yuasa Cc: Geert Uytterhoeven Acked-by: Sam Ravnborg Cc: Sergei Shtylyov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pgtable-generic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 0369f5b3ba1b..eb663fb533e0 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -6,6 +6,7 @@ * Copyright (C) 2010 Linus Torvalds */ +#include <linux/pagemap.h> #include <asm/tlb.h> #include <asm-generic/pgtable.h> -- cgit v1.2.3 From f33261d75b88f55a08e6a9648cef73509979bfba Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 25 Jan 2011 15:07:20 -0800 Subject: mm: fix deferred congestion timeout if preferred zone is not allowed Before 0e093d99763e ("writeback: do not sleep on the congestion queue if there are no congested BDIs or if significant congestion is not being encountered in the current zone"), preferred_zone was only used for NUMA statistics, to determine the zoneidx from which to allocate from given the type requested, and whether to utilize memory compaction. wait_iff_congested(), though, uses preferred_zone to determine if the congestion wait should be deferred because its dirty pages are backed by a congested bdi. This incorrectly defers the timeout and busy loops in the page allocator with various cond_resched() calls if preferred_zone is not allowed in the current context, usually consuming 100% of a cpu. This patch ensures preferred_zone is an allowed zone in the fastpath depending on whether current is constrained by its cpuset or nodes in its mempolicy (when the nodemask passed is non-NULL). This is correct since the fastpath allocation always passes ALLOC_CPUSET when trying to allocate memory. In the slowpath, this patch resets preferred_zone to the first zone of the allowed type when the allocation is not constrained by current's cpuset, i.e. it does not pass ALLOC_CPUSET. This patch also ensures preferred_zone is from the set of allowed nodes when called from within direct reclaim since allocations are always constrained by cpusets in this context (it is blockable). Both of these uses of cpuset_current_mems_allowed are protected by get_mems_allowed().
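The hunks that follow rely on GCC's two-operand conditional, nodemask ? : &cpuset_current_mems_allowed, which falls back to the cpuset mask when no nodemask was passed. A minimal user-space sketch of that shorthand (illustrative names only, not kernel code):

#include <stdio.h>

/* pick() stands in for the zonelist lookup: prefer the caller's nodemask,
 * otherwise fall back to the cpuset's allowed set. "a ? : b" is the GNU
 * shorthand for "a ? a : b" with a evaluated only once. */
static const char *pick(const char *nodemask, const char *cpuset_mems)
{
	return nodemask ? : cpuset_mems;
}

int main(void)
{
	printf("%s\n", pick(NULL, "cpuset_current_mems_allowed"));	/* falls back */
	printf("%s\n", pick("caller nodemask", "cpuset_current_mems_allowed"));
	return 0;
}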
Signed-off-by: David Rientjes Cc: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 +++++++++++- mm/vmscan.c | 3 ++- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 90c1439549fd..f4967910c967 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2034,6 +2034,14 @@ restart: */ alloc_flags = gfp_to_alloc_flags(gfp_mask); + /* + * Find the true preferred zone if the allocation is unconstrained by + * cpusets. + */ + if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) + first_zones_zonelist(zonelist, high_zoneidx, NULL, + &preferred_zone); + /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -2192,7 +2200,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, get_mems_allowed(); /* The preferred zone is used for statistics later */ - first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); + first_zones_zonelist(zonelist, high_zoneidx, + nodemask ? : &cpuset_current_mems_allowed, + &preferred_zone); if (!preferred_zone) { put_mems_allowed(); return NULL; diff --git a/mm/vmscan.c b/mm/vmscan.c index f5d90dedebba..148c6e630df2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2083,7 +2083,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zone *preferred_zone; first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), - NULL, &preferred_zone); + &cpuset_current_mems_allowed, + &preferred_zone); wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); } } -- cgit v1.2.3 From 2ff754fa8f416e82327f2d8f1354a033b66286df Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 25 Jan 2011 15:07:23 -0800 Subject: mm: clear pages_scanned only if draining a pcp adds pages to the buddy allocator Commit 0e093d99763e ("writeback: do not sleep on the congestion queue if there are no congested BDIs or if significant congestion is not being encountered in the current zone") uncovered a livelock in the page allocator that resulted in tasks infinitely looping trying to find memory and kswapd running at 100% cpu. The issue occurs because drain_all_pages() is called immediately following direct reclaim when no memory is freed and try_to_free_pages() returns non-zero because all zones in the zonelist do not have their all_unreclaimable flag set. When draining the per-cpu pagesets back to the buddy allocator for each zone, the zone->pages_scanned counter is cleared to avoid erroneously setting zone->all_unreclaimable later. The problem is that no pages may actually be drained and, thus, the unreclaimable logic never fails direct reclaim so the oom killer may be invoked. This apparently only manifested after wait_iff_congested() was introduced and the zone was full of anonymous memory that would not congest the backing store. The page allocator would infinitely loop if there were no other tasks waiting to be scheduled and clear zone->pages_scanned because of drain_all_pages() as the result of this change before kswapd could scan enough pages to trigger the reclaim logic. Additionally, with every loop of the page allocator and in the reclaim path, kswapd would be kicked and would end up running at 100% cpu. 
In this scenario, current and kswapd are all running continuously with kswapd incrementing zone->pages_scanned and current clearing it. The problem is even more pronounced when current swaps some of its memory to swap cache and the reclaimable logic then considers all active anonymous memory in the all_unreclaimable logic, which requires a much higher zone->pages_scanned value for try_to_free_pages() to return zero that is never attainable in this scenario. Before wait_iff_congested(), the page allocator would incur an unconditional timeout and allow kswapd to elevate zone->pages_scanned to a level that the oom killer would be called the next time it loops. The fix is to only attempt to drain pcp pages if there is actually a quantity to be drained. The unconditional clearing of zone->pages_scanned in free_pcppages_bulk() need not be changed since other callers already ensure that draining will occur. This patch ensures that free_pcppages_bulk() will actually free memory before calling into it from drain_all_pages() so zone->pages_scanned is only cleared if appropriate. Signed-off-by: David Rientjes Cc: Mel Gorman Reviewed-by: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f4967910c967..a873e61e312e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1088,8 +1088,10 @@ static void drain_pages(unsigned int cpu) pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - free_pcppages_bulk(zone, pcp->count, pcp); - pcp->count = 0; + if (pcp->count) { + free_pcppages_bulk(zone, pcp->count, pcp); + pcp->count = 0; + } local_irq_restore(flags); } } -- cgit v1.2.3 From 8dba474f034c322d96ada39cb20cac711d80dcb2 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Tue, 25 Jan 2011 15:07:24 -0800 Subject: mm/memcontrol.c: fix uninitialized variable use in mem_cgroup_move_parent() In mm/memcontrol.c::mem_cgroup_move_parent() there's a path that jumps to the 'put_back' label ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, charge); if (ret || !parent) goto put_back; where we'll if (charge > PAGE_SIZE) compound_unlock_irqrestore(page, flags); but, we have not assigned anything to 'flags' at this point, nor have we called 'compound_lock_irqsave()' (which is what sets 'flags'). The 'put_back' label should be moved below the call to compound_unlock_irqrestore() as per this patch. Signed-off-by: Jesper Juhl Cc: Balbir Singh Cc: Daisuke Nishimura Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelianov Cc: "Kirill A. 
Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index db76ef726293..4fcf47a62550 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2292,9 +2292,10 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, ret = mem_cgroup_move_account(pc, child, parent, true, charge); if (ret) mem_cgroup_cancel_charge(parent, charge); -put_back: + if (charge > PAGE_SIZE) compound_unlock_irqrestore(page, flags); +put_back: putback_lru_page(page); put: put_page(page); -- cgit v1.2.3 From 33a938774fdb9933e9c77504b035f4f87c0859df Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 25 Jan 2011 15:07:25 -0800 Subject: mm: compaction: don't depend on HUGETLB_PAGE Commit 5d6892407 ("thp: select CONFIG_COMPACTION if TRANSPARENT_HUGEPAGE enabled") causes this warning during the configuration process: warning: (TRANSPARENT_HUGEPAGE) selects COMPACTION which has unmet direct dependencies (EXPERIMENTAL && HUGETLB_PAGE && MMU) COMPACTION doesn't depend on HUGETLB_PAGE, it doesn't depend on THP either, it is also useful for regular alloc_pages(order > 0) including the very kernel stack during fork (THREAD_ORDER = 1). It's always better to enable COMPACTION. The warning should be an error because we would end up with MIGRATION not selected, and COMPACTION wouldn't work without migration (despite it seems to build with an inline migrate_pages returning -ENOSYS). I'd also like to remove EXPERIMENTAL: compaction has been in the kernel for some releases (for full safety the default remains disabled which I think is enough). Signed-off-by: Andrea Arcangeli Reported-by: Luca Tettamanti Tested-by: Luca Tettamanti Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 3ad483bdf505..e9c0c61f2ddd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS config COMPACTION bool "Allow for memory compaction" select MIGRATION - depends on EXPERIMENTAL && HUGETLB_PAGE && MMU + depends on MMU help Allows the compaction of memory for the allocation of huge pages. -- cgit v1.2.3 From 28bd65781c848d95ba6a7f58b5c4b8265a804ec6 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 25 Jan 2011 15:07:26 -0800 Subject: mm: migration: clarify migrate_pages() comment Callers of migrate_pages should putback_lru_pages to return pages isolated to LRU or free list. Now comment is rather confusing. It says caller always have to call it. It is more clear to point out that the caller has to call it if migrate_pages's return value isn't zero. Signed-off-by: Minchan Kim Cc: Christoph Lameter Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 46fe8cc13d67..9f29a3b7aac2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -888,7 +888,7 @@ out: * are movable anymore because to has become empty * or no retryable pages exist anymore. * Caller should call putback_lru_pages to return pages to the LRU - * or free list. + * or free list only if ret != 0. * * Return: Number of pages not migrated or error code. 
*/ -- cgit v1.2.3 From 01c88e2d6b7330c0cc5867fe2297e7d826e1337d Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 25 Jan 2011 15:07:27 -0800 Subject: memcg: fix account leak at failure of memsw accounting Commit 4b53433468 ("memcg: clean up try_charge main loop") removes a cancel of charge at case: memory charge-> success. mem+swap charge-> failure. This leaks usage of memory. Fix it. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Johannes Weiner Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: [2.6.36+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4fcf47a62550..1eb1a04f874c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1832,6 +1832,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, if (likely(!ret)) return CHARGE_OK; + res_counter_uncharge(&mem->res, csize); mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else -- cgit v1.2.3 From 3d37c4a9199920964ffdfaec6335d93b9dcf9ca5 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 25 Jan 2011 15:07:28 -0800 Subject: memcg: bugfix check mem_cgroup_disabled() at split fixup mem_cgroup_disabled() should be checked at splitting. If disabled, no heavy work is necessary. Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Daisuke Nishimura Reviewed-by: Johannes Weiner Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1eb1a04f874c..8ab1d42664fb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2145,6 +2145,8 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) struct page_cgroup *tail_pc = lookup_page_cgroup(tail); unsigned long flags; + if (mem_cgroup_disabled()) + return; /* * We have no races with charge/uncharge but will have races with * page state accounting. -- cgit v1.2.3 From 52dbb9050936fd33ceb45f10529dbc992507c058 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 25 Jan 2011 15:07:29 -0800 Subject: memcg: fix race at move_parent around compound_order() Fix up mem_cgroup_move_parent(), which uses compound_order() in an asynchronous manner. This compound_order() may return an unknown value because we don't take the lock. Use PageTransHuge() and HPAGE_SIZE instead of it. Also clean up mem_cgroup_move_parent(). - remove unnecessary initialization of local variable. - rename charge_size -> page_size - remove unnecessary (wrong) comment. - added a comment about THP. Note: the current design takes compound_page_lock() in the caller of move_account(). This should be revisited when we implement direct move_task of hugepage without splitting. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Johannes Weiner Acked-by: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8ab1d42664fb..3878cfe399dc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2236,7 +2236,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, { int ret = -EINVAL; unsigned long flags; - + /* + * The page is isolated from LRU. So, collapse function + * will not handle this page. But page splitting can happen.
+ * Do this check under compound_page_lock(). The caller should + * hold it. + */ if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page)) return -EBUSY; @@ -2268,7 +2273,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, struct cgroup *cg = child->css.cgroup; struct cgroup *pcg = cg->parent; struct mem_cgroup *parent; - int charge = PAGE_SIZE; + int page_size = PAGE_SIZE; unsigned long flags; int ret; @@ -2281,22 +2286,24 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, goto out; if (isolate_lru_page(page)) goto put; - /* The page is isolated from LRU and we have no race with splitting */ - charge = PAGE_SIZE << compound_order(page); + + if (PageTransHuge(page)) + page_size = HPAGE_SIZE; parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, charge); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, + &parent, false, page_size); if (ret || !parent) goto put_back; - if (charge > PAGE_SIZE) + if (page_size > PAGE_SIZE) flags = compound_lock_irqsave(page); - ret = mem_cgroup_move_account(pc, child, parent, true, charge); + ret = mem_cgroup_move_account(pc, child, parent, true, page_size); if (ret) - mem_cgroup_cancel_charge(parent, charge); + mem_cgroup_cancel_charge(parent, page_size); - if (charge > PAGE_SIZE) + if (page_size > PAGE_SIZE) compound_unlock_irqrestore(page, flags); put_back: putback_lru_page(page); -- cgit v1.2.3 From 0a08739e81671de2cb690774937fe510c000b27f Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sat, 30 Oct 2010 23:43:05 +0200 Subject: kmemleak: remove memset by using kzalloc We don't need to memset if we just use kzalloc() rather than kmalloc() in kmemleak_test_init(). Signed-off-by: Jesper Juhl Reviewed-by: Minchan Kim Signed-off-by: Catalin Marinas --- mm/kmemleak-test.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index 177a5169bbde..ff0d9779cec8 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void) * after the module is removed. */ for (i = 0; i < 10; i++) { - elem = kmalloc(sizeof(*elem), GFP_KERNEL); - pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem); + elem = kzalloc(sizeof(*elem), GFP_KERNEL); + pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem); if (!elem) return -ENOMEM; - memset(elem, 0, sizeof(*elem)); INIT_LIST_HEAD(&elem->list); - list_add_tail(&elem->list, &test_list); } -- cgit v1.2.3 From 6ae4bd1f0bc479984f30061b5e5116060c24a267 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 27 Jan 2011 10:30:26 +0000 Subject: kmemleak: Allow kmemleak metadata allocations to fail This patch adds __GFP_NORETRY and __GFP_NOMEMALLOC flags to the kmemleak metadata allocations so that it has a smaller effect on the users of the kernel slab allocator. Since kmemleak allocations can now fail more often, this patch also reduces the verbosity by passing __GFP_NOWARN and not dumping the stack trace when a kmemleak allocation fails. 
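As a rough illustration of what gfp_kmemleak_mask() in the hunk below does -- keep only the caller's GFP_KERNEL/GFP_ATOMIC bits and OR in the "fail quietly" modifiers -- here is a small user-space sketch; the flag values are invented for the demo and are not the kernel's:

#include <stdio.h>

#define DEMO_GFP_KERNEL     0x01u
#define DEMO_GFP_ATOMIC     0x02u
#define DEMO_GFP_NORETRY    0x10u
#define DEMO_GFP_NOMEMALLOC 0x20u
#define DEMO_GFP_NOWARN     0x40u

/* Mirror of the masking pattern: restrict to the caller's sleep/atomic
 * subset, then add the modifiers that make the allocation give up early
 * and without warnings. */
#define demo_kmemleak_mask(gfp) \
	(((gfp) & (DEMO_GFP_KERNEL | DEMO_GFP_ATOMIC)) | \
	 DEMO_GFP_NORETRY | DEMO_GFP_NOMEMALLOC | DEMO_GFP_NOWARN)

int main(void)
{
	unsigned int caller = DEMO_GFP_ATOMIC | 0x80u;	/* 0x80: some unrelated caller flag */
	printf("masked: 0x%02x\n", demo_kmemleak_mask(caller));	/* 0x72: atomic bit kept, 0x80 dropped */
	return 0;
}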
Signed-off-by: Catalin Marinas Reported-by: Toralf Förster Acked-by: Pekka Enberg Acked-by: David Rientjes Cc: Ted Ts'o --- mm/kmemleak.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index bd9bc214091b..84225f3b7190 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -113,7 +113,9 @@ #define BYTES_PER_POINTER sizeof(void *) /* GFP bitmask for kmemleak internal allocations */ -#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC) +#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ + __GFP_NORETRY | __GFP_NOMEMALLOC | \ + __GFP_NOWARN) /* scanning area inside a memory block */ struct kmemleak_scan_area { @@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, struct kmemleak_object *object; struct prio_tree_node *node; - object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); + object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { - kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); + pr_warning("Cannot allocate a kmemleak_object structure\n"); + kmemleak_disable(); return NULL; } @@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) return; } - area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); + area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); if (!area) { - kmemleak_warn("Cannot allocate a scan area\n"); + pr_warning("Cannot allocate a scan area\n"); goto out; } -- cgit v1.2.3 From fdf4c587a793ba87935e38e7f25a9540bc9a7b95 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 31 Jan 2011 17:03:41 -0800 Subject: mlock: operate on any regions with protection != PROT_NONE As Tao Ma noticed, change 5ecfda0 breaks blktrace. This is because blktrace mmaps a file with PROT_WRITE permissions but without PROT_READ, so my attempt to not unnecessarity break COW during mlock ended up causing mlock to fail with a permission problem. I am proposing to let mlock ignore vma protection in all cases except PROT_NONE. In particular, mlock should not fail for PROT_WRITE regions (as in the blktrace case, which broke at 5ecfda0) or for PROT_EXEC regions (which seem to me like they were always broken). Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- mm/mlock.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 13e81ee8be9d..c3924c7f00be 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -178,6 +178,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; + /* + * We want mlock to succeed for regions that have any permissions + * other than PROT_NONE. + */ + if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) + gup_flags |= FOLL_FORCE; + if (vma->vm_flags & VM_LOCKED) gup_flags |= FOLL_MLOCK; -- cgit v1.2.3 From fceda1bf498677501befc7da72fd2e4de7f18466 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 1 Feb 2011 15:52:30 -0800 Subject: memsw: handle swapaccount kernel parameter correctly __setup based kernel command line parameters handlers which are handled in obsolete_checksetup are provided with the parameter value including = (more precisely everything right after the parameter name). This means that the current implementation of swapaccount[=1|0] doesn't work at all because if there is a value for the parameter then we are testing for "0" resp. 
"1" but we are getting "=0" resp. "=1" and if there is no parameter value we are getting an empty string rather than NULL. The original noswapccount parameter, which doesn't care about the value, works correctly. Signed-off-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3878cfe399dc..44f9f9c89f0c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5024,9 +5024,9 @@ struct cgroup_subsys mem_cgroup_subsys = { static int __init enable_swap_account(char *s) { /* consider enabled if no parameter or 1 is given */ - if (!s || !strcmp(s, "1")) + if (!(*s) || !strcmp(s, "=1")) really_do_swap_account = 1; - else if (!strcmp(s, "0")) + else if (!strcmp(s, "=0")) really_do_swap_account = 0; return 1; } @@ -5034,7 +5034,7 @@ __setup("swapaccount", enable_swap_account); static int __init disable_swap_account(char *s) { - enable_swap_account("0"); + enable_swap_account("=0"); return 1; } __setup("noswapaccount", disable_swap_account); -- cgit v1.2.3 From 552b372ba9db85751e7db2998f07cca2e51f5865 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 1 Feb 2011 15:52:31 -0800 Subject: memsw: deprecate noswapaccount kernel parameter and schedule it for removal noswapaccount couldn't be used to control memsw for both on/off cases so we have added swapaccount[=0|1] parameter. This way we can turn the feature in two ways noswapaccount resp. swapaccount=0. We have kept the original noswapaccount but I think we should remove it after some time as it just makes more command line parameters without any advantages and also the code to handle parameters is uglier if we want both parameters. Signed-off-by: Michal Hocko Requested-by: KAMEZAWA Hiroyuki Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/feature-removal-schedule.txt | 16 ++++++++++++++++ mm/memcontrol.c | 1 + 2 files changed, 17 insertions(+) (limited to 'mm') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index b959659c5df4..b3f35e5f9c95 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -603,3 +603,19 @@ Why: The adm9240, w83792d and w83793 hardware monitoring drivers have Who: Jean Delvare ---------------------------- + +What: noswapaccount kernel command line parameter +When: 2.6.40 +Why: The original implementation of memsw feature enabled by + CONFIG_CGROUP_MEM_RES_CTLR_SWAP could be disabled by the noswapaccount + kernel parameter (introduced in 2.6.29-rc1). Later on, this decision + turned out to be not ideal because we cannot have the feature compiled + in and disabled by default and let only interested to enable it + (e.g. general distribution kernels might need it). Therefore we have + added swapaccount[=0|1] parameter (introduced in 2.6.37) which provides + the both possibilities. If we remove noswapaccount we will have + less command line parameters with the same functionality and we + can also cleanup the parameter handling a bit (). 
+Who: Michal Hocko + +---------------------------- diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 44f9f9c89f0c..79abb1fd39d2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5034,6 +5034,7 @@ __setup("swapaccount", enable_swap_account); static int __init disable_swap_account(char *s) { + printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n"); enable_swap_account("=0"); return 1; } -- cgit v1.2.3 From 57fc4a5ee322cde96c33f101d3c2d3b79011c05c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 1 Feb 2011 15:52:32 -0800 Subject: mm: when migrate_pages returns 0, all pages must have been released In some cases migrate_pages could return zero while still leaving a few pages in the pagelist (and some caller wouldn't notice it has to call putback_lru_pages after commit cf608ac19c9 ("mm: compaction: fix COMPACTPAGEFAILED counting")). Add one missing putback_lru_pages not added by commit cf608ac19c95 ("mm: compaction: fix COMPACTPAGEFAILED counting"). Signed-off-by: Andrea Arcangeli Signed-off-by: Minchan Kim Reviewed-by: Minchan Kim Cc: Christoph Lameter Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 1 + mm/migrate.c | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 548fbd70f026..75398b0bfede 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1419,6 +1419,7 @@ int soft_offline_page(struct page *page, int flags) ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, true); if (ret) { + putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) diff --git a/mm/migrate.c b/mm/migrate.c index 9f29a3b7aac2..155a2e9a8059 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -772,6 +772,7 @@ uncharge: unlock: unlock_page(page); +move_newpage: if (rc != -EAGAIN) { /* * A page that has been migrated has all references * to it released. Wait until the other callers are done. ... putback_lru_page(page); } -move_newpage: - /* * Move the new page to the LRU. If migration was not successful * then this will free the page. -- cgit v1.2.3 From 48db54ee2f41e8ae2faf330b55db34a9fffb5b3c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 1 Feb 2011 15:52:33 -0800 Subject: mm/migration: fix page corruption during hugepage migration If migrate_huge_page by memory-failure fails, it calls put_page itself to decrease the page reference, and the caller of migrate_huge_page also calls putback_lru_pages. This can double free the page and cause page corruption for the page holder. In addition, cleaning up the pages in the caller is consistent behavior with migrate_pages after cf608ac19c ("mm: compaction: fix COMPACTPAGEFAILED counting").
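The replacement cleanup in soft_offline_huge_page() below walks the leftover list with list_for_each_entry_safe() and drops each reference exactly once, in the caller. A minimal user-space analogue of that walk-and-release pattern (plain C list, not the kernel's <linux/list.h>; names are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct node { int pfn; struct node *next; };

/* Release every entry left on the list. The successor is saved before
 * the entry is freed, which is exactly why the kernel walk uses the
 * "_safe" iterator; free() stands in for put_page(). */
static void release_all(struct node *head)
{
	struct node *cur = head, *next;

	while (cur) {
		next = cur->next;
		printf("releasing pfn %d\n", cur->pfn);
		free(cur);
		cur = next;
	}
}

int main(void)
{
	struct node *head = NULL;

	for (int pfn = 3; pfn > 0; pfn--) {
		struct node *n = malloc(sizeof(*n));
		if (!n)
			return 1;
		n->pfn = pfn;
		n->next = head;
		head = n;
	}
	release_all(head);
	return 0;
}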
Signed-off-by: Minchan Kim Cc: Andrea Arcangeli Cc: Christoph Lameter Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 5 ++++- mm/migrate.c | 4 ---- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 75398b0bfede..237aaa488f4e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1295,7 +1295,10 @@ static int soft_offline_huge_page(struct page *page, int flags) ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, true); if (ret) { - putback_lru_pages(&pagelist); + struct page *page1, *page2; + list_for_each_entry_safe(page1, page2, &pagelist, lru) + put_page(page1); + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) diff --git a/mm/migrate.c b/mm/migrate.c index 155a2e9a8059..766115253807 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -980,10 +980,6 @@ int migrate_huge_pages(struct list_head *from, } rc = 0; out: - - list_for_each_entry_safe(page, page2, from, lru) - put_page(page); - if (rc) return rc; -- cgit v1.2.3 From efeda7a41e09efce506a68c3549b60b16dd7dedd Mon Sep 17 00:00:00 2001 From: Jin Dongming Date: Tue, 1 Feb 2011 15:52:39 -0800 Subject: thp: fix splitting of hwpoisoned hugepages The poisoned THP is now split with split_huge_page() in collect_procs_anon(). If kmalloc() is failed in collect_procs(), split_huge_page() could not be called. And the work after split_huge_page() for collecting the processes using poisoned page will not be done, too. So the processes using the poisoned page could not be killed. The condition becomes worse when CONFIG_DEBUG_VM == "Y". Because the poisoned THP could not be split, system panic will be caused by VM_BUG_ON(PageTransHuge(page)) in try_to_unmap(). This patch does: 1. move split_huge_page() to the place before collect_procs(). This can be sure the failure of splitting THP is caused by itself. 2. when splitting THP is failed, stop the operations after it. This can avoid unexpected system panic or non sense works. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jin Dongming Reviewed-by: Hidetoshi Seto Cc: Andrea Arcangeli Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 237aaa488f4e..1e9c30b241c3 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -386,8 +386,6 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct task_struct *tsk; struct anon_vma *av; - if (!PageHuge(page) && unlikely(split_huge_page(page))) - return; read_lock(&tasklist_lock); av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ @@ -896,6 +894,34 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } } + if (PageTransHuge(hpage)) { + /* + * Verify that this isn't a hugetlbfs head page, the check for + * PageAnon is just for avoid tripping a split_huge_page + * internal debug check, as split_huge_page refuses to deal with + * anything that isn't an anon page. PageAnon can't go away fro + * under us because we hold a refcount on the hpage, without a + * refcount on the hpage. split_huge_page can't be safely called + * in the first place, having a refcount on the tail isn't + * enough * to be safe. 
+ */ + if (!PageHuge(hpage) && PageAnon(hpage)) { + if (unlikely(split_huge_page(hpage))) { + /* + * FIXME: if splitting THP is failed, it is + * better to stop the following operation rather + * than causing panic by unmapping. System might + * survive if the page is freed later. + */ + printk(KERN_INFO + "MCE %#lx: failed to split THP\n", pfn); + + BUG_ON(!PageHWPoison(p)); + return SWAP_FAIL; + } + } + } + /* * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, -- cgit v1.2.3 From a6d30dddae4648837be5a0c0cb2c0ae9ad0377db Mon Sep 17 00:00:00 2001 From: Jin Dongming Date: Tue, 1 Feb 2011 15:52:40 -0800 Subject: thp: fix the wrong reported address of hwpoisoned hugepages When the tail page of THP is poisoned, the head page will be poisoned too. And the wrong address, address of head page, will be sent with sigbus always. So when the poisoned page is used by Guest OS which is running on KVM, after the address changing(hva->gpa) by qemu, the unexpected process on Guest OS will be killed by sigbus. What we expected is that the process using the poisoned tail page could be killed on Guest OS, but not that the process using the healthy head page is killed. Since it is not good to poison the healthy page, avoid poisoning other than the page which is really poisoned. (While we poison all pages in a huge page in case of hugetlb, we can do this for THP thanks to split_huge_page().) Here we fix two parts: 1. Isolate the poisoned page only to make sure the reported address is the address of poisoned page. 2. make the poisoned page work as the poisoned regular page. [akpm@linux-foundation.org: fix spello in comment] Signed-off-by: Jin Dongming Reviewed-by: Hidetoshi Seto Cc: Andrea Arcangeli Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 7 ++++++- mm/memory-failure.c | 27 ++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e187454d82f6..b6c1ce3c53b5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1162,7 +1162,12 @@ static void __split_huge_page_refcount(struct page *page) /* after clearing PageTail the gup refcount can be released */ smp_mb(); - page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + /* + * retain hwpoison flag of the poisoned tail page: + * fix for the unsuitable process killed on Guest Machine(KVM) + * by the memory-failure. + */ + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; page_tail->flags |= (page->flags & ((1L << PG_referenced) | (1L << PG_swapbacked) | diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1e9c30b241c3..04158d6f44d4 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -854,6 +854,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int ret; int kill = 1; struct page *hpage = compound_head(p); + struct page *ppage; if (PageReserved(p) || PageSlab(p)) return SWAP_SUCCESS; @@ -894,6 +895,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } } + /* + * ppage: poisoned page + * if p is regular page(4k page) + * ppage == real poisoned page; + * else p is hugetlb or THP, ppage == head page. 
+ */ + ppage = hpage; + if (PageTransHuge(hpage)) { /* * Verify that this isn't a hugetlbfs head page, the check for @@ -919,6 +928,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, BUG_ON(!PageHWPoison(p)); return SWAP_FAIL; } + /* THP is split, so ppage should be the real poisoned page. */ + ppage = p; } } @@ -931,12 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(hpage, &tokill); + collect_procs(ppage, &tokill); - ret = try_to_unmap(hpage, ttu); + if (hpage != ppage) + lock_page_nosync(ppage); + + ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(hpage)); + pfn, page_mapcount(ppage)); + + if (hpage != ppage) + unlock_page(ppage); /* * Now that the dirty bit has been propagated to the @@ -947,7 +964,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, + kill_procs_ao(&tokill, !!PageDirty(ppage), trapno, ret != SWAP_SUCCESS, p, pfn); return ret; @@ -1090,7 +1107,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * For error on the tail page, we should set PG_hwpoison * on the head page to show that the hugepage is hwpoisoned */ - if (PageTail(p) && TestSetPageHWPoison(hpage)) { + if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { action_result(pfn, "hugepage already hardware poisoned", IGNORED); unlock_page(hpage); -- cgit v1.2.3 From af241a083404acda7ba3690e5b7697949d729fcc Mon Sep 17 00:00:00 2001 From: Jin Dongming Date: Tue, 1 Feb 2011 15:52:41 -0800 Subject: thp: fix unsuitable behavior for hwpoisoned tail page When a tail page of THP is poisoned, memory-failure will do nothing except setting PG_hwpoison, while the expected behavior is that the process, who is using the poisoned tail page, should be killed. The above problem is caused by lru check of the poisoned tail page of THP. Because PG_lru flag is only set on the head page of THP, the check always consider the poisoned tail page as NON lru page. So the lru check for the tail page of THP should be avoided, as like as hugetlb. This patch adds !PageTransCompound() before lru check for THP, because of the check (!PageHuge() && !PageTransCompound()) the whole branch could be optimized away at build time when both hugetlbfs and THP are set with "N" (or in archs not supporting either of those). [akpm@linux-foundation.org: fix unrelated typo in shake_page() comment] Signed-off-by: Jin Dongming Reviewed-by: Hidetoshi Seto Cc: Andrea Arcangeli Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 04158d6f44d4..0207c2f6f8bd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -233,8 +233,8 @@ void shake_page(struct page *p, int access) } /* - * Only all shrink_slab here (which would also - * shrink other caches) if access is not potentially fatal. + * Only call shrink_slab here (which would also shrink other caches) if + * access is not potentially fatal. 
*/ if (access) { int nr; @@ -1065,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - if (!PageLRU(p) && !PageHuge(p)) - shake_page(p, 0); - if (!PageLRU(p) && !PageHuge(p)) { - /* - * shake_page could have turned it free. - */ - if (is_free_buddy_page(p)) { - action_result(pfn, "free buddy, 2nd try", DELAYED); - return 0; + if (!PageHuge(p) && !PageTransCompound(p)) { + if (!PageLRU(p)) + shake_page(p, 0); + if (!PageLRU(p)) { + /* + * shake_page could have turned it free. + */ + if (is_free_buddy_page(p)) { + action_result(pfn, "free buddy, 2nd try", + DELAYED); + return 0; + } + action_result(pfn, "non LRU", IGNORED); + put_page(p); + return -EBUSY; } - action_result(pfn, "non LRU", IGNORED); - put_page(p); - return -EBUSY; } /* -- cgit v1.2.3 From 9221edb7120e2dc3ae90f1c58514979f7ba40e46 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 1 Feb 2011 15:52:42 -0800 Subject: memcg: prevent endless loop when charging huge pages The charging code can encounter a charge size that is bigger than a regular page in two situations: one is a batched charge to fill the per-cpu stocks, the other is a huge page charge. This code is distributed over two functions, however, and only the outer one is aware of huge pages. In case the charging fails, the inner function will tell the outer function to retry if the charge size is bigger than regular pages--assuming batched charging is the only case. And the outer function will retry forever charging a huge page. This patch makes sure the inner function can distinguish between batch charging and a single huge page charge. It will only signal another attempt if batch charging failed, and go into regular reclaim when it is called on behalf of a huge page. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 79abb1fd39d2..50eb50e100fd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1837,8 +1837,15 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); - - if (csize > PAGE_SIZE) /* change csize and retry */ + /* + * csize can be either a huge page (HPAGE_SIZE), a batch of + * regular pages (CHARGE_SIZE), or a single regular page + * (PAGE_SIZE). + * + * Never reclaim on behalf of optional batching, retry with a + * single page instead. + */ + if (csize == CHARGE_SIZE) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) -- cgit v1.2.3 From 19942822df65ee4a47c2e6d6d70cace1b7f01710 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 1 Feb 2011 15:52:43 -0800 Subject: memcg: prevent endless loop when charging huge pages to near-limit group If reclaim after a failed charging was unsuccessful, the limits are checked again, just in case they settled by means of other tasks. This is all fine as long as every charge is of size PAGE_SIZE, because in that case, being below the limit means having at least PAGE_SIZE bytes available. 
But with transparent huge pages, we may end up in an endless loop where charging and reclaim fail, but we keep going because the limits are not yet exceeded, although not allowing for a huge page. Fix this up by explicitely checking for enough room, not just whether we are within limits. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/res_counter.h | 20 ++++++++++++++++++++ mm/memcontrol.c | 35 ++++++++++++++++++++++++++++------- 2 files changed, 48 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index fcb9884df618..a5930cb66145 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -182,6 +182,26 @@ static inline bool res_counter_check_under_limit(struct res_counter *cnt) return ret; } +/** + * res_counter_check_margin - check if the counter allows charging + * @cnt: the resource counter to check + * @bytes: the number of bytes to check the remaining space against + * + * Returns a boolean value on whether the counter can be charged + * @bytes or whether this would exceed the limit. + */ +static inline bool res_counter_check_margin(struct res_counter *cnt, + unsigned long bytes) +{ + bool ret; + unsigned long flags; + + spin_lock_irqsave(&cnt->lock, flags); + ret = cnt->limit - cnt->usage >= bytes; + spin_unlock_irqrestore(&cnt->lock, flags); + return ret; +} + static inline bool res_counter_check_under_soft_limit(struct res_counter *cnt) { bool ret; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 50eb50e100fd..0e81eb5f0aea 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1111,6 +1111,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) return false; } +/** + * mem_cgroup_check_margin - check if the memory cgroup allows charging + * @mem: memory cgroup to check + * @bytes: the number of bytes the caller intends to charge + * + * Returns a boolean value on whether @mem can be charged @bytes or + * whether this would exceed the limit. + */ +static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes) +{ + if (!res_counter_check_margin(&mem->res, bytes)) + return false; + if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes)) + return false; + return true; +} + static unsigned int get_swappiness(struct mem_cgroup *memcg) { struct cgroup *cgrp = memcg->css.cgroup; @@ -1852,15 +1869,19 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, return CHARGE_WOULDBLOCK; ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags); + gfp_mask, flags); + if (mem_cgroup_check_margin(mem_over_limit, csize)) + return CHARGE_RETRY; /* - * try_to_free_mem_cgroup_pages() might not give us a full - * picture of reclaim. Some pages are reclaimed and might be - * moved to swap cache or just unmapped from the cgroup. - * Check the limit again to see if the reclaim reduced the - * current usage of the cgroup before giving up + * Even though the limit is exceeded at this point, reclaim + * may have been able to free some pages. Retry the charge + * before killing the task. + * + * Only for regular pages, though: huge pages are rather + * unlikely to succeed so close to the limit, and we fall back + * to regular pages anyway in case of failure. 
*/ - if (ret || mem_cgroup_check_under_limit(mem_over_limit)) + if (csize == PAGE_SIZE && ret) return CHARGE_RETRY; /* -- cgit v1.2.3 From 8493ae439f7038b502df1d687e61dde54c27ca92 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 1 Feb 2011 15:52:44 -0800 Subject: memcg: never OOM when charging huge pages Huge page coverage should obviously have less priority than the continued execution of a process. Never kill a process when charging it a huge page fails. Instead, give up after the first failed reclaim attempt and fall back to regular pages. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0e81eb5f0aea..fc75f34ba609 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2351,13 +2351,19 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, enum charge_type ctype) { struct mem_cgroup *mem = NULL; + int page_size = PAGE_SIZE; struct page_cgroup *pc; + bool oom = true; int ret; - int page_size = PAGE_SIZE; if (PageTransHuge(page)) { page_size <<= compound_order(page); VM_BUG_ON(!PageTransHuge(page)); + /* + * Never OOM-kill a process for a huge page. The + * fault handler will fall back to regular pages. + */ + oom = false; } pc = lookup_page_cgroup(page); @@ -2366,7 +2372,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, return 0; prefetchw(pc); - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size); if (ret || !mem) return ret; -- cgit v1.2.3 From 3751d60430fe4c26460a5ca8ad8672d32f93bcb1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 1 Feb 2011 15:52:45 -0800 Subject: memcg: fix event counting breakage from recent THP update Changes in e401f1761 ("memcg: modify accounting function for supporting THP better") adds nr_pages to support multiple page size in memory_cgroup_charge_statistics. But counting the number of event nees abs(nr_pages) for increasing counters. This patch fixes event counting. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Johannes Weiner Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fc75f34ba609..da53a252b259 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -612,8 +612,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); - else + else { __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); + nr_pages = -nr_pages; /* for event */ + } __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); -- cgit v1.2.3 From e6d2e2b2b1e1455df16d68a78f4a3874c7b3ad20 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 10 Feb 2011 15:01:30 -0800 Subject: memblock: don't adjust size in memblock_find_base() While applying patch to use memblock to find aperture for 64bit x86. Ingo found system with 1g + force_iommu > No AGP bridge found > Node 0: aperture @ 38000000 size 32 MB > Aperture pointing to e820 RAM. Ignoring. 
> Your BIOS doesn't leave a aperture memory hole > Please enable the IOMMU option in the BIOS setup > This costs you 64 MB of RAM > Cannot allocate aperture memory hole (0,65536K) the corresponding code: addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20); if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) { printk(KERN_ERR "Cannot allocate aperture memory hole (%lx,%uK)\n", addr, aper_size>>10); return 0; } memblock_x86_reserve_range(addr, addr + aper_size, "aperture64") fails because the memblock core code aligns the size to 512M. That could make the size way too big. So don't align the size in that case. Actually __memblock_alloc_base, the other caller, already aligns it before calling that function. BTW, x86 does not use __memblock_alloc_base... Signed-off-by: Yinghai Lu Cc: Ingo Molnar Cc: David Miller Cc: "H. Peter Anvin" Cc: Benjamin Herrenschmidt Cc: Dave Airlie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index bdba245d8afd..4618fda975a0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -137,8 +137,6 @@ static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size, BUG_ON(0 == size); - size = memblock_align_up(size, align); - /* Pump up max_addr */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE) end = memblock.current_limit; -- cgit v1.2.3 From e15f8c01af924e611bc7be1e45449c4a74e5dfdd Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 10 Feb 2011 15:01:32 -0800 Subject: mlock: fix race when munlocking pages in do_wp_page() vmscan can lazily find pages that are mapped within VM_LOCKED vmas, and set the PageMlocked bit on these pages, transferring them onto the unevictable list. When do_wp_page() breaks COW within a VM_LOCKED vma, it may need to clear PageMlocked on the old page and set it on the new page instead. This change fixes an issue where do_wp_page() was clearing PageMlocked on the old page while the pte was still pointing to it (as well as rmap). Therefore, we were not protected against vmscan immediately transferring the old page back onto the unevictable list. This could cause pages to get stranded there forever. I propose to move the corresponding code to the end of do_wp_page(), after the pte (and rmap) have been pointed to the new page. Additionally, we can use munlock_vma_page() instead of clear_page_mlock(), so that the old page stays mlocked if there are still other VM_LOCKED vmas mapping it. Signed-off-by: Michel Lespinasse Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Hugh Dickins Cc: Rik van Riel Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 31250faff390..32df03cf13a5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2219,7 +2219,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); - page_cache_release(old_page); goto unlock; } page_cache_release(old_page); @@ -2289,7 +2288,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); - page_cache_release(old_page); goto unlock; } @@ -2367,16 +2365,6 @@ gotten: } __SetPageUptodate(new_page); - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page.
- */ - if ((vma->vm_flags & VM_LOCKED) && old_page) { - lock_page(old_page); /* for LRU manipulation */ - clear_page_mlock(old_page); - unlock_page(old_page); - } - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; @@ -2444,10 +2432,20 @@ gotten: if (new_page) page_cache_release(new_page); - if (old_page) - page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); + if (old_page) { + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { + lock_page(old_page); /* LRU manipulation */ + munlock_vma_page(old_page); + unlock_page(old_page); + } + page_cache_release(old_page); + } return ret; oom_free_new: page_cache_release(new_page); -- cgit v1.2.3 From 419d8c96dbfa558f00e623023917d0a5afc46129 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 10 Feb 2011 15:01:33 -0800 Subject: mlock: do not munlock pages in __do_fault() If the page is going to be written to, __do_fault needs to break COW. However, the old page (before breaking COW) was never mapped into the current pte (__do_fault is only called when the pte is not present), so vmscan can't have marked the old page as PageMlocked due to being mapped in __do_fault's VMA. Therefore, __do_fault() does not need to worry about clearing PageMlocked() on the old page. Signed-off-by: Michel Lespinasse Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Hugh Dickins Cc: Rik van Riel Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 32df03cf13a5..8e8c18324863 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3051,12 +3051,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out; } charged = 1; - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if (vma->vm_flags & VM_LOCKED) - clear_page_mlock(vmf.page); copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { -- cgit v1.2.3 From f0fdc5e8e6f579310458aef43d1610a0bb5e81a4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 10 Feb 2011 15:01:34 -0800 Subject: vmscan: fix zone shrinking exit when scan work is done Commit 3e7d34497067 ("mm: vmscan: reclaim order-0 and use compaction instead of lumpy reclaim") introduced an indefinite loop in shrink_zone(). It meant to break out of this loop when no pages had been reclaimed and not a single page was even scanned. The way it would detect the latter is by taking a snapshot of sc->nr_scanned at the beginning of the function and comparing it against the new sc->nr_scanned after the scan loop. But it would re-iterate without updating that snapshot, looping forever if sc->nr_scanned changed at least once since shrink_zone() was invoked. This is not the sole condition that would exit that loop, but it requires other processes to change the zone state, as the reclaimer that is stuck obviously cannot anymore. This is only happening for higher-order allocations, where reclaim is run back to back with compaction.
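The point of the one-line move in the hunk below is that the nr_scanned snapshot must be retaken on every pass through the restart: label, so the "nothing reclaimed and nothing scanned" exit can ever become true. A compact user-space sketch of that difference (the counters, page numbers, and the pass cap are invented for the demo):

#include <stdbool.h>
#include <stdio.h>

struct scan_control { unsigned long nr_scanned; };

static int shrink_zone_demo(bool fresh_snapshot)
{
	struct scan_control sc = { .nr_scanned = 0 };
	unsigned long zone_pages = 96;			/* pages left to look at */
	unsigned long nr_reclaimed, nr_scanned = sc.nr_scanned;
	int passes = 0;

restart:
	nr_reclaimed = 0;
	if (fresh_snapshot)
		nr_scanned = sc.nr_scanned;		/* the fix: snapshot per restart */

	if (zone_pages) {				/* one pass: scan 32 pages, reclaim none */
		sc.nr_scanned += 32;
		zone_pages -= 32;
	}
	passes++;

	/* Keep going unless this pass neither reclaimed nor scanned anything. */
	if ((nr_reclaimed || sc.nr_scanned - nr_scanned) && passes < 10)
		goto restart;
	return passes;					/* the stale variant only stops at the demo cap */
}

int main(void)
{
	printf("stale snapshot: %d passes (would spin without the cap)\n", shrink_zone_demo(false));
	printf("fresh snapshot: %d passes\n", shrink_zone_demo(true));
	return 0;
}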
Signed-off-by: Johannes Weiner Reported-by: Michal Hocko Tested-by: Kent Overstreet Reported-by: Kent Overstreet Acked-by: Mel Gorman Cc: Andrea Arcangeli Cc: Rik van Riel Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 148c6e630df2..17497d0cd8b9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1882,12 +1882,12 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list l; - unsigned long nr_reclaimed; + unsigned long nr_reclaimed, nr_scanned; unsigned long nr_to_reclaim = sc->nr_to_reclaim; - unsigned long nr_scanned = sc->nr_scanned; restart: nr_reclaimed = 0; + nr_scanned = sc->nr_scanned; get_scan_count(zone, sc, nr, priority); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || -- cgit v1.2.3 From 678ff896a37afdbca292c7846ec895463aed35a5 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 10 Feb 2011 15:01:36 -0800 Subject: memcg: fix leak of accounting at failure path of hugepage collapsing mem_cgroup_uncharge_page() should be called in all failure cases after mem_cgroup_charge_newpage() is called in huge_memory.c::collapse_huge_page() [ 4209.076861] BUG: Bad page state in process khugepaged pfn:1e9800 [ 4209.077601] page:ffffea0006b14000 count:0 mapcount:0 mapping: (null) index:0x2800 [ 4209.078674] page flags: 0x40000000004000(head) [ 4209.079294] pc:ffff880214a30000 pc->flags:2146246697418756 pc->mem_cgroup:ffffc9000177a000 [ 4209.082177] (/A) [ 4209.082500] Pid: 31, comm: khugepaged Not tainted 2.6.38-rc3-mm1 #1 [ 4209.083412] Call Trace: [ 4209.083678] [] ? bad_page+0xe4/0x140 [ 4209.084240] [] ? free_pages_prepare+0xd6/0x120 [ 4209.084837] [] ? rwsem_down_failed_common+0xbd/0x150 [ 4209.085509] [] ? __free_pages_ok+0x32/0xe0 [ 4209.086110] [] ? free_compound_page+0x1b/0x20 [ 4209.086699] [] ? __put_compound_page+0x1c/0x30 [ 4209.087333] [] ? put_compound_page+0x4d/0x200 [ 4209.087935] [] ? put_page+0x45/0x50 [ 4209.097361] [] ? khugepaged+0x9e9/0x1430 [ 4209.098364] [] ? autoremove_wake_function+0x0/0x40 [ 4209.099121] [] ? khugepaged+0x0/0x1430 [ 4209.099780] [] ? kthread+0x96/0xa0 [ 4209.100452] [] ? kernel_thread_helper+0x4/0x10 [ 4209.101214] [] ? kthread+0x0/0xa0 [ 4209.101842] [] ? kernel_thread_helper+0x0/0x10 Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Daisuke Nishimura Reviewed-by: Johannes Weiner Cc: Andrea Arcangeli Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b6c1ce3c53b5..e62ddb8f24b6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1852,7 +1852,6 @@ static void collapse_huge_page(struct mm_struct *mm, set_pmd_at(mm, address, pmd, _pmd); spin_unlock(&mm->page_table_lock); anon_vma_unlock(vma->anon_vma); - mem_cgroup_uncharge_page(new_page); goto out; } @@ -1898,6 +1897,7 @@ out_up_write: return; out: + mem_cgroup_uncharge_page(new_page); #ifdef CONFIG_NUMA put_page(new_page); #endif -- cgit v1.2.3 From a7d6e4ecdb7648478ddec76d30d87d03d6e22b31 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 15 Feb 2011 19:02:45 +0100 Subject: thp: prevent hugepages during args/env copying into the user stack Transparent hugepages can only be created if rmap is fully functional. 
So we must prevent hugepages from being created while is_vma_temporary_stack() is true. This also optimizes away some harmless but unnecessary setting of khugepaged_scan.address and it switches some BUG_ON to VM_BUG_ON. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 3 ++- mm/huge_memory.c | 35 ++++++++++++++++------------------- 2 files changed, 18 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8e6c8c42bc3c..df29c8fde36b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -57,7 +57,8 @@ extern pmd_t *page_check_address_pmd(struct page *page, (transparent_hugepage_flags & \ (1<vm_flags & VM_HUGEPAGE))) && \ - !((__vma)->vm_flags & VM_NOHUGEPAGE) + !((__vma)->vm_flags & VM_NOHUGEPAGE) && \ + !is_vma_temporary_stack(__vma)) #define transparent_hugepage_defrag(__vma) \ ((transparent_hugepage_flags & \ (1<anon_vma || vma->vm_ops || vma->vm_file) goto out; + if (is_vma_temporary_stack(vma)) + goto out; VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); pgd = pgd_offset(mm, address); @@ -2032,32 +2034,27 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || (vma->vm_flags & VM_NOHUGEPAGE)) { + skip: progress++; continue; } - /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ - if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { - khugepaged_scan.address = vma->vm_end; - progress++; - continue; - } + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + goto skip; + if (is_vma_temporary_stack(vma)) + goto skip; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; - if (hstart >= hend) { - progress++; - continue; - } + if (hstart >= hend) + goto skip; + if (khugepaged_scan.address > hend) + goto skip; if (khugepaged_scan.address < hstart) khugepaged_scan.address = hstart; - if (khugepaged_scan.address > hend) { - khugepaged_scan.address = hend + HPAGE_PMD_SIZE; - progress++; - continue; - } - BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { int ret; @@ -2086,7 +2083,7 @@ breakouterloop: breakouterloop_mmap_sem: spin_lock(&khugepaged_mm_lock); - BUG_ON(khugepaged_scan.mm_slot != mm_slot); + VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); /* * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. @@ -2241,9 +2238,9 @@ static int khugepaged(void *none) for (;;) { mutex_unlock(&khugepaged_mutex); - BUG_ON(khugepaged_thread != current); + VM_BUG_ON(khugepaged_thread != current); khugepaged_loop(); - BUG_ON(khugepaged_thread != current); + VM_BUG_ON(khugepaged_thread != current); mutex_lock(&khugepaged_mutex); if (!khugepaged_enabled()) -- cgit v1.2.3 From 2aa15890f3c191326678f1bd68af61ec6b8753ec Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 23 Feb 2011 13:49:47 +0100 Subject: mm: prevent concurrent unmap_mapping_range() on the same inode Michael Leun reported that running parallel opens on a fuse filesystem can trigger a "kernel BUG at mm/truncate.c:475" Gurudas Pai reported the same bug on NFS. The reason is, unmap_mapping_range() is not prepared for more than one concurrent invocation per inode. 
For example: thread1: going through a big range, stops in the middle of a vma and stores the restart address in vm_truncate_count. thread2: comes in with a small (e.g. single page) unmap request on the same vma, somewhere before restart_address, finds that the vma was already unmapped up to the restart address and happily returns without doing anything. Another scenario would be two big unmap requests, both having to restart the unmapping and each one setting vm_truncate_count to its own value. This could go on forever without any of them being able to finish. Truncate and hole punching already serialize with i_mutex. Other callers of unmap_mapping_range() do not, and it's difficult to get i_mutex protection for all callers. In particular ->d_revalidate(), which calls invalidate_inode_pages2_range() in fuse, may be called with or without i_mutex. This patch adds a new mutex to 'struct address_space' to prevent running multiple concurrent unmap_mapping_range() on the same mapping. [ We'll hopefully get rid of all this with the upcoming mm preemptibility series by Peter Zijlstra, the "mm: Remove i_mmap_mutex lockbreak" patch in particular. But that is for 2.6.39 ] Signed-off-by: Miklos Szeredi Reported-by: Michael Leun Reported-by: Gurudas Pai Tested-by: Gurudas Pai Acked-by: Hugh Dickins Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/gfs2/main.c | 9 +-------- fs/inode.c | 22 +++++++++++++++------- fs/nilfs2/btnode.c | 5 ----- fs/nilfs2/btnode.h | 1 - fs/nilfs2/mdt.c | 4 ++-- fs/nilfs2/page.c | 13 ------------- fs/nilfs2/page.h | 1 - fs/nilfs2/super.c | 2 +- include/linux/fs.h | 2 ++ mm/memory.c | 2 ++ 10 files changed, 23 insertions(+), 38 deletions(-) (limited to 'mm') diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 85ba027d1c4d..72c31a315d96 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -59,14 +59,7 @@ static void gfs2_init_gl_aspace_once(void *foo) struct address_space *mapping = (struct address_space *)(gl + 1); gfs2_init_glock_once(gl); - memset(mapping, 0, sizeof(*mapping)); - INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); - spin_lock_init(&mapping->tree_lock); - spin_lock_init(&mapping->i_mmap_lock); - INIT_LIST_HEAD(&mapping->private_list); - spin_lock_init(&mapping->private_lock); - INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); - INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); + address_space_init_once(mapping); } /** diff --git a/fs/inode.c b/fs/inode.c index da85e56378f3..9c2b795ccc93 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -295,6 +295,20 @@ static void destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, i_callback); } +void address_space_init_once(struct address_space *mapping) +{ + memset(mapping, 0, sizeof(*mapping)); + INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); + spin_lock_init(&mapping->tree_lock); + spin_lock_init(&mapping->i_mmap_lock); + INIT_LIST_HEAD(&mapping->private_list); + spin_lock_init(&mapping->private_lock); + INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); + INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); + mutex_init(&mapping->unmap_mutex); +} +EXPORT_SYMBOL(address_space_init_once); + /* * These are initializations that only need to be done * once, because the fields are idempotent across use @@ -308,13 +322,7 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); - INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - spin_lock_init(&inode->i_data.tree_lock); - spin_lock_init(&inode->i_data.i_mmap_lock); - 
INIT_LIST_HEAD(&inode->i_data.private_list); - spin_lock_init(&inode->i_data.private_lock); - INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap); - INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear); + address_space_init_once(&inode->i_data); i_size_ordered_init(inode); #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&inode->i_fsnotify_marks); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 388e9e8f5286..85f7baa15f5d 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -35,11 +35,6 @@ #include "btnode.h" -void nilfs_btnode_cache_init_once(struct address_space *btnc) -{ - nilfs_mapping_init_once(btnc); -} - static const struct address_space_operations def_btnode_aops = { .sync_page = block_sync_page, }; diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 79037494f1e0..1b8ebd888c28 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt { struct buffer_head *newbh; }; -void nilfs_btnode_cache_init_once(struct address_space *); void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *); void nilfs_btnode_cache_clear(struct address_space *); struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 6a0e2a189f60..a0babd2bff6a 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -454,9 +454,9 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode, struct backing_dev_info *bdi = inode->i_sb->s_bdi; INIT_LIST_HEAD(&shadow->frozen_buffers); - nilfs_mapping_init_once(&shadow->frozen_data); + address_space_init_once(&shadow->frozen_data); nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops); - nilfs_mapping_init_once(&shadow->frozen_btnodes); + address_space_init_once(&shadow->frozen_btnodes); nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops); mi->mi_shadow = shadow; return 0; diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 0c432416cfef..a585b35fd6bc 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -492,19 +492,6 @@ unsigned nilfs_page_count_clean_buffers(struct page *page, return nc; } -void nilfs_mapping_init_once(struct address_space *mapping) -{ - memset(mapping, 0, sizeof(*mapping)); - INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); - spin_lock_init(&mapping->tree_lock); - INIT_LIST_HEAD(&mapping->private_list); - spin_lock_init(&mapping->private_lock); - - spin_lock_init(&mapping->i_mmap_lock); - INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); - INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); -} - void nilfs_mapping_init(struct address_space *mapping, struct backing_dev_info *bdi, const struct address_space_operations *aops) diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 622df27cd891..2a00953ebd5f 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -61,7 +61,6 @@ void nilfs_free_private_page(struct page *); int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_dirty_pages(struct address_space *); -void nilfs_mapping_init_once(struct address_space *mapping); void nilfs_mapping_init(struct address_space *mapping, struct backing_dev_info *bdi, const struct address_space_operations *aops); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 58fd707174e1..1673b3d99842 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1279,7 +1279,7 @@ static void nilfs_inode_init_once(void *obj) #ifdef CONFIG_NILFS_XATTR init_rwsem(&ii->xattr_sem); #endif - 
nilfs_btnode_cache_init_once(&ii->i_btnode_cache); + address_space_init_once(&ii->i_btnode_cache); ii->i_bmap = &ii->i_bmap_data; inode_init_once(&ii->vfs_inode); } diff --git a/include/linux/fs.h b/include/linux/fs.h index bd3215940c37..97d08d8a7de8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -649,6 +649,7 @@ struct address_space { spinlock_t private_lock; /* for use by the address_space */ struct list_head private_list; /* ditto */ struct address_space *assoc_mapping; /* ditto */ + struct mutex unmap_mutex; /* to protect unmapping */ } __attribute__((aligned(sizeof(long)))); /* * On most architectures that alignment is already the case; but @@ -2225,6 +2226,7 @@ extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); extern int inode_init_always(struct super_block *, struct inode *); extern void inode_init_once(struct inode *); +extern void address_space_init_once(struct address_space *mapping); extern void ihold(struct inode * inode); extern void iput(struct inode *); extern struct inode * igrab(struct inode *); diff --git a/mm/memory.c b/mm/memory.c index 8e8c18324863..5823698c2b71 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2648,6 +2648,7 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; details.i_mmap_lock = &mapping->i_mmap_lock; + mutex_lock(&mapping->unmap_mutex); spin_lock(&mapping->i_mmap_lock); /* Protect against endless unmapping loops */ @@ -2664,6 +2665,7 @@ void unmap_mapping_range(struct address_space *mapping, if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->unmap_mutex); } EXPORT_SYMBOL(unmap_mapping_range); -- cgit v1.2.3 From a3e8cc643d22d2c8ed36b9be7d9c9ca21efcf7f7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 23 Feb 2011 21:39:49 -0800 Subject: mm: fix possible cause of a page_mapped BUG Robert Swiecki reported a BUG_ON(page_mapped) from a fuzzer, punching a hole with madvise(,, MADV_REMOVE). That path is under mutex, and cannot be explained by lack of serialization in unmap_mapping_range(). Reviewing the code, I found one place where vm_truncate_count handling should have been updated, when I switched at the last minute from one way of managing the restart_addr to another: mremap move changes the virtual addresses, so it ought to adjust the restart_addr. But rather than exporting the notion of restart_addr from memory.c, or converting to restart_pgoff throughout, simply reset vm_truncate_count to 0 to force a rescan if mremap move races with preempted truncation. We have no confirmation that this fixes Robert's BUG, but it is a fix that's worth making anyway. 
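As a rough userspace illustration of that reasoning (struct area, move_area() and the addresses are made-up stand-ins, not kernel structures): a saved restart cursor is only meaningful in the address range it was recorded against, so the simplest safe response to a move is to zero it and let the next truncation pass rescan from the start.

#include <stdio.h>

struct area {
	unsigned long start, end;	/* stand-in for a mapping's virtual range */
	unsigned long restart;		/* saved progress of an interrupted scan, 0 = none */
};

/* Moving the mapping invalidates any cursor recorded against the old range. */
static void move_area(struct area *a, unsigned long new_start)
{
	unsigned long len = a->end - a->start;

	a->start = new_start;
	a->end = new_start + len;
	a->restart = 0;			/* force a full rescan rather than translating it */
}

int main(void)
{
	struct area a = { .start = 0x1000, .end = 0x5000, .restart = 0x1800 };

	move_area(&a, 0x9000);
	printf("rescan %#lx-%#lx from %#lx\n", a.start, a.end,
	       a.restart ? a.restart : a.start);
	return 0;
}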
Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/mremap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mremap.c b/mm/mremap.c index 9925b6391b80..1de98d492ddc 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -94,9 +94,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, */ mapping = vma->vm_file->f_mapping; spin_lock(&mapping->i_mmap_lock); - if (new_vma->vm_truncate_count && - new_vma->vm_truncate_count != vma->vm_truncate_count) - new_vma->vm_truncate_count = 0; + new_vma->vm_truncate_count = 0; } /* -- cgit v1.2.3 From 8074b26f67165bf045d92e778c9c10dc5e207fc6 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 24 Feb 2011 15:49:53 +0100 Subject: mm: fix refcounting in swapon Grab a reference to bdev before calling blkdev_get(), which expects the refcount to be already incremented and either returns success or decrements the refcount and returns an error. The bug was introduced by e525fd89 (block: make blkdev_get/put() handle exclusive access), which didn't take into account this behavior of blkdev_get(). Acked-by: Tejun Heo Signed-off-by: Miklos Szeredi Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 07a458d72fa8..0341c5700e34 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1940,7 +1940,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -EINVAL; if (S_ISBLK(inode->i_mode)) { - bdev = I_BDEV(inode); + bdev = bdgrab(I_BDEV(inode)); error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, sys_swapon); if (error < 0) { -- cgit v1.2.3 From a879bf582dfb3a79d30d76ca3af2ae8a0f39010c Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 25 Feb 2011 14:44:13 -0800 Subject: mm: grab rcu read lock in move_pages() The move_pages() usage of find_task_by_vpid() requires rcu_read_lock() to prevent free_pid() from reclaiming the pid. Without this patch, RCU warnings are printed in v2.6.38-rc4 move_pages() with: CONFIG_LOCKUP_DETECTOR=y CONFIG_PREEMPT=y CONFIG_LOCKDEP=y CONFIG_PROVE_LOCKING=y CONFIG_PROVE_RCU=y Previously, migrate_pages() went through a similar transformation replacing usage of tasklist_lock with rcu read lock: commit 55cfaa3cbdd29c4919ecb5fb8965c310f357e48c Author: Zeng Zhaoming Date: Thu Dec 2 14:31:13 2010 -0800 mm/mempolicy.c: add rcu read lock to protect pid structure commit 1e50df39f6e2c3a4a3394df62baa8a213df16c54 Author: KOSAKI Motohiro Date: Thu Jan 13 15:46:14 2011 -0800 mempolicy: remove tasklist_lock from migrate_pages Signed-off-by: Greg Thelen Cc: Mel Gorman Cc: Minchan Kim Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Cc: "Paul E. McKenney" Cc: Tetsuo Handa Cc: Sergey Senozhatsky Cc: Oleg Nesterov Cc: Zeng Zhaoming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 766115253807..352de555626c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1287,14 +1287,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, return -EPERM; /* Find the mm_struct */ - read_lock(&tasklist_lock); + rcu_read_lock(); task = pid ? 
find_task_by_vpid(pid) : current; if (!task) { - read_unlock(&tasklist_lock); + rcu_read_unlock(); return -ESRCH; } mm = get_task_mm(task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!mm) return -EINVAL; -- cgit v1.2.3 From 2876592f231d436c295b67726313f6f3cfb6e243 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 25 Feb 2011 14:44:20 -0800 Subject: mm: vmscan: stop reclaim/compaction earlier due to insufficient progress if !__GFP_REPEAT should_continue_reclaim() for reclaim/compaction allows scanning to continue even if pages are not being reclaimed until the full list is scanned. In terms of allocation success, this makes sense but potentially it introduces unwanted latency for high-order allocations such as transparent hugepages and network jumbo frames that would prefer to fail the allocation attempt and fallback to order-0 pages. Worse, there is a potential that the full LRU scan will clear all the young bits, distort page aging information and potentially push pages into swap that would have otherwise remained resident. This patch will stop reclaim/compaction if no pages were reclaimed in the last SWAP_CLUSTER_MAX pages that were considered. For allocations such as hugetlbfs that use __GFP_REPEAT and have fewer fallback options, the full LRU list may still be scanned. Order-0 allocation should not be affected because RECLAIM_MODE_COMPACTION is not set so the following avoids the gfp_mask being examined: if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) return false; A tool was developed based on ftrace that tracked the latency of high-order allocations while transparent hugepage support was enabled and three benchmarks were run. The "fix-infinite" figures are 2.6.38-rc4 with Johannes's patch "vmscan: fix zone shrinking exit when scan work is done" applied. STREAM Highorder Allocation Latency Statistics fix-infinite break-early 1 :: Count 10298 10229 1 :: Min 0.4560 0.4640 1 :: Mean 1.0589 1.0183 1 :: Max 14.5990 11.7510 1 :: Stddev 0.5208 0.4719 2 :: Count 2 1 2 :: Min 1.8610 3.7240 2 :: Mean 3.4325 3.7240 2 :: Max 5.0040 3.7240 2 :: Stddev 1.5715 0.0000 9 :: Count 111696 111694 9 :: Min 0.5230 0.4110 9 :: Mean 10.5831 10.5718 9 :: Max 38.4480 43.2900 9 :: Stddev 1.1147 1.1325 Mean time for order-1 allocations is reduced. order-2 looks increased but with so few allocations, it's not particularly significant. THP mean allocation latency is also reduced. That said, allocation time varies so significantly that the reductions are within noise. Max allocation time is reduced by a significant amount for low-order allocations but reduced for THP allocations which presumably are now breaking before reclaim has done enough work. SysBench Highorder Allocation Latency Statistics fix-infinite break-early 1 :: Count 15745 15677 1 :: Min 0.4250 0.4550 1 :: Mean 1.1023 1.0810 1 :: Max 14.4590 10.8220 1 :: Stddev 0.5117 0.5100 2 :: Count 1 1 2 :: Min 3.0040 2.1530 2 :: Mean 3.0040 2.1530 2 :: Max 3.0040 2.1530 2 :: Stddev 0.0000 0.0000 9 :: Count 2017 1931 9 :: Min 0.4980 0.7480 9 :: Mean 10.4717 10.3840 9 :: Max 24.9460 26.2500 9 :: Stddev 1.1726 1.1966 Again, mean time for order-1 allocations is reduced while order-2 allocations are too few to draw conclusions from. The mean time for THP allocations is also slightly reduced albeit the reductions are within varianes. Once again, our maximum allocation time is significantly reduced for low-order allocations and slightly increased for THP allocations. 
Anon stream mmap reference Highorder Allocation Latency Statistics 1 :: Count 1376 1790 1 :: Min 0.4940 0.5010 1 :: Mean 1.0289 0.9732 1 :: Max 6.2670 4.2540 1 :: Stddev 0.4142 0.2785 2 :: Count 1 - 2 :: Min 1.9060 - 2 :: Mean 1.9060 - 2 :: Max 1.9060 - 2 :: Stddev 0.0000 - 9 :: Count 11266 11257 9 :: Min 0.4990 0.4940 9 :: Mean 27250.4669 24256.1919 9 :: Max 11439211.0000 6008885.0000 9 :: Stddev 226427.4624 186298.1430 This benchmark creates one thread per CPU which references an amount of anonymous memory 1.5 times the size of physical RAM. This pounds swap quite heavily and is intended to exercise THP a bit. Mean allocation time for order-1 is reduced as before. It's also reduced for THP allocations but the variations here are pretty massive due to swap. As before, maximum allocation times are significantly reduced. Overall, the patch reduces the mean and maximum allocation latencies for the smaller high-order allocations. This was with Slab configured so it would be expected to be more significant with Slub which uses these size allocations more aggressively. The mean allocation times for THP allocations are also slightly reduced. The maximum latency was slightly increased as predicted by the comments due to reclaim/compaction breaking early. However, workloads care more about the latency of lower-order allocations than THP so it's an acceptable trade-off. Signed-off-by: Mel Gorman Acked-by: Andrea Arcangeli Acked-by: Johannes Weiner Reviewed-by: Minchan Kim Acked-by: Andrea Arcangeli Acked-by: Rik van Riel Cc: Michal Hocko Cc: Kent Overstreet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 17497d0cd8b9..6771ea70bfe7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1841,16 +1841,28 @@ static inline bool should_continue_reclaim(struct zone *zone, if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) return false; - /* - * If we failed to reclaim and have scanned the full list, stop. - * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far - * faster but obviously would be less likely to succeed - * allocation. If this is desirable, use GFP_REPEAT to decide - * if both reclaimed and scanned should be checked or just - * reclaimed - */ - if (!nr_reclaimed && !nr_scanned) - return false; + /* Consider stopping depending on scan and reclaim activity */ + if (sc->gfp_mask & __GFP_REPEAT) { + /* + * For __GFP_REPEAT allocations, stop reclaiming if the + * full LRU list has been scanned and we are still failing + * to reclaim pages. This full LRU scan is potentially + * expensive but a __GFP_REPEAT caller really wants to succeed + */ + if (!nr_reclaimed && !nr_scanned) + return false; + } else { + /* + * For non-__GFP_REPEAT allocations which can presumably + * fail without consequence, stop if we failed to reclaim + * any pages from the last SWAP_CLUSTER_MAX number of + * pages that were scanned. This will return to the + * caller faster at the risk reclaim/compaction and + * the resulting allocation attempt fails + */ + if (!nr_reclaimed) + return false; + } /* * If we have not reclaimed enough pages for compaction and the -- cgit v1.2.3 From 29723fccc837d20039078f7a571e8d457eb0d6c6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 25 Feb 2011 14:44:25 -0800 Subject: mm: fix dubious code in __count_immobile_pages() When pfn_valid_within() failed 'iter' was incremented twice. 
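The off-by-one is easy to reproduce in plain C. The sketch below is a generic loop, not the page-allocator code; valid() and the element counts are invented. Because continue already jumps to the for-loop increment, the extra iter++ in the body silently skips the element after every invalid one.

#include <stdio.h>

static int valid(int i)
{
	return i % 3 != 0;	/* pretend every third entry is invalid */
}

int main(void)
{
	int checked_buggy = 0, checked_fixed = 0;
	int iter;

	for (iter = 0; iter < 12; iter++) {
		if (!valid(iter)) {
			iter++;		/* buggy: skips the next element as well */
			continue;
		}
		checked_buggy++;
	}

	for (iter = 0; iter < 12; iter++) {
		if (!valid(iter))
			continue;	/* fixed: only the invalid element is skipped */
		checked_fixed++;
	}

	printf("buggy checked %d, fixed checked %d of 8 valid entries\n",
	       checked_buggy, checked_fixed);
	return 0;
}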
Signed-off-by: Namhyung Kim Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a873e61e312e..cdef1d4b4e47 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5376,10 +5376,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { unsigned long check = pfn + iter; - if (!pfn_valid_within(check)) { - iter++; + if (!pfn_valid_within(check)) continue; - } + page = pfn_to_page(check); if (!page_count(page)) { if (PageBuddy(page)) -- cgit v1.2.3 From 8eac563c1c3a2047083022357ae63722b19e4e08 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 25 Feb 2011 14:44:28 -0800 Subject: thp: fix interleaving for transparent hugepages The THP code didn't pass the correct interleaving shift to the memory policy code. Fix this here by adjusting for the order. Signed-off-by: Andi Kleen Reviewed-by: Christoph Lameter Acked-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 368fc9d23610..49355a970be2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1830,7 +1830,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); put_mems_allowed(); -- cgit v1.2.3 From e5598f8bf5449bc09e4005600ead32e6f2a3e79b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 25 Feb 2011 14:44:29 -0800 Subject: memcg: more mem_cgroup_uncharge() batching It seems odd that truncate_inode_pages_range(), called not only when truncating but also when evicting inodes, has mem_cgroup_uncharge_start and _end() batching in its second loop to clear up a few leftovers, but not in its first loop that does almost all the work: add them there too. Signed-off-by: Hugh Dickins Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Acked-by: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index 49feb46e77b8..d64296be00d3 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -225,6 +225,7 @@ void truncate_inode_pages_range(struct address_space *mapping, next = start; while (next <= end && pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; pgoff_t page_index = page->index; @@ -247,6 +248,7 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); } pagevec_release(&pvec); + mem_cgroup_uncharge_end(); cond_resched(); } -- cgit v1.2.3 From 2f5f9486f8c12e3aa40fe3775a18cb14efc5cea2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 4 Mar 2011 17:36:29 -0800 Subject: mm: change alloc_pages_vma to pass down the policy node for local policy Currently alloc_pages_vma() always uses the local node as policy node for the LOCAL policy. Pass this node down as an argument instead. No behaviour change from this patch, but will be needed for followons. 
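The shape of the refactoring can be sketched in userspace C; pick_node_old(), pick_node_new() and local_node_id() are hypothetical stand-ins for the mempolicy helpers, with local_node_id() playing the role of numa_node_id(). The node decision moves from inside the callee to its callers, and existing callers pass the previous default so behaviour is unchanged.

#include <stdio.h>

static int local_node_id(void)
{
	return 0;	/* pretend we always run on node 0 */
}

/* before: the policy node was hard-wired to the local node */
static int pick_node_old(void)
{
	return local_node_id();
}

/* after: the decision is made by the caller and threaded through */
static int pick_node_new(int node)
{
	return node;
}

int main(void)
{
	/* existing call sites keep the old behaviour... */
	printf("default caller: %d vs %d\n", pick_node_old(),
	       pick_node_new(local_node_id()));
	/* ...while new callers (e.g. a background daemon) can pin a node */
	printf("pinned caller : %d\n", pick_node_new(1));
	return 0;
}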
Acked-by: Andrea Arcangeli Signed-off-by: Andi Kleen Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 9 +++++---- mm/huge_memory.c | 2 +- mm/mempolicy.c | 11 +++++------ 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0b84c61607e8..37b8af5db091 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -332,16 +332,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) return alloc_pages_current(gfp_mask, order); } extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, - struct vm_area_struct *vma, unsigned long addr); + struct vm_area_struct *vma, unsigned long addr, + int node); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_pages_vma(gfp_mask, order, vma, addr) \ +#define alloc_pages_vma(gfp_mask, order, vma, addr, node) \ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) -#define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr) +#define alloc_page_vma(gfp_mask, vma, addr) \ + alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3e29781ee762..c7c2cd925599 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -653,7 +653,7 @@ static inline struct page *alloc_hugepage_vma(int defrag, unsigned long haddr) { return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), - HPAGE_PMD_ORDER, vma, haddr); + HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); } #ifndef CONFIG_NUMA diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 49355a970be2..25a5a9146619 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) } /* Return a zonelist indicated by gfp for node representing a mempolicy */ -static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) +static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, + int nd) { - int nd = numa_node_id(); - switch (policy->mode) { case MPOL_PREFERRED: if (!(policy->flags & MPOL_F_LOCAL)) @@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, zl = node_zonelist(interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))), gfp_flags); } else { - zl = policy_zonelist(gfp_flags, *mpol); + zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); if ((*mpol)->mode == MPOL_BIND) *nodemask = &(*mpol)->v.nodes; } @@ -1820,7 +1819,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, int node) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; @@ -1836,7 +1835,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, put_mems_allowed(); return page; } - zl = policy_zonelist(gfp, pol); + zl = policy_zonelist(gfp, pol, node); if (unlikely(mpol_needs_cond_ref(pol))) { /* * slow path: ref counted shared policy -- cgit v1.2.3 From 19ee151e140daa5183c4984981801e542e0544fb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 4 Mar 2011 17:36:31 -0800 Subject: mm: preserve original node for transparent huge page copies This makes a difference for LOCAL 
policy, where the node cannot be determined from the policy itself, but has to be gotten from the original page. Acked-by: Andrea Arcangeli Signed-off-by: Andi Kleen Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c7c2cd925599..1802db819e28 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -799,8 +799,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, } for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, - vma, address); + pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, + vma, address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_newpage_charge(pages[i], mm, GFP_KERNEL))) { -- cgit v1.2.3 From 5c4b4be3b6b937256103a5ae49177e0c3a17cb8f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 4 Mar 2011 17:36:32 -0800 Subject: mm: use correct numa policy node for transparent hugepages Pass down the correct node for a transparent hugepage allocation. Most callers continue to use the current node, however the hugepaged daemon now uses the previous node of the first to be collapsed page instead. This ensures that khugepaged does not mess up local memory for an existing process which uses local policy. The choice of node is somewhat primitive currently: it just uses the node of the first page in the pmd range. An alternative would be to look at multiple pages and use the most popular node. I used the simplest variant for now which should work well enough for the case of all pages being on the same node. [akpm@linux-foundation.org: coding-style fixes] Acked-by: Andrea Arcangeli Signed-off-by: Andi Kleen Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 24 +++++++++++++++++------- mm/mempolicy.c | 3 ++- 2 files changed, 19 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1802db819e28..dbe99a5f2073 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -650,10 +650,10 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag) static inline struct page *alloc_hugepage_vma(int defrag, struct vm_area_struct *vma, - unsigned long haddr) + unsigned long haddr, int nd) { return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), - HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); + HPAGE_PMD_ORDER, vma, haddr, nd); } #ifndef CONFIG_NUMA @@ -678,7 +678,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(khugepaged_enter(vma))) return VM_FAULT_OOM; page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr); + vma, haddr, numa_node_id()); if (unlikely(!page)) goto out; if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { @@ -902,7 +902,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr); + vma, haddr, numa_node_id()); else new_page = NULL; @@ -1745,7 +1745,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + int node) { pgd_t *pgd; pud_t *pud; @@ -1773,7 +1774,8 @@ static void collapse_huge_page(struct mm_struct *mm, 
* mmap_sem in read mode is good idea also to allow greater * scalability. */ - new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, + node); if (unlikely(!new_page)) { up_read(&mm->mmap_sem); *hpage = ERR_PTR(-ENOMEM); @@ -1919,6 +1921,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct page *page; unsigned long _address; spinlock_t *ptl; + int node = -1; VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -1949,6 +1952,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, page = vm_normal_page(vma, _address, pteval); if (unlikely(!page)) goto out_unmap; + /* + * Chose the node of the first page. This could + * be more sophisticated and look at more pages, + * but isn't for now. + */ + if (node == -1) + node = page_to_nid(page); VM_BUG_ON(PageCompound(page)); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) goto out_unmap; @@ -1965,7 +1975,7 @@ out_unmap: pte_unmap_unlock(pte, ptl); if (ret) /* collapse_huge_page will return with the mmap_sem released */ - collapse_huge_page(mm, address, hpage, vma); + collapse_huge_page(mm, address, hpage, vma, node); out: return ret; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 25a5a9146619..b53ec99f1428 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1891,7 +1891,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); + policy_zonelist(gfp, pol, numa_node_id()), + policy_nodemask(gfp, pol)); put_mems_allowed(); return page; } -- cgit v1.2.3 From ab9a0f196f2f4f080df54402493ea3dc31b5243e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 10 Mar 2011 15:21:48 +0800 Subject: slub: automatically reserve bytes at the end of slab There is no "struct" for slub's slab, it shares with struct page. But struct page is very small, it is insufficient when we need to add some metadata for slab. So we add a field "reserved" to struct kmem_cache, when a slab is allocated, kmem_cache->reserved bytes are automatically reserved at the end of the slab for slab's metadata. Changed from v1: Export the reserved field via sysfs Acked-by: Christoph Lameter Signed-off-by: Lai Jiangshan Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 1 + mm/slub.c | 47 ++++++++++++++++++++++++++++++----------------- 2 files changed, 31 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 8b6e8ae5d5ca..ae0093cc5189 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -83,6 +83,7 @@ struct kmem_cache { void (*ctor)(void *); int inuse; /* Offset to metadata */ int align; /* Alignment */ + int reserved; /* Reserved bytes at the end of slabs */ unsigned long min_partial; const char *name; /* Name (only for display!) 
*/ struct list_head list; /* List of slab caches */ diff --git a/mm/slub.c b/mm/slub.c index e15aa7f193c9..d3d17677bab5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -281,11 +281,16 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } +static inline int order_objects(int order, unsigned long size, int reserved) +{ + return ((PAGE_SIZE << order) - reserved) / size; +} + static inline struct kmem_cache_order_objects oo_make(int order, - unsigned long size) + unsigned long size, int reserved) { struct kmem_cache_order_objects x = { - (order << OO_SHIFT) + (PAGE_SIZE << order) / size + (order << OO_SHIFT) + order_objects(order, size, reserved) }; return x; @@ -617,7 +622,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - length = (PAGE_SIZE << compound_order(page)); + length = (PAGE_SIZE << compound_order(page)) - s->reserved; end = start + length; remainder = length % s->size; if (!remainder) @@ -698,7 +703,7 @@ static int check_slab(struct kmem_cache *s, struct page *page) return 0; } - maxobj = (PAGE_SIZE << compound_order(page)) / s->size; + maxobj = order_objects(compound_order(page), s->size, s->reserved); if (page->objects > maxobj) { slab_err(s, page, "objects %u > max %u", s->name, page->objects, maxobj); @@ -748,7 +753,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - max_objects = (PAGE_SIZE << compound_order(page)) / s->size; + max_objects = order_objects(compound_order(page), s->size, s->reserved); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -1988,13 +1993,13 @@ static int slub_nomerge; * the smallest order which will fit the object. */ static inline int slab_order(int size, int min_objects, - int max_order, int fract_leftover) + int max_order, int fract_leftover, int reserved) { int order; int rem; int min_order = slub_min_order; - if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) + if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; for (order = max(min_order, @@ -2003,10 +2008,10 @@ static inline int slab_order(int size, int min_objects, unsigned long slab_size = PAGE_SIZE << order; - if (slab_size < min_objects * size) + if (slab_size < min_objects * size + reserved) continue; - rem = slab_size % size; + rem = (slab_size - reserved) % size; if (rem <= slab_size / fract_leftover) break; @@ -2016,7 +2021,7 @@ static inline int slab_order(int size, int min_objects, return order; } -static inline int calculate_order(int size) +static inline int calculate_order(int size, int reserved) { int order; int min_objects; @@ -2034,14 +2039,14 @@ static inline int calculate_order(int size) min_objects = slub_min_objects; if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); - max_objects = (PAGE_SIZE << slub_max_order)/size; + max_objects = order_objects(slub_max_order, size, reserved); min_objects = min(min_objects, max_objects); while (min_objects > 1) { fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, - slub_max_order, fraction); + slub_max_order, fraction, reserved); if (order <= slub_max_order) return order; fraction /= 2; @@ -2053,14 +2058,14 @@ static inline int calculate_order(int size) * We were unable to place multiple objects in a slab. Now * lets see if we can place a single object there. 
*/ - order = slab_order(size, 1, slub_max_order, 1); + order = slab_order(size, 1, slub_max_order, 1, reserved); if (order <= slub_max_order) return order; /* * Doh this slab cannot be placed using slub_max_order. */ - order = slab_order(size, 1, MAX_ORDER, 1); + order = slab_order(size, 1, MAX_ORDER, 1, reserved); if (order < MAX_ORDER) return order; return -ENOSYS; @@ -2311,7 +2316,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) if (forced_order >= 0) order = forced_order; else - order = calculate_order(size); + order = calculate_order(size, s->reserved); if (order < 0) return 0; @@ -2329,8 +2334,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) /* * Determine the number of objects per slab */ - s->oo = oo_make(order, size); - s->min = oo_make(get_order(size), size); + s->oo = oo_make(order, size, s->reserved); + s->min = oo_make(get_order(size), size, s->reserved); if (oo_objects(s->oo) > oo_objects(s->max)) s->max = s->oo; @@ -2349,6 +2354,7 @@ static int kmem_cache_open(struct kmem_cache *s, s->objsize = size; s->align = align; s->flags = kmem_cache_flags(size, flags, name, ctor); + s->reserved = 0; if (!calculate_sizes(s, -1)) goto error; @@ -4017,6 +4023,12 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(destroy_by_rcu); +static ssize_t reserved_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->reserved); +} +SLAB_ATTR_RO(reserved); + #ifdef CONFIG_SLUB_DEBUG static ssize_t slabs_show(struct kmem_cache *s, char *buf) { @@ -4303,6 +4315,7 @@ static struct attribute *slab_attrs[] = { &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, &shrink_attr.attr, + &reserved_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, &slabs_attr.attr, -- cgit v1.2.3 From da9a638c6f8fc0633fa94a334f1c053f5e307177 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 10 Mar 2011 15:22:00 +0800 Subject: slub,rcu: don't assume the size of struct rcu_head The size of struct rcu_head may be changed. When it becomes larger, it will pollute the page array. We reserve some some bytes for struct rcu_head when a slab is allocated in this situation. 
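A rough userspace sketch of the two mechanisms involved follows; the struct layouts and names are invented, and only objects_per_slab() mirrors the order_objects() arithmetic from the patch above. If the callback head no longer fits in the field it used to overlay, its size is carved off the end of the slab before computing how many objects fit.

#include <stdio.h>
#include <stddef.h>

struct fake_list_head { void *next, *prev; };
struct fake_rcu_head { void *next; void (*func)(struct fake_rcu_head *); void *pad; };

#define PAGE_SZ 4096UL

/* mirrors order_objects(): usable bytes divided by object size */
static unsigned long objects_per_slab(int order, size_t size, size_t reserved)
{
	return ((PAGE_SZ << order) - reserved) / size;
}

int main(void)
{
	size_t reserved = 0;

	if (sizeof(struct fake_list_head) < sizeof(struct fake_rcu_head))
		reserved = sizeof(struct fake_rcu_head);	/* can't overlay: reserve tail bytes */

	printf("reserved %zu bytes, %lu vs %lu objects per order-0 slab\n",
	       reserved,
	       objects_per_slab(0, 64, 0),
	       objects_per_slab(0, 64, reserved));
	return 0;
}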
Changed from V1: use VM_BUG_ON instead BUG_ON Acked-by: Christoph Lameter Signed-off-by: Lai Jiangshan Signed-off-by: Pekka Enberg --- mm/slub.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index d3d17677bab5..ebba3eb19369 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1254,21 +1254,38 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __free_pages(page, order); } +#define need_reserve_slab_rcu \ + (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) + static void rcu_free_slab(struct rcu_head *h) { struct page *page; - page = container_of((struct list_head *)h, struct page, lru); + if (need_reserve_slab_rcu) + page = virt_to_head_page(h); + else + page = container_of((struct list_head *)h, struct page, lru); + __free_slab(page->slab, page); } static void free_slab(struct kmem_cache *s, struct page *page) { if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { - /* - * RCU free overloads the RCU head over the LRU - */ - struct rcu_head *head = (void *)&page->lru; + struct rcu_head *head; + + if (need_reserve_slab_rcu) { + int order = compound_order(page); + int offset = (PAGE_SIZE << order) - s->reserved; + + VM_BUG_ON(s->reserved != sizeof(*head)); + head = page_address(page) + offset; + } else { + /* + * RCU free overloads the RCU head over the LRU + */ + head = (void *)&page->lru; + } call_rcu(head, rcu_free_slab); } else @@ -2356,6 +2373,9 @@ static int kmem_cache_open(struct kmem_cache *s, s->flags = kmem_cache_flags(size, flags, name, ctor); s->reserved = 0; + if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) + s->reserved = sizeof(struct rcu_head); + if (!calculate_sizes(s, -1)) goto error; if (disable_higher_order_debug) { -- cgit v1.2.3 From 5bfe53a77e8a3ffce4a10003c75f464a138e272d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 10 Mar 2011 15:22:24 +0800 Subject: slab,rcu: don't assume the size of struct rcu_head The size of struct rcu_head may be changed. When it becomes larger, it may pollute the data after struct slab. Acked-by: Christoph Lameter Signed-off-by: Lai Jiangshan Signed-off-by: Pekka Enberg --- mm/slab.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 37961d1f584f..52cf0b4634d4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -190,22 +190,6 @@ typedef unsigned int kmem_bufctl_t; #define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) -/* - * struct slab - * - * Manages the objs in a slab. Placed either at the beginning of mem allocated - * for a slab, or allocated from an general cache. - * Slabs are chained into three list: fully used, partial, fully free slabs. - */ -struct slab { - struct list_head list; - unsigned long colouroff; - void *s_mem; /* including colour offset */ - unsigned int inuse; /* num of objs active in slab */ - kmem_bufctl_t free; - unsigned short nodeid; -}; - /* * struct slab_rcu * @@ -219,8 +203,6 @@ struct slab { * * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. - * - * We assume struct slab_rcu can overlay struct slab when destroying. */ struct slab_rcu { struct rcu_head head; @@ -228,6 +210,27 @@ struct slab_rcu { void *addr; }; +/* + * struct slab + * + * Manages the objs in a slab. Placed either at the beginning of mem allocated + * for a slab, or allocated from an general cache. 
+ * Slabs are chained into three list: fully used, partial, fully free slabs. + */ +struct slab { + union { + struct { + struct list_head list; + unsigned long colouroff; + void *s_mem; /* including colour offset */ + unsigned int inuse; /* num of objs active in slab */ + kmem_bufctl_t free; + unsigned short nodeid; + }; + struct slab_rcu __slab_cover_slab_rcu; + }; +}; + /* * struct array_cache * -- cgit v1.2.3
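The same idea reduced to a userspace sketch; demo_slab and demo_rcu are stand-ins, not the kernel structures. Putting both layouts in one union removes the size assumption entirely, because the compiler sizes the container for whichever member is larger.

#include <stdio.h>

struct demo_rcu {
	void *next;
	void (*func)(struct demo_rcu *);
};

struct demo_slab {
	union {
		struct {			/* anonymous struct member (C11) */
			void *mem;
			unsigned int inuse;
			unsigned short nodeid;
		};
		struct demo_rcu rcu;		/* may freely grow; the union absorbs it */
	};
};

int main(void)
{
	printf("slab management struct is %zu bytes (rcu part %zu)\n",
	       sizeof(struct demo_slab), sizeof(struct demo_rcu));
	return 0;
}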