Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c     177
-rw-r--r--  mm/madvise.c     103
-rw-r--r--  mm/memory.c       57
-rw-r--r--  mm/mempolicy.c   110
-rw-r--r--  mm/mmap.c         57
-rw-r--r--  mm/msync.c         2
-rw-r--r--  mm/nommu.c         2
-rw-r--r--  mm/oom_kill.c      7
-rw-r--r--  mm/page_alloc.c  423
-rw-r--r--  mm/rmap.c         21
-rw-r--r--  mm/shmem.c       143
-rw-r--r--  mm/slab.c          1
-rw-r--r--  mm/swapfile.c     55
-rw-r--r--  mm/vmscan.c      103
14 files changed, 918 insertions, 343 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4eb5ae3fbe10..fbd1111ea119 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7,10 +7,14 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
-#include <linux/hugetlb.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
+#include <linux/pagemap.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+#include <linux/hugetlb.h>
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages;
@@ -249,6 +253,72 @@ struct vm_operations_struct hugetlb_vm_ops = {
.nopage = hugetlb_nopage,
};
+static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
+{
+ pte_t entry;
+
+ if (vma->vm_flags & VM_WRITE) {
+ entry =
+ pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+ } else {
+ entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+ }
+ entry = pte_mkyoung(entry);
+ entry = pte_mkhuge(entry);
+
+ return entry;
+}
+
+int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
+ struct vm_area_struct *vma)
+{
+ pte_t *src_pte, *dst_pte, entry;
+ struct page *ptepage;
+ unsigned long addr = vma->vm_start;
+ unsigned long end = vma->vm_end;
+
+ while (addr < end) {
+ dst_pte = huge_pte_alloc(dst, addr);
+ if (!dst_pte)
+ goto nomem;
+ src_pte = huge_pte_offset(src, addr);
+ BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */
+ entry = *src_pte;
+ ptepage = pte_page(entry);
+ get_page(ptepage);
+ add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
+ set_huge_pte_at(dst, addr, dst_pte, entry);
+ addr += HPAGE_SIZE;
+ }
+ return 0;
+
+nomem:
+ return -ENOMEM;
+}
+
+void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long address;
+ pte_t pte;
+ struct page *page;
+
+ WARN_ON(!is_vm_hugetlb_page(vma));
+ BUG_ON(start & ~HPAGE_MASK);
+ BUG_ON(end & ~HPAGE_MASK);
+
+ for (address = start; address < end; address += HPAGE_SIZE) {
+ pte = huge_ptep_get_and_clear(mm, address, huge_pte_offset(mm, address));
+ if (pte_none(pte))
+ continue;
+ page = pte_page(pte);
+ put_page(page);
+ }
+ add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
+ flush_tlb_range(vma, start, end);
+}
+
void zap_hugepage_range(struct vm_area_struct *vma,
unsigned long start, unsigned long length)
{
@@ -258,3 +328,108 @@ void zap_hugepage_range(struct vm_area_struct *vma,
unmap_hugepage_range(vma, start, start + length);
spin_unlock(&mm->page_table_lock);
}
+
+int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr;
+ int ret = 0;
+
+ WARN_ON(!is_vm_hugetlb_page(vma));
+ BUG_ON(vma->vm_start & ~HPAGE_MASK);
+ BUG_ON(vma->vm_end & ~HPAGE_MASK);
+
+ hugetlb_prefault_arch_hook(mm);
+
+ spin_lock(&mm->page_table_lock);
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+ unsigned long idx;
+ pte_t *pte = huge_pte_alloc(mm, addr);
+ struct page *page;
+
+ if (!pte) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (! pte_none(*pte))
+ hugetlb_clean_stale_pgtable(pte);
+
+ idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+ page = find_get_page(mapping, idx);
+ if (!page) {
+ /* charge the fs quota first */
+ if (hugetlb_get_quota(mapping)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ page = alloc_huge_page();
+ if (!page) {
+ hugetlb_put_quota(mapping);
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
+ if (! ret) {
+ unlock_page(page);
+ } else {
+ hugetlb_put_quota(mapping);
+ free_huge_page(page);
+ goto out;
+ }
+ }
+ add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
+ set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
+ }
+out:
+ spin_unlock(&mm->page_table_lock);
+ return ret;
+}
+
+int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page **pages, struct vm_area_struct **vmas,
+ unsigned long *position, int *length, int i)
+{
+ unsigned long vpfn, vaddr = *position;
+ int remainder = *length;
+
+ BUG_ON(!is_vm_hugetlb_page(vma));
+
+ vpfn = vaddr/PAGE_SIZE;
+ while (vaddr < vma->vm_end && remainder) {
+
+ if (pages) {
+ pte_t *pte;
+ struct page *page;
+
+ /* Some archs (sparc64, sh*) have multiple
+ * pte_ts to each hugepage. We have to make
+ * sure we get the first, for the page
+ * indexing below to work. */
+ pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
+
+ /* hugetlb should be locked, and hence, prefaulted */
+ WARN_ON(!pte || pte_none(*pte));
+
+ page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+
+ WARN_ON(!PageCompound(page));
+
+ get_page(page);
+ pages[i] = page;
+ }
+
+ if (vmas)
+ vmas[i] = vma;
+
+ vaddr += PAGE_SIZE;
+ ++vpfn;
+ --remainder;
+ ++i;
+ }
+
+ *length = remainder;
+ *position = vaddr;
+
+ return i;
+}
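
The index arithmetic in hugetlb_prefault() above maps a faulting address back to a huge-page-sized slot in the backing file. A minimal userspace sketch of that calculation, assuming 4 KiB base pages and 2 MiB huge pages; the constants and the main() driver are illustrative only, not part of the patch:

/* Sketch of the file index arithmetic used in hugetlb_prefault().
 * Assumes 4 KiB base pages and 2 MiB huge pages; the kernel takes these
 * shifts from the architecture headers. */
#include <stdio.h>

#define PAGE_SHIFT   12UL
#define HPAGE_SHIFT  21UL

/* vm_pgoff is the mapping's file offset in units of small pages;
 * the huge page cache is indexed in units of huge pages. */
static unsigned long huge_page_index(unsigned long addr,
                                     unsigned long vm_start,
                                     unsigned long vm_pgoff)
{
        return ((addr - vm_start) >> HPAGE_SHIFT)
                + (vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
}

int main(void)
{
        /* Mapping starts at file offset 4 MiB (pgoff 1024 small pages). */
        unsigned long vm_start = 0x40000000UL, vm_pgoff = 1024;
        unsigned long addr;

        for (addr = vm_start; addr < vm_start + 3 * (1UL << HPAGE_SHIFT);
             addr += 1UL << HPAGE_SHIFT)
                printf("addr %#lx -> huge page index %lu\n",
                       addr, huge_page_index(addr, vm_start, vm_pgoff));
        return 0;
}
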
diff --git a/mm/madvise.c b/mm/madvise.c
index 944b5e52d812..e3108054733c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -8,17 +8,47 @@
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
+#include <linux/mempolicy.h>
#include <linux/hugetlb.h>
/*
* We can potentially split a vm area into separate
* areas, each area with its own behavior.
*/
-static long madvise_behavior(struct vm_area_struct * vma, unsigned long start,
- unsigned long end, int behavior)
+static long madvise_behavior(struct vm_area_struct * vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end, int behavior)
{
struct mm_struct * mm = vma->vm_mm;
int error = 0;
+ pgoff_t pgoff;
+ int new_flags = vma->vm_flags & ~VM_READHINTMASK;
+
+ switch (behavior) {
+ case MADV_SEQUENTIAL:
+ new_flags |= VM_SEQ_READ;
+ break;
+ case MADV_RANDOM:
+ new_flags |= VM_RAND_READ;
+ break;
+ default:
+ break;
+ }
+
+ if (new_flags == vma->vm_flags) {
+ *prev = vma;
+ goto success;
+ }
+
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
+ vma->vm_file, pgoff, vma_policy(vma));
+ if (*prev) {
+ vma = *prev;
+ goto success;
+ }
+
+ *prev = vma;
if (start != vma->vm_start) {
error = split_vma(mm, vma, start, 1);
@@ -36,21 +66,12 @@ static long madvise_behavior(struct vm_area_struct * vma, unsigned long start,
* vm_flags is protected by the mmap_sem held in write mode.
*/
VM_ClearReadHint(vma);
-
- switch (behavior) {
- case MADV_SEQUENTIAL:
- vma->vm_flags |= VM_SEQ_READ;
- break;
- case MADV_RANDOM:
- vma->vm_flags |= VM_RAND_READ;
- break;
- default:
- break;
- }
+ vma->vm_flags = new_flags;
out:
if (error == -ENOMEM)
error = -EAGAIN;
+success:
return error;
}
@@ -58,6 +79,7 @@ out:
* Schedule all required I/O operations. Do not wait for completion.
*/
static long madvise_willneed(struct vm_area_struct * vma,
+ struct vm_area_struct ** prev,
unsigned long start, unsigned long end)
{
struct file *file = vma->vm_file;
@@ -65,6 +87,7 @@ static long madvise_willneed(struct vm_area_struct * vma,
if (!file)
return -EBADF;
+ *prev = vma;
start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
if (end > vma->vm_end)
end = vma->vm_end;
@@ -95,8 +118,10 @@ static long madvise_willneed(struct vm_area_struct * vma,
* dirty pages is already available as msync(MS_INVALIDATE).
*/
static long madvise_dontneed(struct vm_area_struct * vma,
+ struct vm_area_struct ** prev,
unsigned long start, unsigned long end)
{
+ *prev = vma;
if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
return -EINVAL;
@@ -111,8 +136,8 @@ static long madvise_dontneed(struct vm_area_struct * vma,
return 0;
}
-static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
- unsigned long end, int behavior)
+static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
+ unsigned long start, unsigned long end, int behavior)
{
long error = -EBADF;
@@ -120,15 +145,15 @@ static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
case MADV_NORMAL:
case MADV_SEQUENTIAL:
case MADV_RANDOM:
- error = madvise_behavior(vma, start, end, behavior);
+ error = madvise_behavior(vma, prev, start, end, behavior);
break;
case MADV_WILLNEED:
- error = madvise_willneed(vma, start, end);
+ error = madvise_willneed(vma, prev, start, end);
break;
case MADV_DONTNEED:
- error = madvise_dontneed(vma, start, end);
+ error = madvise_dontneed(vma, prev, start, end);
break;
default:
@@ -175,8 +200,8 @@ static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
*/
asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
{
- unsigned long end;
- struct vm_area_struct * vma;
+ unsigned long end, tmp;
+ struct vm_area_struct * vma, *prev;
int unmapped_error = 0;
int error = -EINVAL;
size_t len;
@@ -202,40 +227,42 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
/*
* If the interval [start,end) covers some unmapped address
* ranges, just ignore them, but return -ENOMEM at the end.
+ * - different from the way of handling in mlock etc.
*/
- vma = find_vma(current->mm, start);
+ vma = find_vma_prev(current->mm, start, &prev);
+ if (!vma && prev)
+ vma = prev->vm_next;
for (;;) {
/* Still start < end. */
error = -ENOMEM;
if (!vma)
goto out;
- /* Here start < vma->vm_end. */
+ /* Here start < (end|vma->vm_end). */
if (start < vma->vm_start) {
unmapped_error = -ENOMEM;
start = vma->vm_start;
+ if (start >= end)
+ goto out;
}
- /* Here vma->vm_start <= start < vma->vm_end. */
- if (end <= vma->vm_end) {
- if (start < end) {
- error = madvise_vma(vma, start, end,
- behavior);
- if (error)
- goto out;
- }
- error = unmapped_error;
- goto out;
- }
+ /* Here vma->vm_start <= start < (end|vma->vm_end) */
+ tmp = vma->vm_end;
+ if (end < tmp)
+ tmp = end;
- /* Here vma->vm_start <= start < vma->vm_end < end. */
- error = madvise_vma(vma, start, vma->vm_end, behavior);
+ /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+ error = madvise_vma(vma, &prev, start, tmp, behavior);
if (error)
goto out;
- start = vma->vm_end;
- vma = vma->vm_next;
+ start = tmp;
+ if (start < prev->vm_end)
+ start = prev->vm_end;
+ error = unmapped_error;
+ if (start >= end)
+ goto out;
+ vma = prev->vm_next;
}
-
out:
up_write(&current->mm->mmap_sem);
return error;
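
The reworked sys_madvise() loop above clamps [start, end) against each VMA in turn and records, but does not stop on, unmapped gaps. A standalone sketch of that interval walk over a sorted range table (the struct and addresses below are hypothetical):

/* Userspace sketch of the VMA walk in the reworked sys_madvise() loop:
 * clamp [start, end) against each mapped range, remember -ENOMEM for any
 * unmapped gap, but keep going. */
#include <stdio.h>

struct range { unsigned long start, end; };

static int walk(struct range *r, int n, unsigned long start, unsigned long end)
{
        int unmapped_error = 0, i = 0;

        for (;;) {
                unsigned long tmp;

                if (i == n)
                        return -12;             /* -ENOMEM: ran off the map */
                if (start < r[i].start) {       /* gap before this range */
                        unmapped_error = -12;
                        start = r[i].start;
                        if (start >= end)
                                return unmapped_error;
                }
                tmp = r[i].end < end ? r[i].end : end;
                printf("advise [%#lx, %#lx)\n", start, tmp);
                start = tmp;
                if (start >= end)
                        return unmapped_error;
                i++;
        }
}

int main(void)
{
        struct range map[] = { { 0x1000, 0x3000 }, { 0x5000, 0x8000 } };

        printf("ret = %d\n", walk(map, 2, 0x2000, 0x7000));
        return 0;
}
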
diff --git a/mm/memory.c b/mm/memory.c
index d209f745db7f..da91b7bf9986 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -840,23 +840,8 @@ check_user_page_readable(struct mm_struct *mm, unsigned long address)
{
return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL;
}
-
EXPORT_SYMBOL(check_user_page_readable);
-/*
- * Given a physical address, is there a useful struct page pointing to
- * it? This may become more complex in the future if we start dealing
- * with IO-aperture pages for direct-IO.
- */
-
-static inline struct page *get_page_map(struct page *page)
-{
- if (!pfn_valid(page_to_pfn(page)))
- return NULL;
- return page;
-}
-
-
static inline int
untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
unsigned long address)
@@ -887,7 +872,6 @@ untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
return 0;
}
-
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
struct page **pages, struct vm_area_struct **vmas)
@@ -951,21 +935,21 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
}
spin_lock(&mm->page_table_lock);
do {
- struct page *map;
+ struct page *page;
int lookup_write = write;
cond_resched_lock(&mm->page_table_lock);
- while (!(map = follow_page(mm, start, lookup_write))) {
+ while (!(page = follow_page(mm, start, lookup_write))) {
/*
* Shortcut for anonymous pages. We don't want
* to force the creation of pages tables for
- * insanly big anonymously mapped areas that
+ * insanely big anonymously mapped areas that
* nobody touched so far. This is important
* for doing a core dump for these mappings.
*/
if (!lookup_write &&
untouched_anonymous_page(mm,vma,start)) {
- map = ZERO_PAGE(start);
+ page = ZERO_PAGE(start);
break;
}
spin_unlock(&mm->page_table_lock);
@@ -994,30 +978,21 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
spin_lock(&mm->page_table_lock);
}
if (pages) {
- pages[i] = get_page_map(map);
- if (!pages[i]) {
- spin_unlock(&mm->page_table_lock);
- while (i--)
- page_cache_release(pages[i]);
- i = -EFAULT;
- goto out;
- }
- flush_dcache_page(pages[i]);
- if (!PageReserved(pages[i]))
- page_cache_get(pages[i]);
+ pages[i] = page;
+ flush_dcache_page(page);
+ if (!PageReserved(page))
+ page_cache_get(page);
}
if (vmas)
vmas[i] = vma;
i++;
start += PAGE_SIZE;
len--;
- } while(len && start < vma->vm_end);
+ } while (len && start < vma->vm_end);
spin_unlock(&mm->page_table_lock);
- } while(len);
-out:
+ } while (len);
return i;
}
-
EXPORT_SYMBOL(get_user_pages);
static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
@@ -1264,7 +1239,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
}
old_page = pfn_to_page(pfn);
- if (!TestSetPageLocked(old_page)) {
+ if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
int reuse = can_share_swap_page(old_page);
unlock_page(old_page);
if (reuse) {
@@ -1711,10 +1686,6 @@ static int do_swap_page(struct mm_struct * mm,
}
/* The page isn't present yet, go ahead with the fault. */
-
- swap_free(entry);
- if (vm_swap_full())
- remove_exclusive_swap_page(page);
inc_mm_counter(mm, rss);
pte = mk_pte(page, vma->vm_page_prot);
@@ -1722,12 +1693,16 @@ static int do_swap_page(struct mm_struct * mm,
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
write_access = 0;
}
- unlock_page(page);
flush_icache_page(vma, page);
set_pte_at(mm, address, page_table, pte);
page_add_anon_rmap(page, vma, address);
+ swap_free(entry);
+ if (vm_swap_full())
+ remove_exclusive_swap_page(page);
+ unlock_page(page);
+
if (write_access) {
if (do_wp_page(mm, vma, address,
page_table, pmd, pte) == VM_FAULT_OOM)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08c41da429cf..cb41c31e7c87 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -238,46 +238,80 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
}
/* Ensure all existing pages follow the policy. */
-static int
-verify_pages(struct mm_struct *mm,
- unsigned long addr, unsigned long end, unsigned long *nodes)
+static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, unsigned long end, unsigned long *nodes)
{
- while (addr < end) {
- struct page *p;
- pte_t *pte;
- pmd_t *pmd;
- pud_t *pud;
- pgd_t *pgd;
- pgd = pgd_offset(mm, addr);
- if (pgd_none(*pgd)) {
- unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK;
- if (next > addr)
- break;
- addr = next;
+ pte_t *orig_pte;
+ pte_t *pte;
+
+ spin_lock(&mm->page_table_lock);
+ orig_pte = pte = pte_offset_map(pmd, addr);
+ do {
+ unsigned long pfn;
+ unsigned int nid;
+
+ if (!pte_present(*pte))
continue;
- }
- pud = pud_offset(pgd, addr);
- if (pud_none(*pud)) {
- addr = (addr + PUD_SIZE) & PUD_MASK;
+ pfn = pte_pfn(*pte);
+ if (!pfn_valid(pfn))
continue;
- }
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd)) {
- addr = (addr + PMD_SIZE) & PMD_MASK;
+ nid = pfn_to_nid(pfn);
+ if (!test_bit(nid, nodes))
+ break;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ pte_unmap(orig_pte);
+ spin_unlock(&mm->page_table_lock);
+ return addr != end;
+}
+
+static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
+ unsigned long addr, unsigned long end, unsigned long *nodes)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (pmd_none_or_clear_bad(pmd))
continue;
- }
- p = NULL;
- pte = pte_offset_map(pmd, addr);
- if (pte_present(*pte))
- p = pte_page(*pte);
- pte_unmap(pte);
- if (p) {
- unsigned nid = page_to_nid(p);
- if (!test_bit(nid, nodes))
- return -EIO;
- }
- addr += PAGE_SIZE;
- }
+ if (check_pte_range(mm, pmd, addr, next, nodes))
+ return -EIO;
+ } while (pmd++, addr = next, addr != end);
+ return 0;
+}
+
+static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
+ unsigned long addr, unsigned long end, unsigned long *nodes)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_offset(pgd, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (pud_none_or_clear_bad(pud))
+ continue;
+ if (check_pmd_range(mm, pud, addr, next, nodes))
+ return -EIO;
+ } while (pud++, addr = next, addr != end);
+ return 0;
+}
+
+static inline int check_pgd_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long end, unsigned long *nodes)
+{
+ pgd_t *pgd;
+ unsigned long next;
+
+ pgd = pgd_offset(mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+ continue;
+ if (check_pud_range(mm, pgd, addr, next, nodes))
+ return -EIO;
+ } while (pgd++, addr = next, addr != end);
return 0;
}
@@ -299,7 +333,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
if (prev && prev->vm_end < vma->vm_start)
return ERR_PTR(-EFAULT);
if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
- err = verify_pages(vma->vm_mm,
+ err = check_pgd_range(vma->vm_mm,
vma->vm_start, vma->vm_end, nodes);
if (err) {
first = ERR_PTR(err);
@@ -721,7 +755,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or
zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
page = __alloc_pages(gfp, order, zl);
if (page && page_zone(page) == zl->zones[0]) {
- zl->zones[0]->pageset[get_cpu()].interleave_hit++;
+ zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
put_cpu();
}
return page;
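
The new check_pte/pmd/pud/pgd_range() walkers above lean on the *_addr_end() idiom: step to the next table boundary but never past the overall end. A small sketch of that clamping, with a 2 MiB boundary standing in for PMD_SIZE (purely illustrative):

/* Sketch of the *_addr_end() clamping used by the check_*_range() walkers. */
#include <stdio.h>

#define BOUNDARY (1UL << 21)

static unsigned long addr_end(unsigned long addr, unsigned long end)
{
        unsigned long next = (addr + BOUNDARY) & ~(BOUNDARY - 1);

        /* the -1 comparison keeps this correct even when end wraps to 0 */
        return next - 1 < end - 1 ? next : end;
}

int main(void)
{
        unsigned long addr = 0x1ff000, end = 0x600000, next;

        do {
                next = addr_end(addr, end);
                printf("level covers [%#lx, %#lx)\n", addr, next);
        } while (addr = next, addr != end);
        return 0;
}
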
diff --git a/mm/mmap.c b/mm/mmap.c
index de54acd9942f..da3fa90a0aae 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1175,7 +1175,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
(!vma || addr + len <= vma->vm_start))
return addr;
}
- start_addr = addr = mm->free_area_cache;
+ if (len > mm->cached_hole_size) {
+ start_addr = addr = mm->free_area_cache;
+ } else {
+ start_addr = addr = TASK_UNMAPPED_BASE;
+ mm->cached_hole_size = 0;
+ }
full_search:
for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
@@ -1186,7 +1191,9 @@ full_search:
* some holes.
*/
if (start_addr != TASK_UNMAPPED_BASE) {
- start_addr = addr = TASK_UNMAPPED_BASE;
+ addr = TASK_UNMAPPED_BASE;
+ start_addr = addr;
+ mm->cached_hole_size = 0;
goto full_search;
}
return -ENOMEM;
@@ -1198,19 +1205,22 @@ full_search:
mm->free_area_cache = addr + len;
return addr;
}
+ if (addr + mm->cached_hole_size < vma->vm_start)
+ mm->cached_hole_size = vma->vm_start - addr;
addr = vma->vm_end;
}
}
#endif
-void arch_unmap_area(struct vm_area_struct *area)
+void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
{
/*
* Is this a new hole at the lowest possible address?
*/
- if (area->vm_start >= TASK_UNMAPPED_BASE &&
- area->vm_start < area->vm_mm->free_area_cache)
- area->vm_mm->free_area_cache = area->vm_start;
+ if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) {
+ mm->free_area_cache = addr;
+ mm->cached_hole_size = ~0UL;
+ }
}
/*
@@ -1240,6 +1250,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
return addr;
}
+ /* check if free_area_cache is useful for us */
+ if (len <= mm->cached_hole_size) {
+ mm->cached_hole_size = 0;
+ mm->free_area_cache = mm->mmap_base;
+ }
+
/* either no address requested or can't fit in requested address hole */
addr = mm->free_area_cache;
@@ -1251,6 +1267,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
return (mm->free_area_cache = addr-len);
}
+ if (mm->mmap_base < len)
+ goto bottomup;
+
addr = mm->mmap_base-len;
do {
@@ -1264,38 +1283,45 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
/* remember the address as a hint for next time */
return (mm->free_area_cache = addr);
+ /* remember the largest hole we saw so far */
+ if (addr + mm->cached_hole_size < vma->vm_start)
+ mm->cached_hole_size = vma->vm_start - addr;
+
/* try just below the current vma->vm_start */
addr = vma->vm_start-len;
} while (len < vma->vm_start);
+bottomup:
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
* can happen with large stack limits and large mmap()
* allocations.
*/
- mm->free_area_cache = TASK_UNMAPPED_BASE;
+ mm->cached_hole_size = ~0UL;
+ mm->free_area_cache = TASK_UNMAPPED_BASE;
addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
/*
* Restore the topdown base:
*/
mm->free_area_cache = mm->mmap_base;
+ mm->cached_hole_size = ~0UL;
return addr;
}
#endif
-void arch_unmap_area_topdown(struct vm_area_struct *area)
+void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
{
/*
* Is this a new hole at the highest possible address?
*/
- if (area->vm_end > area->vm_mm->free_area_cache)
- area->vm_mm->free_area_cache = area->vm_end;
+ if (addr > mm->free_area_cache)
+ mm->free_area_cache = addr;
/* dont allow allocations above current base */
- if (area->vm_mm->free_area_cache > area->vm_mm->mmap_base)
- area->vm_mm->free_area_cache = area->vm_mm->mmap_base;
+ if (mm->free_area_cache > mm->mmap_base)
+ mm->free_area_cache = mm->mmap_base;
}
unsigned long
@@ -1595,7 +1621,6 @@ static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
if (area->vm_flags & VM_LOCKED)
area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
vm_stat_unaccount(area);
- area->vm_mm->unmap_area(area);
remove_vm_struct(area);
}
@@ -1649,6 +1674,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct vm_area_struct **insertion_point;
struct vm_area_struct *tail_vma = NULL;
+ unsigned long addr;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
do {
@@ -1659,6 +1685,11 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
} while (vma && vma->vm_start < end);
*insertion_point = vma;
tail_vma->vm_next = NULL;
+ if (mm->unmap_area == arch_unmap_area)
+ addr = prev ? prev->vm_end : mm->mmap_base;
+ else
+ addr = vma ? vma->vm_start : mm->mmap_base;
+ mm->unmap_area(mm, addr);
mm->mmap_cache = NULL; /* Kill the cache. */
}
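
The mmap.c changes above add cached_hole_size so a search only restarts from free_area_cache when the request is larger than every hole already skipped. A simplified userspace sketch of first-fit search with that largest-hole cache (the vma table and base address are hypothetical, and the full-search restart is omitted):

/* Userspace sketch of the cached_hole_size heuristic from
 * arch_get_unmapped_area(). */
#include <stdio.h>

struct vma { unsigned long start, end; };

static unsigned long base = 0x10000;            /* stands in for TASK_UNMAPPED_BASE */
static unsigned long free_area_cache = 0x10000;
static unsigned long cached_hole_size;

static unsigned long get_unmapped_area(struct vma *v, int n, unsigned long len)
{
        unsigned long addr;
        int i = 0;

        if (len > cached_hole_size) {
                addr = free_area_cache;
        } else {
                addr = base;                    /* known hole big enough: restart */
                cached_hole_size = 0;
        }
        for (; i < n && v[i].end <= addr; i++)  /* skip vmas below the start */
                ;
        for (;; i++) {
                if (i == n || addr + len <= v[i].start) {
                        free_area_cache = addr + len;
                        return addr;
                }
                if (addr + cached_hole_size < v[i].start)
                        cached_hole_size = v[i].start - addr;
                addr = v[i].end;
        }
}

int main(void)
{
        struct vma v[] = { { 0x11000, 0x12000 }, { 0x13000, 0x20000 } };

        printf("%#lx\n", get_unmapped_area(v, 2, 0x1000));  /* fits in first hole */
        printf("%#lx\n", get_unmapped_area(v, 2, 0x4000));  /* must search above */
        return 0;
}
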
diff --git a/mm/msync.c b/mm/msync.c
index 090f426bca7d..d0f5a1bce7cb 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -34,6 +34,8 @@ static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (!pte_present(*pte))
continue;
+ if (!pte_maybe_dirty(*pte))
+ continue;
pfn = pte_pfn(*pte);
if (!pfn_valid(pfn))
continue;
diff --git a/mm/nommu.c b/mm/nommu.c
index c53e9c8f6b4a..ce74452c02d9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1067,7 +1067,7 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
return -ENOMEM;
}
-void arch_unmap_area(struct vm_area_struct *area)
+void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
{
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4bbb1cb10495..59666d905f19 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -258,6 +258,10 @@ void out_of_memory(unsigned int __nocast gfp_mask)
struct mm_struct *mm = NULL;
task_t * p;
+ printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
+ /* print memory stats */
+ show_mem();
+
read_lock(&tasklist_lock);
retry:
p = select_bad_process();
@@ -268,12 +272,9 @@ retry:
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
read_unlock(&tasklist_lock);
- show_free_areas();
panic("Out of memory and no killable processes...\n");
}
- printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
- show_free_areas();
mm = oom_kill_process(p);
if (!mm)
goto retry;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b1061b1962f8..206920796f5f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -105,11 +105,13 @@ static void bad_page(const char *function, struct page *page)
printk(KERN_EMERG "Backtrace:\n");
dump_stack();
printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
- page->flags &= ~(1 << PG_private |
+ page->flags &= ~(1 << PG_lru |
+ 1 << PG_private |
1 << PG_locked |
- 1 << PG_lru |
1 << PG_active |
1 << PG_dirty |
+ 1 << PG_reclaim |
+ 1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback);
set_page_count(page, 0);
@@ -440,14 +442,17 @@ void set_page_refs(struct page *page, int order)
*/
static void prep_new_page(struct page *page, int order)
{
- if (page->mapping || page_mapcount(page) ||
- (page->flags & (
+ if ( page_mapcount(page) ||
+ page->mapping != NULL ||
+ page_count(page) != 0 ||
+ (page->flags & (
+ 1 << PG_lru |
1 << PG_private |
1 << PG_locked |
- 1 << PG_lru |
1 << PG_active |
1 << PG_dirty |
1 << PG_reclaim |
+ 1 << PG_slab |
1 << PG_swapcache |
1 << PG_writeback )))
bad_page(__FUNCTION__, page);
@@ -511,6 +516,36 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
return allocated;
}
+#ifdef CONFIG_NUMA
+/* Called from the slab reaper to drain remote pagesets */
+void drain_remote_pages(void)
+{
+ struct zone *zone;
+ int i;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ for_each_zone(zone) {
+ struct per_cpu_pageset *pset;
+
+ /* Do not drain local pagesets */
+ if (zone->zone_pgdat->node_id == numa_node_id())
+ continue;
+
+ pset = zone->pageset[smp_processor_id()];
+ for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
+ struct per_cpu_pages *pcp;
+
+ pcp = &pset->pcp[i];
+ if (pcp->count)
+ pcp->count -= free_pages_bulk(zone, pcp->count,
+ &pcp->list, 0);
+ }
+ }
+ local_irq_restore(flags);
+}
+#endif
+
#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
static void __drain_pages(unsigned int cpu)
{
@@ -520,7 +555,7 @@ static void __drain_pages(unsigned int cpu)
for_each_zone(zone) {
struct per_cpu_pageset *pset;
- pset = &zone->pageset[cpu];
+ pset = zone_pcp(zone, cpu);
for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
struct per_cpu_pages *pcp;
@@ -583,12 +618,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
local_irq_save(flags);
cpu = smp_processor_id();
- p = &z->pageset[cpu];
+ p = zone_pcp(z,cpu);
if (pg == orig) {
- z->pageset[cpu].numa_hit++;
+ p->numa_hit++;
} else {
p->numa_miss++;
- zonelist->zones[0]->pageset[cpu].numa_foreign++;
+ zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
}
if (pg == NODE_DATA(numa_node_id()))
p->local_node++;
@@ -615,12 +650,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
if (PageAnon(page))
page->mapping = NULL;
free_pages_check(__FUNCTION__, page);
- pcp = &zone->pageset[get_cpu()].pcp[cold];
+ pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
- if (pcp->count >= pcp->high)
- pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
list_add(&page->lru, &pcp->list);
pcp->count++;
+ if (pcp->count >= pcp->high)
+ pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
local_irq_restore(flags);
put_cpu();
}
@@ -659,7 +694,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
if (order == 0) {
struct per_cpu_pages *pcp;
- pcp = &zone->pageset[get_cpu()].pcp[cold];
+ pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
@@ -724,6 +759,16 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
return 1;
}
+static inline int
+should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
+{
+ if (!z->reclaim_pages)
+ return 0;
+ if (gfp_mask & __GFP_NORECLAIM)
+ return 0;
+ return 1;
+}
+
/*
* This is the 'heart' of the zoned buddy allocator.
*/
@@ -760,17 +805,32 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
classzone_idx = zone_idx(zones[0]);
- restart:
+restart:
/* Go through the zonelist once, looking for a zone with enough free */
for (i = 0; (z = zones[i]) != NULL; i++) {
-
- if (!zone_watermark_ok(z, order, z->pages_low,
- classzone_idx, 0, 0))
- continue;
+ int do_reclaim = should_reclaim_zone(z, gfp_mask);
if (!cpuset_zone_allowed(z))
continue;
+ /*
+ * If the zone is to attempt early page reclaim then this loop
+ * will try to reclaim pages and check the watermark a second
+ * time before giving up and falling back to the next zone.
+ */
+zone_reclaim_retry:
+ if (!zone_watermark_ok(z, order, z->pages_low,
+ classzone_idx, 0, 0)) {
+ if (!do_reclaim)
+ continue;
+ else {
+ zone_reclaim(z, gfp_mask, order);
+ /* Only try reclaim once */
+ do_reclaim = 0;
+ goto zone_reclaim_retry;
+ }
+ }
+
page = buffered_rmqueue(z, order, gfp_mask);
if (page)
goto got_pg;
@@ -829,7 +889,7 @@ rebalance:
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- did_some_progress = try_to_free_pages(zones, gfp_mask, order);
+ did_some_progress = try_to_free_pages(zones, gfp_mask);
p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;
@@ -905,6 +965,7 @@ nopage:
" order:%d, mode:0x%x\n",
p->comm, order, gfp_mask);
dump_stack();
+ show_mem();
}
return NULL;
got_pg:
@@ -1114,7 +1175,7 @@ void get_full_page_state(struct page_state *ret)
__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
}
-unsigned long __read_page_state(unsigned offset)
+unsigned long __read_page_state(unsigned long offset)
{
unsigned long ret = 0;
int cpu;
@@ -1128,7 +1189,7 @@ unsigned long __read_page_state(unsigned offset)
return ret;
}
-void __mod_page_state(unsigned offset, unsigned long delta)
+void __mod_page_state(unsigned long offset, unsigned long delta)
{
unsigned long flags;
void* ptr;
@@ -1237,22 +1298,23 @@ void show_free_areas(void)
if (!cpu_possible(cpu))
continue;
- pageset = zone->pageset + cpu;
+ pageset = zone_pcp(zone, cpu);
for (temperature = 0; temperature < 2; temperature++)
- printk("cpu %d %s: low %d, high %d, batch %d\n",
+ printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
cpu,
temperature ? "cold" : "hot",
pageset->pcp[temperature].low,
pageset->pcp[temperature].high,
- pageset->pcp[temperature].batch);
+ pageset->pcp[temperature].batch,
+ pageset->pcp[temperature].count);
}
}
get_page_state(&ps);
get_zone_counts(&active, &inactive, &free);
- printk("\nFree pages: %11ukB (%ukB HighMem)\n",
+ printk("Free pages: %11ukB (%ukB HighMem)\n",
K(nr_free_pages()),
K(nr_free_highpages()));
@@ -1620,6 +1682,155 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
memmap_init_zone((size), (nid), (zone), (start_pfn))
#endif
+static int __devinit zone_batchsize(struct zone *zone)
+{
+ int batch;
+
+ /*
+ * The per-cpu-pages pools are set to around 1000th of the
+ * size of the zone. But no more than 1/4 of a meg - there's
+ * no point in going beyond the size of L2 cache.
+ *
+ * OK, so we don't know how big the cache is. So guess.
+ */
+ batch = zone->present_pages / 1024;
+ if (batch * PAGE_SIZE > 256 * 1024)
+ batch = (256 * 1024) / PAGE_SIZE;
+ batch /= 4; /* We effectively *= 4 below */
+ if (batch < 1)
+ batch = 1;
+
+ /*
+ * Clamp the batch to a 2^n - 1 value. Having a power
+ * of 2 value was found to be more likely to have
+ * suboptimal cache aliasing properties in some cases.
+ *
+ * For example if 2 tasks are alternately allocating
+ * batches of pages, one task can end up with a lot
+ * of pages of one half of the possible page colors
+ * and the other with pages of the other colors.
+ */
+ batch = (1 << fls(batch + batch/2)) - 1;
+ return batch;
+}
+
+inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+ struct per_cpu_pages *pcp;
+
+ pcp = &p->pcp[0]; /* hot */
+ pcp->count = 0;
+ pcp->low = 2 * batch;
+ pcp->high = 6 * batch;
+ pcp->batch = max(1UL, 1 * batch);
+ INIT_LIST_HEAD(&pcp->list);
+
+ pcp = &p->pcp[1]; /* cold*/
+ pcp->count = 0;
+ pcp->low = 0;
+ pcp->high = 2 * batch;
+ pcp->batch = max(1UL, 1 * batch);
+ INIT_LIST_HEAD(&pcp->list);
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * Some NUMA counter updates may also be caught by the boot pagesets.
+ * These will be discarded when bootup is complete.
+ */
+static struct per_cpu_pageset
+ boot_pageset[NR_CPUS] __initdata;
+
+/*
+ * Dynamically allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __devinit process_zones(int cpu)
+{
+ struct zone *zone, *dzone;
+
+ for_each_zone(zone) {
+
+ zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!zone->pageset[cpu])
+ goto bad;
+
+ setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
+ }
+
+ return 0;
+bad:
+ for_each_zone(dzone) {
+ if (dzone == zone)
+ break;
+ kfree(dzone->pageset[cpu]);
+ dzone->pageset[cpu] = NULL;
+ }
+ return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+#ifdef CONFIG_NUMA
+ struct zone *zone;
+
+ for_each_zone(zone) {
+ struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+ zone_pcp(zone, cpu) = NULL;
+ kfree(pset);
+ }
+#endif
+}
+
+static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ int cpu = (long)hcpu;
+ int ret = NOTIFY_OK;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ if (process_zones(cpu))
+ ret = NOTIFY_BAD;
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DEAD:
+ free_zone_pagesets(cpu);
+ break;
+#endif
+ default:
+ break;
+ }
+ return ret;
+}
+
+static struct notifier_block pageset_notifier =
+ { &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset()
+{
+ int err;
+
+ /* Initialize per_cpu_pageset for cpu 0.
+ * A cpuup callback will do this for every cpu
+ * as it comes online
+ */
+ err = process_zones(smp_processor_id());
+ BUG_ON(err);
+ register_cpu_notifier(&pageset_notifier);
+}
+
+#endif
+
/*
* Set up the zone data structures:
* - mark all pages reserved
@@ -1662,48 +1873,16 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
- /*
- * The per-cpu-pages pools are set to around 1000th of the
- * size of the zone. But no more than 1/4 of a meg - there's
- * no point in going beyond the size of L2 cache.
- *
- * OK, so we don't know how big the cache is. So guess.
- */
- batch = zone->present_pages / 1024;
- if (batch * PAGE_SIZE > 256 * 1024)
- batch = (256 * 1024) / PAGE_SIZE;
- batch /= 4; /* We effectively *= 4 below */
- if (batch < 1)
- batch = 1;
-
- /*
- * Clamp the batch to a 2^n - 1 value. Having a power
- * of 2 value was found to be more likely to have
- * suboptimal cache aliasing properties in some cases.
- *
- * For example if 2 tasks are alternately allocating
- * batches of pages, one task can end up with a lot
- * of pages of one half of the possible page colors
- * and the other with pages of the other colors.
- */
- batch = (1 << fls(batch + batch/2)) - 1;
+ batch = zone_batchsize(zone);
for (cpu = 0; cpu < NR_CPUS; cpu++) {
- struct per_cpu_pages *pcp;
-
- pcp = &zone->pageset[cpu].pcp[0]; /* hot */
- pcp->count = 0;
- pcp->low = 2 * batch;
- pcp->high = 6 * batch;
- pcp->batch = 1 * batch;
- INIT_LIST_HEAD(&pcp->list);
-
- pcp = &zone->pageset[cpu].pcp[1]; /* cold */
- pcp->count = 0;
- pcp->low = 0;
- pcp->high = 2 * batch;
- pcp->batch = 1 * batch;
- INIT_LIST_HEAD(&pcp->list);
+#ifdef CONFIG_NUMA
+ /* Early boot. Slab allocator not functional yet */
+ zone->pageset[cpu] = &boot_pageset[cpu];
+ setup_pageset(&boot_pageset[cpu],0);
+#else
+ setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
}
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
zone_names[j], realsize, batch);
@@ -1713,6 +1892,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
zone->nr_scan_inactive = 0;
zone->nr_active = 0;
zone->nr_inactive = 0;
+ atomic_set(&zone->reclaim_in_progress, -1);
if (!size)
continue;
@@ -1853,6 +2033,115 @@ struct seq_operations fragmentation_op = {
.show = frag_show,
};
+/*
+ * Output information about zones in @pgdat.
+ */
+static int zoneinfo_show(struct seq_file *m, void *arg)
+{
+ pg_data_t *pgdat = arg;
+ struct zone *zone;
+ struct zone *node_zones = pgdat->node_zones;
+ unsigned long flags;
+
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
+ int i;
+
+ if (!zone->present_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+ seq_printf(m,
+ "\n pages free %lu"
+ "\n min %lu"
+ "\n low %lu"
+ "\n high %lu"
+ "\n active %lu"
+ "\n inactive %lu"
+ "\n scanned %lu (a: %lu i: %lu)"
+ "\n spanned %lu"
+ "\n present %lu",
+ zone->free_pages,
+ zone->pages_min,
+ zone->pages_low,
+ zone->pages_high,
+ zone->nr_active,
+ zone->nr_inactive,
+ zone->pages_scanned,
+ zone->nr_scan_active, zone->nr_scan_inactive,
+ zone->spanned_pages,
+ zone->present_pages);
+ seq_printf(m,
+ "\n protection: (%lu",
+ zone->lowmem_reserve[0]);
+ for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
+ seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+ seq_printf(m,
+ ")"
+ "\n pagesets");
+ for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
+ struct per_cpu_pageset *pageset;
+ int j;
+
+ pageset = zone_pcp(zone, i);
+ for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
+ if (pageset->pcp[j].count)
+ break;
+ }
+ if (j == ARRAY_SIZE(pageset->pcp))
+ continue;
+ for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
+ seq_printf(m,
+ "\n cpu: %i pcp: %i"
+ "\n count: %i"
+ "\n low: %i"
+ "\n high: %i"
+ "\n batch: %i",
+ i, j,
+ pageset->pcp[j].count,
+ pageset->pcp[j].low,
+ pageset->pcp[j].high,
+ pageset->pcp[j].batch);
+ }
+#ifdef CONFIG_NUMA
+ seq_printf(m,
+ "\n numa_hit: %lu"
+ "\n numa_miss: %lu"
+ "\n numa_foreign: %lu"
+ "\n interleave_hit: %lu"
+ "\n local_node: %lu"
+ "\n other_node: %lu",
+ pageset->numa_hit,
+ pageset->numa_miss,
+ pageset->numa_foreign,
+ pageset->interleave_hit,
+ pageset->local_node,
+ pageset->other_node);
+#endif
+ }
+ seq_printf(m,
+ "\n all_unreclaimable: %u"
+ "\n prev_priority: %i"
+ "\n temp_priority: %i"
+ "\n start_pfn: %lu",
+ zone->all_unreclaimable,
+ zone->prev_priority,
+ zone->temp_priority,
+ zone->zone_start_pfn);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ seq_putc(m, '\n');
+ }
+ return 0;
+}
+
+struct seq_operations zoneinfo_op = {
+ .start = frag_start, /* iterate over all zones. The same as in
+ * fragmentation. */
+ .next = frag_next,
+ .stop = frag_stop,
+ .show = zoneinfo_show,
+};
+
static char *vmstat_text[] = {
"nr_dirty",
"nr_writeback",
@@ -2058,10 +2347,10 @@ static void setup_per_zone_pages_min(void)
min_pages = 128;
zone->pages_min = min_pages;
} else {
- /* if it's a lowmem zone, reserve a number of pages
+ /* if it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
- zone->pages_min = (pages_min * zone->present_pages) /
+ zone->pages_min = (pages_min * zone->present_pages) /
lowmem_pages;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 9827409eb7c7..89770bd25f31 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -539,27 +539,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
goto out_unmap;
}
- /*
- * Don't pull an anonymous page out from under get_user_pages.
- * GUP carefully breaks COW and raises page count (while holding
- * page_table_lock, as we have here) to make sure that the page
- * cannot be freed. If we unmap that page here, a user write
- * access to the virtual address will bring back the page, but
- * its raised count will (ironically) be taken to mean it's not
- * an exclusive swap page, do_wp_page will replace it by a copy
- * page, and the user never get to see the data GUP was holding
- * the original page for.
- *
- * This test is also useful for when swapoff (unuse_process) has
- * to drop page lock: its reference to the page stops existing
- * ptes from being unmapped, so swapoff can make progress.
- */
- if (PageSwapCache(page) &&
- page_count(page) != page_mapcount(page) + 2) {
- ret = SWAP_FAIL;
- goto out_unmap;
- }
-
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
pteval = ptep_clear_flush(vma, address, pte);
diff --git a/mm/shmem.c b/mm/shmem.c
index 61574b81d979..e64fa726a790 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -6,8 +6,8 @@
* 2000-2001 Christoph Rohland
* 2000-2001 SAP AG
* 2002 Red Hat Inc.
- * Copyright (C) 2002-2004 Hugh Dickins.
- * Copyright (C) 2002-2004 VERITAS Software Corporation.
+ * Copyright (C) 2002-2005 Hugh Dickins.
+ * Copyright (C) 2002-2005 VERITAS Software Corporation.
* Copyright (C) 2004 Andi Kleen, SuSE Labs
*
* Extended attribute support for tmpfs:
@@ -194,7 +194,7 @@ static DEFINE_SPINLOCK(shmem_swaplist_lock);
static void shmem_free_blocks(struct inode *inode, long pages)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo) {
+ if (sbinfo->max_blocks) {
spin_lock(&sbinfo->stat_lock);
sbinfo->free_blocks += pages;
inode->i_blocks -= pages*BLOCKS_PER_PAGE;
@@ -357,7 +357,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
* page (and perhaps indirect index pages) yet to allocate:
* a waste to allocate index if we cannot allocate data.
*/
- if (sbinfo) {
+ if (sbinfo->max_blocks) {
spin_lock(&sbinfo->stat_lock);
if (sbinfo->free_blocks <= 1) {
spin_unlock(&sbinfo->stat_lock);
@@ -677,8 +677,8 @@ static void shmem_delete_inode(struct inode *inode)
spin_unlock(&shmem_swaplist_lock);
}
}
- if (sbinfo) {
- BUG_ON(inode->i_blocks);
+ BUG_ON(inode->i_blocks);
+ if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
sbinfo->free_inodes++;
spin_unlock(&sbinfo->stat_lock);
@@ -1080,7 +1080,7 @@ repeat:
} else {
shmem_swp_unmap(entry);
sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo) {
+ if (sbinfo->max_blocks) {
spin_lock(&sbinfo->stat_lock);
if (sbinfo->free_blocks == 0 ||
shmem_acct_block(info->flags)) {
@@ -1269,7 +1269,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- if (sbinfo) {
+ if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
if (!sbinfo->free_inodes) {
spin_unlock(&sbinfo->stat_lock);
@@ -1319,7 +1319,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
mpol_shared_policy_init(&info->policy);
break;
}
- } else if (sbinfo) {
+ } else if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
sbinfo->free_inodes++;
spin_unlock(&sbinfo->stat_lock);
@@ -1328,31 +1328,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
}
#ifdef CONFIG_TMPFS
-
-static int shmem_set_size(struct shmem_sb_info *sbinfo,
- unsigned long max_blocks, unsigned long max_inodes)
-{
- int error;
- unsigned long blocks, inodes;
-
- spin_lock(&sbinfo->stat_lock);
- blocks = sbinfo->max_blocks - sbinfo->free_blocks;
- inodes = sbinfo->max_inodes - sbinfo->free_inodes;
- error = -EINVAL;
- if (max_blocks < blocks)
- goto out;
- if (max_inodes < inodes)
- goto out;
- error = 0;
- sbinfo->max_blocks = max_blocks;
- sbinfo->free_blocks = max_blocks - blocks;
- sbinfo->max_inodes = max_inodes;
- sbinfo->free_inodes = max_inodes - inodes;
-out:
- spin_unlock(&sbinfo->stat_lock);
- return error;
-}
-
static struct inode_operations shmem_symlink_inode_operations;
static struct inode_operations shmem_symlink_inline_operations;
@@ -1607,15 +1582,17 @@ static int shmem_statfs(struct super_block *sb, struct kstatfs *buf)
buf->f_type = TMPFS_MAGIC;
buf->f_bsize = PAGE_CACHE_SIZE;
buf->f_namelen = NAME_MAX;
- if (sbinfo) {
- spin_lock(&sbinfo->stat_lock);
+ spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->max_blocks) {
buf->f_blocks = sbinfo->max_blocks;
buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
+ }
+ if (sbinfo->max_inodes) {
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
- spin_unlock(&sbinfo->stat_lock);
}
/* else leave those fields 0 like simple_statfs */
+ spin_unlock(&sbinfo->stat_lock);
return 0;
}
@@ -1672,7 +1649,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
* but each new link needs a new dentry, pinning lowmem, and
* tmpfs dentries cannot be pruned until they are unlinked.
*/
- if (sbinfo) {
+ if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
if (!sbinfo->free_inodes) {
spin_unlock(&sbinfo->stat_lock);
@@ -1697,7 +1674,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) {
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (sbinfo) {
+ if (sbinfo->max_inodes) {
spin_lock(&sbinfo->stat_lock);
sbinfo->free_inodes++;
spin_unlock(&sbinfo->stat_lock);
@@ -1921,22 +1898,42 @@ bad_val:
static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- unsigned long max_blocks = 0;
- unsigned long max_inodes = 0;
+ unsigned long max_blocks = sbinfo->max_blocks;
+ unsigned long max_inodes = sbinfo->max_inodes;
+ unsigned long blocks;
+ unsigned long inodes;
+ int error = -EINVAL;
+
+ if (shmem_parse_options(data, NULL, NULL, NULL,
+ &max_blocks, &max_inodes))
+ return error;
- if (sbinfo) {
- max_blocks = sbinfo->max_blocks;
- max_inodes = sbinfo->max_inodes;
- }
- if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes))
- return -EINVAL;
- /* Keep it simple: disallow limited <-> unlimited remount */
- if ((max_blocks || max_inodes) == !sbinfo)
- return -EINVAL;
- /* But allow the pointless unlimited -> unlimited remount */
- if (!sbinfo)
- return 0;
- return shmem_set_size(sbinfo, max_blocks, max_inodes);
+ spin_lock(&sbinfo->stat_lock);
+ blocks = sbinfo->max_blocks - sbinfo->free_blocks;
+ inodes = sbinfo->max_inodes - sbinfo->free_inodes;
+ if (max_blocks < blocks)
+ goto out;
+ if (max_inodes < inodes)
+ goto out;
+ /*
+ * Those tests also disallow limited->unlimited while any are in
+ * use, so i_blocks will always be zero when max_blocks is zero;
+ * but we must separately disallow unlimited->limited, because
+ * in that case we have no record of how much is already in use.
+ */
+ if (max_blocks && !sbinfo->max_blocks)
+ goto out;
+ if (max_inodes && !sbinfo->max_inodes)
+ goto out;
+
+ error = 0;
+ sbinfo->max_blocks = max_blocks;
+ sbinfo->free_blocks = max_blocks - blocks;
+ sbinfo->max_inodes = max_inodes;
+ sbinfo->free_inodes = max_inodes - inodes;
+out:
+ spin_unlock(&sbinfo->stat_lock);
+ return error;
}
#endif
@@ -1961,11 +1958,11 @@ static int shmem_fill_super(struct super_block *sb,
uid_t uid = current->fsuid;
gid_t gid = current->fsgid;
int err = -ENOMEM;
-
-#ifdef CONFIG_TMPFS
+ struct shmem_sb_info *sbinfo;
unsigned long blocks = 0;
unsigned long inodes = 0;
+#ifdef CONFIG_TMPFS
/*
* Per default we only allow half of the physical ram per
* tmpfs instance, limiting inodes to one per page of lowmem;
@@ -1976,34 +1973,34 @@ static int shmem_fill_super(struct super_block *sb,
inodes = totalram_pages - totalhigh_pages;
if (inodes > blocks)
inodes = blocks;
-
- if (shmem_parse_options(data, &mode,
- &uid, &gid, &blocks, &inodes))
+ if (shmem_parse_options(data, &mode, &uid, &gid,
+ &blocks, &inodes))
return -EINVAL;
}
-
- if (blocks || inodes) {
- struct shmem_sb_info *sbinfo;
- sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL);
- if (!sbinfo)
- return -ENOMEM;
- sb->s_fs_info = sbinfo;
- spin_lock_init(&sbinfo->stat_lock);
- sbinfo->max_blocks = blocks;
- sbinfo->free_blocks = blocks;
- sbinfo->max_inodes = inodes;
- sbinfo->free_inodes = inodes;
- }
- sb->s_xattr = shmem_xattr_handlers;
#else
sb->s_flags |= MS_NOUSER;
#endif
+ /* Round up to L1_CACHE_BYTES to resist false sharing */
+ sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info),
+ L1_CACHE_BYTES), GFP_KERNEL);
+ if (!sbinfo)
+ return -ENOMEM;
+
+ spin_lock_init(&sbinfo->stat_lock);
+ sbinfo->max_blocks = blocks;
+ sbinfo->free_blocks = blocks;
+ sbinfo->max_inodes = inodes;
+ sbinfo->free_inodes = inodes;
+
+ sb->s_fs_info = sbinfo;
sb->s_maxbytes = SHMEM_MAX_BYTES;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
sb->s_magic = TMPFS_MAGIC;
sb->s_op = &shmem_ops;
+ sb->s_xattr = shmem_xattr_handlers;
+
inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
if (!inode)
goto failed;
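
The shmem_remount_fs() rewrite above enforces that a limit can only be raised, or lowered no further than current usage, and that a previously unlimited (zero) limit cannot be made finite because usage was never tracked. A small sketch of those checks for the blocks limit (values are made up):

/* Sketch of the remount validation that replaces shmem_set_size(). */
#include <stdio.h>

struct sbinfo { unsigned long max_blocks, free_blocks; };

static int remount_blocks(struct sbinfo *sb, unsigned long new_max)
{
        unsigned long in_use = sb->max_blocks - sb->free_blocks;

        if (new_max < in_use)
                return -22;                     /* -EINVAL: below current usage */
        if (new_max && !sb->max_blocks)
                return -22;                     /* unlimited -> limited: usage unknown */
        sb->max_blocks = new_max;
        sb->free_blocks = new_max - in_use;
        return 0;
}

int main(void)
{
        struct sbinfo sb = { 1024, 256 };       /* 768 blocks in use */

        printf("%d\n", remount_blocks(&sb, 512));   /* -22: would lose data */
        printf("%d\n", remount_blocks(&sb, 2048));  /*   0: limit raised */
        return 0;
}
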
diff --git a/mm/slab.c b/mm/slab.c
index c78d343b3c5f..93cbbbb39f42 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2851,6 +2851,7 @@ next:
}
check_irq_on();
up(&cache_chain_sem);
+ drain_remote_pages();
/* Setup the next iteration */
schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index da48405cd9a3..60cd24a55204 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -276,61 +276,37 @@ void swap_free(swp_entry_t entry)
}
/*
- * Check if we're the only user of a swap page,
- * when the page is locked.
+ * How many references to page are currently swapped out?
*/
-static int exclusive_swap_page(struct page *page)
+static inline int page_swapcount(struct page *page)
{
- int retval = 0;
- struct swap_info_struct * p;
+ int count = 0;
+ struct swap_info_struct *p;
swp_entry_t entry;
entry.val = page->private;
p = swap_info_get(entry);
if (p) {
- /* Is the only swap cache user the cache itself? */
- if (p->swap_map[swp_offset(entry)] == 1) {
- /* Recheck the page count with the swapcache lock held.. */
- write_lock_irq(&swapper_space.tree_lock);
- if (page_count(page) == 2)
- retval = 1;
- write_unlock_irq(&swapper_space.tree_lock);
- }
+ /* Subtract the 1 for the swap cache itself */
+ count = p->swap_map[swp_offset(entry)] - 1;
swap_info_put(p);
}
- return retval;
+ return count;
}
/*
* We can use this swap cache entry directly
* if there are no other references to it.
- *
- * Here "exclusive_swap_page()" does the real
- * work, but we opportunistically check whether
- * we need to get all the locks first..
*/
int can_share_swap_page(struct page *page)
{
- int retval = 0;
+ int count;
- if (!PageLocked(page))
- BUG();
- switch (page_count(page)) {
- case 3:
- if (!PagePrivate(page))
- break;
- /* Fallthrough */
- case 2:
- if (!PageSwapCache(page))
- break;
- retval = exclusive_swap_page(page);
- break;
- case 1:
- if (PageReserved(page))
- break;
- retval = 1;
- }
- return retval;
+ BUG_ON(!PageLocked(page));
+ count = page_mapcount(page);
+ if (count <= 1 && PageSwapCache(page))
+ count += page_swapcount(page);
+ return count == 1;
}
/*
@@ -529,9 +505,10 @@ static int unuse_mm(struct mm_struct *mm,
if (!down_read_trylock(&mm->mmap_sem)) {
/*
- * Our reference to the page stops try_to_unmap_one from
- * unmapping its ptes, so swapoff can make progress.
+ * Activate page so shrink_cache is unlikely to unmap its
+ * ptes while lock is dropped, so swapoff can make progress.
*/
+ activate_page(page);
unlock_page(page);
down_read(&mm->mmap_sem);
lock_page(page);
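
The new can_share_swap_page() above counts a page's user mappings and, when the page sits in the swap cache, its swapped-out references minus the cache's own one; the page is privately reusable only when that sum is exactly one. A tiny sketch of the counting (inputs are made up):

/* Sketch of the sharing test built from page_mapcount() + page_swapcount(). */
#include <stdio.h>

static int can_share(int mapcount, int in_swapcache, int swap_map_count)
{
        int count = mapcount;

        if (count <= 1 && in_swapcache)
                count += swap_map_count - 1;    /* drop the swap cache's own ref */
        return count == 1;
}

int main(void)
{
        printf("%d\n", can_share(1, 0, 0));     /* 1: mapped once, no swap */
        printf("%d\n", can_share(1, 1, 1));     /* 1: only the cache holds the swap slot */
        printf("%d\n", can_share(1, 1, 2));     /* 0: another pte is swapped out */
        return 0;
}
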
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 269eded9b459..4b8e62a19370 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -74,6 +74,9 @@ struct scan_control {
int may_writepage;
+ /* Can pages be swapped as part of reclaim? */
+ int may_swap;
+
/* This context's SWAP_CLUSTER_MAX. If freeing memory for
* suspend, we effectively ignore SWAP_CLUSTER_MAX.
* In this context, it doesn't matter that we scan the
@@ -180,17 +183,20 @@ EXPORT_SYMBOL(remove_shrinker);
* `lru_pages' represents the number of on-LRU pages in all the zones which
* are eligible for the caller's allocation attempt. It is used for balancing
* slab reclaim versus page reclaim.
+ *
+ * Returns the number of slab objects which we shrunk.
*/
static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
unsigned long lru_pages)
{
struct shrinker *shrinker;
+ int ret = 0;
if (scanned == 0)
scanned = SWAP_CLUSTER_MAX;
if (!down_read_trylock(&shrinker_rwsem))
- return 0;
+ return 1; /* Assume we'll be able to shrink next time */
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
@@ -209,10 +215,14 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
while (total_scan >= SHRINK_BATCH) {
long this_scan = SHRINK_BATCH;
int shrink_ret;
+ int nr_before;
+ nr_before = (*shrinker->shrinker)(0, gfp_mask);
shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
if (shrink_ret == -1)
break;
+ if (shrink_ret < nr_before)
+ ret += nr_before - shrink_ret;
mod_page_state(slabs_scanned, this_scan);
total_scan -= this_scan;
@@ -222,7 +232,7 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
shrinker->nr += total_scan;
}
up_read(&shrinker_rwsem);
- return 0;
+ return ret;
}
/* Called without lock on whether page is mapped, so answer is unstable */
@@ -407,7 +417,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
*/
- if (PageAnon(page) && !PageSwapCache(page)) {
+ if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) {
if (!add_to_swap(page))
goto activate_locked;
}
@@ -890,7 +900,9 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
+ atomic_inc(&zone->reclaim_in_progress);
shrink_zone(zone, sc);
+ atomic_dec(&zone->reclaim_in_progress);
}
}
@@ -907,8 +919,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
* holds filesystem locks which prevent writeout this might not work, and the
* allocation attempt will fail.
*/
-int try_to_free_pages(struct zone **zones,
- unsigned int gfp_mask, unsigned int order)
+int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
{
int priority;
int ret = 0;
@@ -920,6 +931,7 @@ int try_to_free_pages(struct zone **zones,
sc.gfp_mask = gfp_mask;
sc.may_writepage = 0;
+ sc.may_swap = 1;
inc_page_state(allocstall);
@@ -1020,6 +1032,7 @@ loop_again:
total_reclaimed = 0;
sc.gfp_mask = GFP_KERNEL;
sc.may_writepage = 0;
+ sc.may_swap = 1;
sc.nr_mapped = read_page_state(nr_mapped);
inc_page_state(pageoutrun);
@@ -1079,6 +1092,7 @@ scan:
*/
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
+ int nr_slab;
if (zone->present_pages == 0)
continue;
@@ -1098,16 +1112,19 @@ scan:
sc.nr_reclaimed = 0;
sc.priority = priority;
sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
+ atomic_inc(&zone->reclaim_in_progress);
shrink_zone(zone, &sc);
+ atomic_dec(&zone->reclaim_in_progress);
reclaim_state->reclaimed_slab = 0;
- shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages);
+ nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
+ lru_pages);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_reclaimed += sc.nr_reclaimed;
total_scanned += sc.nr_scanned;
if (zone->all_unreclaimable)
continue;
- if (zone->pages_scanned >= (zone->nr_active +
- zone->nr_inactive) * 4)
+ if (nr_slab == 0 && zone->pages_scanned >=
+ (zone->nr_active + zone->nr_inactive) * 4)
zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
@@ -1309,3 +1326,73 @@ static int __init kswapd_init(void)
}
module_init(kswapd_init)
+
+
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order)
+{
+ struct scan_control sc;
+ int nr_pages = 1 << order;
+ int total_reclaimed = 0;
+
+ /* The reclaim may sleep, so don't do it if sleep isn't allowed */
+ if (!(gfp_mask & __GFP_WAIT))
+ return 0;
+ if (zone->all_unreclaimable)
+ return 0;
+
+ sc.gfp_mask = gfp_mask;
+ sc.may_writepage = 0;
+ sc.may_swap = 0;
+ sc.nr_mapped = read_page_state(nr_mapped);
+ sc.nr_scanned = 0;
+ sc.nr_reclaimed = 0;
+ /* scan at the highest priority */
+ sc.priority = 0;
+
+ if (nr_pages > SWAP_CLUSTER_MAX)
+ sc.swap_cluster_max = nr_pages;
+ else
+ sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+ /* Don't reclaim the zone if there are other reclaimers active */
+ if (!atomic_inc_and_test(&zone->reclaim_in_progress))
+ goto out;
+
+ shrink_zone(zone, &sc);
+ total_reclaimed = sc.nr_reclaimed;
+
+ out:
+ atomic_dec(&zone->reclaim_in_progress);
+ return total_reclaimed;
+}
+
+asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
+ unsigned int state)
+{
+ struct zone *z;
+ int i;
+
+ if (node >= MAX_NUMNODES || !node_online(node))
+ return -EINVAL;
+
+ /* This will break if we ever add more zones */
+ if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
+ return -EINVAL;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (!(zone & 1<<i))
+ continue;
+
+ z = &NODE_DATA(node)->node_zones[i];
+
+ if (state)
+ z->reclaim_pages = 1;
+ else
+ z->reclaim_pages = 0;
+ }
+
+ return 0;
+}