aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds2014-04-07 16:38:06 -0700
committerLinus Torvalds2014-04-07 16:38:06 -0700
commit26c12d93348f0bda0756aff83f4867d9ae58a5a6 (patch)
tree65221f6837c66a9260c5c973e5fb908b10e0d504 /mm
parentdc5ed40686a4da95881c35d913b60f867755cbe2 (diff)
parentfdc5813fbbd484a54c88477f91a78934cda8bb32 (diff)
Merge branch 'akpm' (incoming from Andrew)
Merge second patch-bomb from Andrew Morton: - the rest of MM - zram updates - zswap updates - exit - procfs - exec - wait - crash dump - lib/idr - rapidio - adfs, affs, bfs, ufs - cris - Kconfig things - initramfs - small amount of IPC material - percpu enhancements - early ioremap support - various other misc things * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (156 commits) MAINTAINERS: update Intel C600 SAS driver maintainers fs/ufs: remove unused ufs_super_block_third pointer fs/ufs: remove unused ufs_super_block_second pointer fs/ufs: remove unused ufs_super_block_first pointer fs/ufs/super.c: add __init to init_inodecache() doc/kernel-parameters.txt: add early_ioremap_debug arm64: add early_ioremap support arm64: initialize pgprot info earlier in boot x86: use generic early_ioremap mm: create generic early_ioremap() support x86/mm: sparse warning fix for early_memremap lglock: map to spinlock when !CONFIG_SMP percpu: add preemption checks to __this_cpu ops vmstat: use raw_cpu_ops to avoid false positives on preemption checks slub: use raw_cpu_inc for incrementing statistics net: replace __this_cpu_inc in route.c with raw_cpu_inc modules: use raw_cpu_write for initialization of per cpu refcount. mm: use raw_cpu ops for determining current NUMA node percpu: add raw_cpu_ops slub: fix leak of 'name' in sysfs_slab_add ...
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig4
-rw-r--r--mm/Makefile3
-rw-r--r--mm/compaction.c84
-rw-r--r--mm/early_ioremap.c245
-rw-r--r--mm/filemap.c86
-rw-r--r--mm/huge_memory.c21
-rw-r--r--mm/hugetlb.c14
-rw-r--r--mm/internal.h16
-rw-r--r--mm/memblock.c28
-rw-r--r--mm/memcontrol.c453
-rw-r--r--mm/memory.c147
-rw-r--r--mm/mempolicy.c46
-rw-r--r--mm/mempool.c4
-rw-r--r--mm/mlock.c2
-rw-r--r--mm/mmap.c55
-rw-r--r--mm/mprotect.c56
-rw-r--r--mm/nommu.c49
-rw-r--r--mm/page-writeback.c4
-rw-r--r--mm/page_alloc.c118
-rw-r--r--mm/readahead.c21
-rw-r--r--mm/rmap.c14
-rw-r--r--mm/shmem.c7
-rw-r--r--mm/slab.c8
-rw-r--r--mm/slab.h21
-rw-r--r--mm/slab_common.c250
-rw-r--r--mm/slub.c87
-rw-r--r--mm/sparse.c4
-rw-r--r--mm/util.c5
-rw-r--r--mm/vmacache.c112
-rw-r--r--mm/vmalloc.c10
-rw-r--r--mm/vmscan.c12
-rw-r--r--mm/zswap.c78
32 files changed, 1342 insertions, 722 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 2888024e0b0a..ebe5880c29d6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -216,6 +216,7 @@ config PAGEFLAGS_EXTENDED
#
config SPLIT_PTLOCK_CPUS
int
+ default "999999" if !MMU
default "999999" if ARM && !CPU_CACHE_VIPT
default "999999" if PARISC && !PA20
default "4"
@@ -577,3 +578,6 @@ config PGTABLE_MAPPING
You can check speed with zsmalloc benchmark:
https://github.com/spartacus06/zsmapbench
+
+config GENERIC_EARLY_IOREMAP
+ bool
diff --git a/mm/Makefile b/mm/Makefile
index cdd741519ee0..9e5aaf92197d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
- compaction.o balloon_compaction.o \
+ compaction.o balloon_compaction.o vmacache.o \
interval_tree.o list_lru.o workingset.o $(mmu-y)
obj-y += init-mm.o
@@ -61,3 +61,4 @@ obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZBUD) += zbud.o
obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
+obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
diff --git a/mm/compaction.c b/mm/compaction.c
index b6ab77160068..37f976287068 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -217,21 +217,12 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct page *page)
{
- int migratetype = get_pageblock_migratetype(page);
-
- /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
- if (migratetype == MIGRATE_RESERVE)
- return false;
-
- if (is_migrate_isolate(migratetype))
- return false;
-
- /* If the page is a large free page, then allow migration */
+ /* If the page is a large free page, then disallow migration */
if (PageBuddy(page) && page_order(page) >= pageblock_order)
- return true;
+ return false;
/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
- if (migrate_async_suitable(migratetype))
+ if (migrate_async_suitable(get_pageblock_migratetype(page)))
return true;
/* Otherwise skip the block */
@@ -253,6 +244,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
struct page *cursor, *valid_page = NULL;
unsigned long flags;
bool locked = false;
+ bool checked_pageblock = false;
cursor = pfn_to_page(blockpfn);
@@ -284,8 +276,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
break;
/* Recheck this is a suitable migration target under lock */
- if (!strict && !suitable_migration_target(page))
- break;
+ if (!strict && !checked_pageblock) {
+ /*
+ * We need to check suitability of pageblock only once
+ * and this isolate_freepages_block() is called with
+ * pageblock range, so just check once is sufficient.
+ */
+ checked_pageblock = true;
+ if (!suitable_migration_target(page))
+ break;
+ }
/* Recheck this is a buddy page under lock */
if (!PageBuddy(page))
@@ -460,12 +460,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
unsigned long last_pageblock_nr = 0, pageblock_nr;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
- isolate_mode_t mode = 0;
struct lruvec *lruvec;
unsigned long flags;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
bool skipped_async_unsuitable = false;
+ const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
+ (unevictable ? ISOLATE_UNEVICTABLE : 0);
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -487,7 +488,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
cond_resched();
for (; low_pfn < end_pfn; low_pfn++) {
/* give a chance to irqs before checking need_resched() */
- if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
+ if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
if (should_release_lock(&zone->lru_lock)) {
spin_unlock_irqrestore(&zone->lru_lock, flags);
locked = false;
@@ -526,8 +527,25 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
/* If isolation recently failed, do not retry */
pageblock_nr = low_pfn >> pageblock_order;
- if (!isolation_suitable(cc, page))
- goto next_pageblock;
+ if (last_pageblock_nr != pageblock_nr) {
+ int mt;
+
+ last_pageblock_nr = pageblock_nr;
+ if (!isolation_suitable(cc, page))
+ goto next_pageblock;
+
+ /*
+ * For async migration, also only scan in MOVABLE
+ * blocks. Async migration is optimistic to see if
+ * the minimum amount of work satisfies the allocation
+ */
+ mt = get_pageblock_migratetype(page);
+ if (!cc->sync && !migrate_async_suitable(mt)) {
+ cc->finished_update_migrate = true;
+ skipped_async_unsuitable = true;
+ goto next_pageblock;
+ }
+ }
/*
* Skip if free. page_order cannot be used without zone->lock
@@ -537,18 +555,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
continue;
/*
- * For async migration, also only scan in MOVABLE blocks. Async
- * migration is optimistic to see if the minimum amount of work
- * satisfies the allocation
- */
- if (!cc->sync && last_pageblock_nr != pageblock_nr &&
- !migrate_async_suitable(get_pageblock_migratetype(page))) {
- cc->finished_update_migrate = true;
- skipped_async_unsuitable = true;
- goto next_pageblock;
- }
-
- /*
* Check may be lockless but that's ok as we recheck later.
* It's possible to migrate LRU pages and balloon pages
* Skip any other type of page
@@ -557,11 +563,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
if (unlikely(balloon_page_movable(page))) {
if (locked && balloon_page_isolate(page)) {
/* Successfully isolated */
- cc->finished_update_migrate = true;
- list_add(&page->lru, migratelist);
- cc->nr_migratepages++;
- nr_isolated++;
- goto check_compact_cluster;
+ goto isolate_success;
}
}
continue;
@@ -607,12 +609,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
continue;
}
- if (!cc->sync)
- mode |= ISOLATE_ASYNC_MIGRATE;
-
- if (unevictable)
- mode |= ISOLATE_UNEVICTABLE;
-
lruvec = mem_cgroup_page_lruvec(page, zone);
/* Try isolate the page */
@@ -622,13 +618,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
VM_BUG_ON_PAGE(PageTransCompound(page), page);
/* Successfully isolated */
- cc->finished_update_migrate = true;
del_page_from_lru_list(page, lruvec, page_lru(page));
+
+isolate_success:
+ cc->finished_update_migrate = true;
list_add(&page->lru, migratelist);
cc->nr_migratepages++;
nr_isolated++;
-check_compact_cluster:
/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
++low_pfn;
@@ -639,7 +636,6 @@ check_compact_cluster:
next_pageblock:
low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
- last_pageblock_nr = pageblock_nr;
}
acct_isolated(zone, locked, cc);
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
new file mode 100644
index 000000000000..e10ccd299d66
--- /dev/null
+++ b/mm/early_ioremap.c
@@ -0,0 +1,245 @@
+/*
+ * Provide common bits of early_ioremap() support for architectures needing
+ * temporary mappings during boot before ioremap() is available.
+ *
+ * This is mostly a direct copy of the x86 early_ioremap implementation.
+ *
+ * (C) Copyright 1995 1996, 2014 Linus Torvalds
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <asm/fixmap.h>
+
+#ifdef CONFIG_MMU
+static int early_ioremap_debug __initdata;
+
+static int __init early_ioremap_debug_setup(char *str)
+{
+ early_ioremap_debug = 1;
+
+ return 0;
+}
+early_param("early_ioremap_debug", early_ioremap_debug_setup);
+
+static int after_paging_init __initdata;
+
+void __init __weak early_ioremap_shutdown(void)
+{
+}
+
+void __init early_ioremap_reset(void)
+{
+ early_ioremap_shutdown();
+ after_paging_init = 1;
+}
+
+/*
+ * Generally, ioremap() is available after paging_init() has been called.
+ * Architectures wanting to allow early_ioremap after paging_init() can
+ * define __late_set_fixmap and __late_clear_fixmap to do the right thing.
+ */
+#ifndef __late_set_fixmap
+static inline void __init __late_set_fixmap(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t prot)
+{
+ BUG();
+}
+#endif
+
+#ifndef __late_clear_fixmap
+static inline void __init __late_clear_fixmap(enum fixed_addresses idx)
+{
+ BUG();
+}
+#endif
+
+static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
+
+void __init early_ioremap_setup(void)
+{
+ int i;
+
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ if (WARN_ON(prev_map[i]))
+ break;
+
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+}
+
+static int __init check_early_ioremap_leak(void)
+{
+ int count = 0;
+ int i;
+
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ if (prev_map[i])
+ count++;
+
+ if (WARN(count, KERN_WARNING
+ "Debug warning: early ioremap leak of %d areas detected.\n"
+ "please boot with early_ioremap_debug and report the dmesg.\n",
+ count))
+ return 1;
+ return 0;
+}
+late_initcall(check_early_ioremap_leak);
+
+static void __init __iomem *
+__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
+{
+ unsigned long offset;
+ resource_size_t last_addr;
+ unsigned int nrpages;
+ enum fixed_addresses idx;
+ int i, slot;
+
+ WARN_ON(system_state != SYSTEM_BOOTING);
+
+ slot = -1;
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ if (!prev_map[i]) {
+ slot = i;
+ break;
+ }
+ }
+
+ if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n",
+ __func__, (u64)phys_addr, size))
+ return NULL;
+
+ /* Don't allow wraparound or zero size */
+ last_addr = phys_addr + size - 1;
+ if (WARN_ON(!size || last_addr < phys_addr))
+ return NULL;
+
+ prev_size[slot] = size;
+ /*
+ * Mappings have to be page-aligned
+ */
+ offset = phys_addr & ~PAGE_MASK;
+ phys_addr &= PAGE_MASK;
+ size = PAGE_ALIGN(last_addr + 1) - phys_addr;
+
+ /*
+ * Mappings have to fit in the FIX_BTMAP area.
+ */
+ nrpages = size >> PAGE_SHIFT;
+ if (WARN_ON(nrpages > NR_FIX_BTMAPS))
+ return NULL;
+
+ /*
+ * Ok, go for it..
+ */
+ idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
+ while (nrpages > 0) {
+ if (after_paging_init)
+ __late_set_fixmap(idx, phys_addr, prot);
+ else
+ __early_set_fixmap(idx, phys_addr, prot);
+ phys_addr += PAGE_SIZE;
+ --idx;
+ --nrpages;
+ }
+ WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n",
+ __func__, (u64)phys_addr, size, slot, offset, slot_virt[slot]);
+
+ prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
+ return prev_map[slot];
+}
+
+void __init early_iounmap(void __iomem *addr, unsigned long size)
+{
+ unsigned long virt_addr;
+ unsigned long offset;
+ unsigned int nrpages;
+ enum fixed_addresses idx;
+ int i, slot;
+
+ slot = -1;
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ if (prev_map[i] == addr) {
+ slot = i;
+ break;
+ }
+ }
+
+ if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n",
+ addr, size))
+ return;
+
+ if (WARN(prev_size[slot] != size,
+ "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
+ addr, size, slot, prev_size[slot]))
+ return;
+
+ WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n",
+ addr, size, slot);
+
+ virt_addr = (unsigned long)addr;
+ if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
+ return;
+
+ offset = virt_addr & ~PAGE_MASK;
+ nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
+
+ idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
+ while (nrpages > 0) {
+ if (after_paging_init)
+ __late_clear_fixmap(idx);
+ else
+ __early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR);
+ --idx;
+ --nrpages;
+ }
+ prev_map[slot] = NULL;
+}
+
+/* Remap an IO device */
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
+{
+ return __early_ioremap(phys_addr, size, FIXMAP_PAGE_IO);
+}
+
+/* Remap memory */
+void __init *
+early_memremap(resource_size_t phys_addr, unsigned long size)
+{
+ return (__force void *)__early_ioremap(phys_addr, size,
+ FIXMAP_PAGE_NORMAL);
+}
+#else /* CONFIG_MMU */
+
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
+{
+ return (__force void __iomem *)phys_addr;
+}
+
+/* Remap memory */
+void __init *
+early_memremap(resource_size_t phys_addr, unsigned long size)
+{
+ return (void *)phys_addr;
+}
+
+void __init early_iounmap(void __iomem *addr, unsigned long size)
+{
+}
+
+#endif /* CONFIG_MMU */
+
+
+void __init early_memunmap(void *addr, unsigned long size)
+{
+ early_iounmap((__force void __iomem *)addr, size);
+}
diff --git a/mm/filemap.c b/mm/filemap.c
index 21781f1fe52b..27ebc0c9571b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
+#include <linux/rmap.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -562,7 +563,7 @@ static int __add_to_page_cache_locked(struct page *page,
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
- error = mem_cgroup_cache_charge(page, current->mm,
+ error = mem_cgroup_charge_file(page, current->mm,
gfp_mask & GFP_RECLAIM_MASK);
if (error)
return error;
@@ -1952,11 +1953,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
struct page *page;
- pgoff_t size;
+ loff_t size;
int ret = 0;
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (offset >= size)
+ size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
+ if (offset >= size >> PAGE_CACHE_SHIFT)
return VM_FAULT_SIGBUS;
/*
@@ -2005,8 +2006,8 @@ retry_find:
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (unlikely(offset >= size)) {
+ size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
+ if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) {
unlock_page(page);
page_cache_release(page);
return VM_FAULT_SIGBUS;
@@ -2064,6 +2065,78 @@ page_not_uptodate:
}
EXPORT_SYMBOL(filemap_fault);
+void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ loff_t size;
+ struct page *page;
+ unsigned long address = (unsigned long) vmf->virtual_address;
+ unsigned long addr;
+ pte_t *pte;
+
+ rcu_read_lock();
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) {
+ if (iter.index > vmf->max_pgoff)
+ break;
+repeat:
+ page = radix_tree_deref_slot(slot);
+ if (unlikely(!page))
+ goto next;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page))
+ break;
+ else
+ goto next;
+ }
+
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *slot)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+
+ if (!PageUptodate(page) ||
+ PageReadahead(page) ||
+ PageHWPoison(page))
+ goto skip;
+ if (!trylock_page(page))
+ goto skip;
+
+ if (page->mapping != mapping || !PageUptodate(page))
+ goto unlock;
+
+ size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE);
+ if (page->index >= size >> PAGE_CACHE_SHIFT)
+ goto unlock;
+
+ pte = vmf->pte + page->index - vmf->pgoff;
+ if (!pte_none(*pte))
+ goto unlock;
+
+ if (file->f_ra.mmap_miss > 0)
+ file->f_ra.mmap_miss--;
+ addr = address + (page->index - vmf->pgoff) * PAGE_SIZE;
+ do_set_pte(vma, addr, page, pte, false, false);
+ unlock_page(page);
+ goto next;
+unlock:
+ unlock_page(page);
+skip:
+ page_cache_release(page);
+next:
+ if (iter.index == vmf->max_pgoff)
+ break;
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(filemap_map_pages);
+
int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *page = vmf->page;
@@ -2093,6 +2166,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = filemap_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6ac89e9f82ef..64635f5278ff 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -827,7 +827,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+ if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -968,7 +968,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
__GFP_OTHER_NODE,
vma, address, page_to_nid(page));
if (unlikely(!pages[i] ||
- mem_cgroup_newpage_charge(pages[i], mm,
+ mem_cgroup_charge_anon(pages[i], mm,
GFP_KERNEL))) {
if (pages[i])
put_page(pages[i]);
@@ -1101,7 +1101,7 @@ alloc:
goto out;
}
- if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+ if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) {
put_page(new_page);
if (page) {
split_huge_page(page);
@@ -1891,17 +1891,22 @@ out:
int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
{
- struct mm_struct *mm = vma->vm_mm;
-
switch (advice) {
case MADV_HUGEPAGE:
+#ifdef CONFIG_S390
+ /*
+ * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
+ * can't handle this properly after s390_enable_sie, so we simply
+ * ignore the madvise to prevent qemu from causing a SIGSEGV.
+ */
+ if (mm_has_pgste(vma->vm_mm))
+ return 0;
+#endif
/*
* Be somewhat over-protective like KSM for now!
*/
if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
return -EINVAL;
- if (mm->def_flags & VM_NOHUGEPAGE)
- return -EINVAL;
*vm_flags &= ~VM_NOHUGEPAGE;
*vm_flags |= VM_HUGEPAGE;
/*
@@ -2354,7 +2359,7 @@ static void collapse_huge_page(struct mm_struct *mm,
if (!new_page)
return;
- if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
+ if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)))
return;
/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7c02b9dadfb0..dd30f22b35e0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -13,6 +13,7 @@
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
+#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/bootmem.h>
@@ -1535,6 +1536,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
while (min_count < persistent_huge_pages(h)) {
if (!free_pool_huge_page(h, nodes_allowed, 0))
break;
+ cond_resched_lock(&hugetlb_lock);
}
while (count < persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, nodes_allowed, 1))
@@ -2690,7 +2692,8 @@ retry_avoidcopy:
BUG_ON(huge_pte_none(pte));
spin_lock(ptl);
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
- if (likely(pte_same(huge_ptep_get(ptep), pte)))
+ if (likely(ptep &&
+ pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
/*
* race occurs while re-acquiring page table
@@ -2734,7 +2737,7 @@ retry_avoidcopy:
*/
spin_lock(ptl);
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
- if (likely(pte_same(huge_ptep_get(ptep), pte))) {
+ if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
ClearPagePrivate(new_page);
/* Break COW */
@@ -2896,8 +2899,7 @@ retry:
if (anon_rmap) {
ClearPagePrivate(page);
hugepage_add_new_anon_rmap(page, vma, address);
- }
- else
+ } else
page_dup_rmap(page);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
@@ -3185,6 +3187,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
+ mmu_notifier_invalidate_range_start(mm, start, end);
mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
for (; address < end; address += huge_page_size(h)) {
spinlock_t *ptl;
@@ -3214,6 +3217,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
*/
flush_tlb_range(vma, start, end);
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ mmu_notifier_invalidate_range_end(mm, start, end);
return pages << h->order;
}
@@ -3518,7 +3522,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
/* Can be overriden by architectures */
-__attribute__((weak)) struct page *
+struct page * __weak
follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int write)
{
diff --git a/mm/internal.h b/mm/internal.h
index 29e1e761f9eb..07b67361a40a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,6 +11,7 @@
#ifndef __MM_INTERNAL_H
#define __MM_INTERNAL_H
+#include <linux/fs.h>
#include <linux/mm.h>
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
@@ -21,6 +22,20 @@ static inline void set_page_count(struct page *page, int v)
atomic_set(&page->_count, v);
}
+extern int __do_page_cache_readahead(struct address_space *mapping,
+ struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+ unsigned long lookahead_size);
+
+/*
+ * Submit IO for the read-ahead request in file_ra_state.
+ */
+static inline unsigned long ra_submit(struct file_ra_state *ra,
+ struct address_space *mapping, struct file *filp)
+{
+ return __do_page_cache_readahead(mapping, filp,
+ ra->start, ra->size, ra->async_size);
+}
+
/*
* Turn a non-refcounted page (->_count == 0) into refcounted with
* a count of one.
@@ -370,5 +385,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
+#define ALLOC_FAIR 0x100 /* fair zone allocation */
#endif /* __MM_INTERNAL_H */
diff --git a/mm/memblock.c b/mm/memblock.c
index 7fe5354e7552..e9d6ca9a01a9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1253,7 +1253,7 @@ phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
pages += end_pfn - start_pfn;
}
- return (phys_addr_t)pages << PAGE_SHIFT;
+ return PFN_PHYS(pages);
}
/* lowest address */
@@ -1271,16 +1271,14 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void)
void __init memblock_enforce_memory_limit(phys_addr_t limit)
{
- unsigned long i;
phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
+ struct memblock_region *r;
if (!limit)
return;
/* find out max address */
- for (i = 0; i < memblock.memory.cnt; i++) {
- struct memblock_region *r = &memblock.memory.regions[i];
-
+ for_each_memblock(memory, r) {
if (limit <= r->size) {
max_addr = r->base + limit;
break;
@@ -1326,7 +1324,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
unsigned long *start_pfn, unsigned long *end_pfn)
{
struct memblock_type *type = &memblock.memory;
- int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT);
+ int mid = memblock_search(type, PFN_PHYS(pfn));
if (mid == -1)
return -1;
@@ -1379,13 +1377,12 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si
void __init_memblock memblock_trim_memory(phys_addr_t align)
{
- int i;
phys_addr_t start, end, orig_start, orig_end;
- struct memblock_type *mem = &memblock.memory;
+ struct memblock_region *r;
- for (i = 0; i < mem->cnt; i++) {
- orig_start = mem->regions[i].base;
- orig_end = mem->regions[i].base + mem->regions[i].size;
+ for_each_memblock(memory, r) {
+ orig_start = r->base;
+ orig_end = r->base + r->size;
start = round_up(orig_start, align);
end = round_down(orig_end, align);
@@ -1393,11 +1390,12 @@ void __init_memblock memblock_trim_memory(phys_addr_t align)
continue;
if (start < end) {
- mem->regions[i].base = start;
- mem->regions[i].size = end - start;
+ r->base = start;
+ r->size = end - start;
} else {
- memblock_remove_region(mem, i);
- i--;
+ memblock_remove_region(&memblock.memory,
+ r - memblock.memory.regions);
+ r--;
}
}
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dcc8153a1681..29501f040568 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -921,8 +921,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
struct page *page,
bool anon, int nr_pages)
{
- preempt_disable();
-
/*
* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
* counted as CACHE even if it's on ANON LRU.
@@ -947,8 +945,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
}
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
-
- preempt_enable();
}
unsigned long
@@ -1075,22 +1071,15 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
-struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
struct mem_cgroup *memcg = NULL;
- if (!mm)
- return NULL;
- /*
- * Because we have no locks, mm->owner's may be being moved to other
- * cgroup. We use css_tryget() here even if this looks
- * pessimistic (rather than adding locks here).
- */
rcu_read_lock();
do {
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
if (unlikely(!memcg))
- break;
+ memcg = root_mem_cgroup;
} while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
@@ -1486,7 +1475,7 @@ bool task_in_mem_cgroup(struct task_struct *task,
p = find_lock_task_mm(task);
if (p) {
- curr = try_get_mem_cgroup_from_mm(p->mm);
+ curr = get_mem_cgroup_from_mm(p->mm);
task_unlock(p);
} else {
/*
@@ -1500,8 +1489,6 @@ bool task_in_mem_cgroup(struct task_struct *task,
css_get(&curr->css);
rcu_read_unlock();
}
- if (!curr)
- return false;
/*
* We should check use_hierarchy of "memcg" not "curr". Because checking
* use_hierarchy of "curr" here make this function true if hierarchy is
@@ -2588,7 +2575,7 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
}
-/* See __mem_cgroup_try_charge() for details */
+/* See mem_cgroup_try_charge() for details */
enum {
CHARGE_OK, /* success */
CHARGE_RETRY, /* need to retry but retry is not bad */
@@ -2661,45 +2648,34 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
return CHARGE_NOMEM;
}
-/*
- * __mem_cgroup_try_charge() does
- * 1. detect memcg to be charged against from passed *mm and *ptr,
- * 2. update res_counter
- * 3. call memory reclaim if necessary.
- *
- * In some special case, if the task is fatal, fatal_signal_pending() or
- * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup
- * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
- * as possible without any hazards. 2: all pages should have a valid
- * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
- * pointer, that is treated as a charge to root_mem_cgroup.
- *
- * So __mem_cgroup_try_charge() will return
- * 0 ... on success, filling *ptr with a valid memcg pointer.
- * -ENOMEM ... charge failure because of resource limits.
- * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup.
+/**
+ * mem_cgroup_try_charge - try charging a memcg
+ * @memcg: memcg to charge
+ * @nr_pages: number of pages to charge
+ * @oom: trigger OOM if reclaim fails
*
- * Unlike the exported interface, an "oom" parameter is added. if oom==true,
- * the oom-killer can be invoked.
+ * Returns 0 if @memcg was charged successfully, -EINTR if the charge
+ * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
*/
-static int __mem_cgroup_try_charge(struct mm_struct *mm,
- gfp_t gfp_mask,
- unsigned int nr_pages,
- struct mem_cgroup **ptr,
- bool oom)
+static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
+ gfp_t gfp_mask,
+ unsigned int nr_pages,
+ bool oom)
{
unsigned int batch = max(CHARGE_BATCH, nr_pages);
int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct mem_cgroup *memcg = NULL;
int ret;
+ if (mem_cgroup_is_root(memcg))
+ goto done;
/*
- * Unlike gloval-vm's OOM-kill, we're not in memory shortage
- * in system level. So, allow to go ahead dying process in addition to
- * MEMDIE process.
+ * Unlike in global OOM situations, memcg is not in a physical
+ * memory shortage. Allow dying and OOM-killed tasks to
+ * bypass the last charges so that they can exit quickly and
+ * free their memory.
*/
- if (unlikely(test_thread_flag(TIF_MEMDIE)
- || fatal_signal_pending(current)))
+ if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+ fatal_signal_pending(current)))
goto bypass;
if (unlikely(task_in_memcg_oom(current)))
@@ -2707,73 +2683,16 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
if (gfp_mask & __GFP_NOFAIL)
oom = false;
-
- /*
- * We always charge the cgroup the mm_struct belongs to.
- * The mm_struct's mem_cgroup changes on task migration if the
- * thread group leader migrates. It's possible that mm is not
- * set, if so charge the root memcg (happens for pagecache usage).
- */
- if (!*ptr && !mm)
- *ptr = root_mem_cgroup;
again:
- if (*ptr) { /* css should be a valid one */
- memcg = *ptr;
- if (mem_cgroup_is_root(memcg))
- goto done;
- if (consume_stock(memcg, nr_pages))
- goto done;
- css_get(&memcg->css);
- } else {
- struct task_struct *p;
-
- rcu_read_lock();
- p = rcu_dereference(mm->owner);
- /*
- * Because we don't have task_lock(), "p" can exit.
- * In that case, "memcg" can point to root or p can be NULL with
- * race with swapoff. Then, we have small risk of mis-accouning.
- * But such kind of mis-account by race always happens because
- * we don't have cgroup_mutex(). It's overkill and we allo that
- * small race, here.
- * (*) swapoff at el will charge against mm-struct not against
- * task-struct. So, mm->owner can be NULL.
- */
- memcg = mem_cgroup_from_task(p);
- if (!memcg)
- memcg = root_mem_cgroup;
- if (mem_cgroup_is_root(memcg)) {
- rcu_read_unlock();
- goto done;
- }
- if (consume_stock(memcg, nr_pages)) {
- /*
- * It seems dagerous to access memcg without css_get().
- * But considering how consume_stok works, it's not
- * necessary. If consume_stock success, some charges
- * from this memcg are cached on this cpu. So, we
- * don't need to call css_get()/css_tryget() before
- * calling consume_stock().
- */
- rcu_read_unlock();
- goto done;
- }
- /* after here, we may be blocked. we need to get refcnt */
- if (!css_tryget(&memcg->css)) {
- rcu_read_unlock();
- goto again;
- }
- rcu_read_unlock();
- }
+ if (consume_stock(memcg, nr_pages))
+ goto done;
do {
bool invoke_oom = oom && !nr_oom_retries;
/* If killed, bypass charge */
- if (fatal_signal_pending(current)) {
- css_put(&memcg->css);
+ if (fatal_signal_pending(current))
goto bypass;
- }
ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
nr_pages, invoke_oom);
@@ -2782,17 +2701,12 @@ again:
break;
case CHARGE_RETRY: /* not in OOM situation but retry */
batch = nr_pages;
- css_put(&memcg->css);
- memcg = NULL;
goto again;
case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
- css_put(&memcg->css);
goto nomem;
case CHARGE_NOMEM: /* OOM routine works */
- if (!oom || invoke_oom) {
- css_put(&memcg->css);
+ if (!oom || invoke_oom)
goto nomem;
- }
nr_oom_retries--;
break;
}
@@ -2800,20 +2714,44 @@ again:
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
- css_put(&memcg->css);
done:
- *ptr = memcg;
return 0;
nomem:
- if (!(gfp_mask & __GFP_NOFAIL)) {
- *ptr = NULL;
+ if (!(gfp_mask & __GFP_NOFAIL))
return -ENOMEM;
- }
bypass:
- *ptr = root_mem_cgroup;
return -EINTR;
}
+/**
+ * mem_cgroup_try_charge_mm - try charging a mm
+ * @mm: mm_struct to charge
+ * @nr_pages: number of pages to charge
+ * @oom: trigger OOM if reclaim fails
+ *
+ * Returns the charged mem_cgroup associated with the given mm_struct or
+ * NULL the charge failed.
+ */
+static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
+ gfp_t gfp_mask,
+ unsigned int nr_pages,
+ bool oom)
+
+{
+ struct mem_cgroup *memcg;
+ int ret;
+
+ memcg = get_mem_cgroup_from_mm(mm);
+ ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
+ css_put(&memcg->css);
+ if (ret == -EINTR)
+ memcg = root_mem_cgroup;
+ else if (ret)
+ memcg = NULL;
+
+ return memcg;
+}
+
/*
* Somemtimes we have to undo a charge we got by try_charge().
* This function is for that and do uncharge, put css's refcnt.
@@ -3009,20 +2947,17 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
{
struct res_counter *fail_res;
- struct mem_cgroup *_memcg;
int ret = 0;
ret = res_counter_charge(&memcg->kmem, size, &fail_res);
if (ret)
return ret;
- _memcg = memcg;
- ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
- &_memcg, oom_gfp_allowed(gfp));
-
+ ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT,
+ oom_gfp_allowed(gfp));
if (ret == -EINTR) {
/*
- * __mem_cgroup_try_charge() chosed to bypass to root due to
+ * mem_cgroup_try_charge() chosed to bypass to root due to
* OOM kill or fatal signal. Since our only options are to
* either fail the allocation or charge it to this cgroup, do
* it as a temporary condition. But we can't fail. From a
@@ -3032,7 +2967,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
*
* This condition will only trigger if the task entered
* memcg_charge_kmem in a sane state, but was OOM-killed during
- * __mem_cgroup_try_charge() above. Tasks that were already
+ * mem_cgroup_try_charge() above. Tasks that were already
* dying when the allocation triggers should have been already
* directed to the root cgroup in memcontrol.h
*/
@@ -3159,6 +3094,29 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
return 0;
}
+char *memcg_create_cache_name(struct mem_cgroup *memcg,
+ struct kmem_cache *root_cache)
+{
+ static char *buf = NULL;
+
+ /*
+ * We need a mutex here to protect the shared buffer. Since this is
+ * expected to be called only on cache creation, we can employ the
+ * slab_mutex for that purpose.
+ */
+ lockdep_assert_held(&slab_mutex);
+
+ if (!buf) {
+ buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!buf)
+ return NULL;
+ }
+
+ cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1);
+ return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
+ memcg_cache_id(memcg), buf);
+}
+
int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
struct kmem_cache *root_cache)
{
@@ -3182,6 +3140,7 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
s->memcg_params->root_cache = root_cache;
INIT_WORK(&s->memcg_params->destroy,
kmem_cache_destroy_work_func);
+ css_get(&memcg->css);
} else
s->memcg_params->is_root_cache = true;
@@ -3190,6 +3149,10 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
void memcg_free_cache_params(struct kmem_cache *s)
{
+ if (!s->memcg_params)
+ return;
+ if (!s->memcg_params->is_root_cache)
+ css_put(&s->memcg_params->memcg->css);
kfree(s->memcg_params);
}
@@ -3212,9 +3175,6 @@ void memcg_register_cache(struct kmem_cache *s)
memcg = s->memcg_params->memcg;
id = memcg_cache_id(memcg);
- css_get(&memcg->css);
-
-
/*
* Since readers won't lock (see cache_from_memcg_idx()), we need a
* barrier here to ensure nobody will see the kmem_cache partially
@@ -3263,10 +3223,8 @@ void memcg_unregister_cache(struct kmem_cache *s)
* after removing it from the memcg_slab_caches list, otherwise we can
* fail to convert memcg_params_to_cache() while traversing the list.
*/
- VM_BUG_ON(!root->memcg_params->memcg_caches[id]);
+ VM_BUG_ON(root->memcg_params->memcg_caches[id] != s);
root->memcg_params->memcg_caches[id] = NULL;
-
- css_put(&memcg->css);
}
/*
@@ -3363,55 +3321,10 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
schedule_work(&cachep->memcg_params->destroy);
}
-static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
- struct kmem_cache *s)
-{
- struct kmem_cache *new = NULL;
- static char *tmp_path = NULL, *tmp_name = NULL;
- static DEFINE_MUTEX(mutex); /* protects tmp_name */
-
- BUG_ON(!memcg_can_account_kmem(memcg));
-
- mutex_lock(&mutex);
- /*
- * kmem_cache_create_memcg duplicates the given name and
- * cgroup_name for this name requires RCU context.
- * This static temporary buffer is used to prevent from
- * pointless shortliving allocation.
- */
- if (!tmp_path || !tmp_name) {
- if (!tmp_path)
- tmp_path = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!tmp_name)
- tmp_name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
- if (!tmp_path || !tmp_name)
- goto out;
- }
-
- cgroup_name(memcg->css.cgroup, tmp_name, NAME_MAX + 1);
- snprintf(tmp_path, PATH_MAX, "%s(%d:%s)", s->name,
- memcg_cache_id(memcg), tmp_name);
-
- new = kmem_cache_create_memcg(memcg, tmp_path, s->object_size, s->align,
- (s->flags & ~SLAB_PANIC), s->ctor, s);
- if (new)
- new->allocflags |= __GFP_KMEMCG;
- else
- new = s;
-out:
- mutex_unlock(&mutex);
- return new;
-}
-
-void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
+int __kmem_cache_destroy_memcg_children(struct kmem_cache *s)
{
struct kmem_cache *c;
- int i;
-
- if (!s->memcg_params)
- return;
- if (!s->memcg_params->is_root_cache)
- return;
+ int i, failed = 0;
/*
* If the cache is being destroyed, we trust that there is no one else
@@ -3445,16 +3358,14 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
c->memcg_params->dead = false;
cancel_work_sync(&c->memcg_params->destroy);
kmem_cache_destroy(c);
+
+ if (cache_from_memcg_idx(s, i))
+ failed++;
}
mutex_unlock(&activate_kmem_mutex);
+ return failed;
}
-struct create_work {
- struct mem_cgroup *memcg;
- struct kmem_cache *cachep;
- struct work_struct work;
-};
-
static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
{
struct kmem_cache *cachep;
@@ -3472,13 +3383,20 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
mutex_unlock(&memcg->slab_caches_mutex);
}
+struct create_work {
+ struct mem_cgroup *memcg;
+ struct kmem_cache *cachep;
+ struct work_struct work;
+};
+
static void memcg_create_cache_work_func(struct work_struct *w)
{
- struct create_work *cw;
+ struct create_work *cw = container_of(w, struct create_work, work);
+ struct mem_cgroup *memcg = cw->memcg;
+ struct kmem_cache *cachep = cw->cachep;
- cw = container_of(w, struct create_work, work);
- memcg_create_kmem_cache(cw->memcg, cw->cachep);
- css_put(&cw->memcg->css);
+ kmem_cache_create_memcg(memcg, cachep);
+ css_put(&memcg->css);
kfree(cw);
}
@@ -3637,15 +3555,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
if (!current->mm || current->memcg_kmem_skip_account)
return true;
- memcg = try_get_mem_cgroup_from_mm(current->mm);
-
- /*
- * very rare case described in mem_cgroup_from_task. Unfortunately there
- * isn't much we can do without complicating this too much, and it would
- * be gfp-dependent anyway. Just let it go
- */
- if (unlikely(!memcg))
- return true;
+ memcg = get_mem_cgroup_from_mm(current->mm);
if (!memcg_can_account_kmem(memcg)) {
css_put(&memcg->css);
@@ -3748,19 +3658,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static inline
-void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
- struct mem_cgroup *to,
- unsigned int nr_pages,
- enum mem_cgroup_stat_index idx)
-{
- /* Update stat data for mem_cgroup */
- preempt_disable();
- __this_cpu_sub(from->stat->count[idx], nr_pages);
- __this_cpu_add(to->stat->count[idx], nr_pages);
- preempt_enable();
-}
-
/**
* mem_cgroup_move_account - move account of the page
* @page: the page
@@ -3806,13 +3703,19 @@ static int mem_cgroup_move_account(struct page *page,
move_lock_mem_cgroup(from, &flags);
- if (!anon && page_mapped(page))
- mem_cgroup_move_account_page_stat(from, to, nr_pages,
- MEM_CGROUP_STAT_FILE_MAPPED);
+ if (!anon && page_mapped(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ }
- if (PageWriteback(page))
- mem_cgroup_move_account_page_stat(from, to, nr_pages,
- MEM_CGROUP_STAT_WRITEBACK);
+ if (PageWriteback(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ }
mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
@@ -3898,19 +3801,19 @@ out:
return ret;
}
-/*
- * Charge the memory controller for page usage.
- * Return
- * 0 if the charge was successful
- * < 0 if the cgroup is over its limit
- */
-static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, enum charge_type ctype)
+int mem_cgroup_charge_anon(struct page *page,
+ struct mm_struct *mm, gfp_t gfp_mask)
{
- struct mem_cgroup *memcg = NULL;
unsigned int nr_pages = 1;
+ struct mem_cgroup *memcg;
bool oom = true;
- int ret;
+
+ if (mem_cgroup_disabled())
+ return 0;
+
+ VM_BUG_ON_PAGE(page_mapped(page), page);
+ VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
+ VM_BUG_ON(!mm);
if (PageTransHuge(page)) {
nr_pages <<= compound_order(page);
@@ -3922,25 +3825,14 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
oom = false;
}
- ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
- if (ret == -ENOMEM)
- return ret;
- __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
+ memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom);
+ if (!memcg)
+ return -ENOMEM;
+ __mem_cgroup_commit_charge(memcg, page, nr_pages,
+ MEM_CGROUP_CHARGE_TYPE_ANON, false);
return 0;
}
-int mem_cgroup_newpage_charge(struct page *page,
- struct mm_struct *mm, gfp_t gfp_mask)
-{
- if (mem_cgroup_disabled())
- return 0;
- VM_BUG_ON_PAGE(page_mapped(page), page);
- VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
- VM_BUG_ON(!mm);
- return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_ANON);
-}
-
/*
* While swap-in, try_charge -> commit or cancel, the page is locked.
* And when try_charge() successfully returns, one refcnt to memcg without
@@ -3952,7 +3844,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
gfp_t mask,
struct mem_cgroup **memcgp)
{
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg = NULL;
struct page_cgroup *pc;
int ret;
@@ -3965,31 +3857,29 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
* in turn serializes uncharging.
*/
if (PageCgroupUsed(pc))
- return 0;
- if (!do_swap_account)
- goto charge_cur_mm;
- memcg = try_get_mem_cgroup_from_page(page);
+ goto out;
+ if (do_swap_account)
+ memcg = try_get_mem_cgroup_from_page(page);
if (!memcg)
- goto charge_cur_mm;
- *memcgp = memcg;
- ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
+ memcg = get_mem_cgroup_from_mm(mm);
+ ret = mem_cgroup_try_charge(memcg, mask, 1, true);
css_put(&memcg->css);
if (ret == -EINTR)
- ret = 0;
- return ret;
-charge_cur_mm:
- ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
- if (ret == -EINTR)
- ret = 0;
- return ret;
+ memcg = root_mem_cgroup;
+ else if (ret)
+ return ret;
+out:
+ *memcgp = memcg;
+ return 0;
}
int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
gfp_t gfp_mask, struct mem_cgroup **memcgp)
{
- *memcgp = NULL;
- if (mem_cgroup_disabled())
+ if (mem_cgroup_disabled()) {
+ *memcgp = NULL;
return 0;
+ }
/*
* A racing thread's fault, or swapoff, may have already
* updated the pte, and even removed page from swap cache: in
@@ -3997,12 +3887,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
* there's also a KSM case which does need to charge the page.
*/
if (!PageSwapCache(page)) {
- int ret;
+ struct mem_cgroup *memcg;
- ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
- if (ret == -EINTR)
- ret = 0;
- return ret;
+ memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
+ if (!memcg)
+ return -ENOMEM;
+ *memcgp = memcg;
+ return 0;
}
return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
}
@@ -4046,11 +3937,11 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
MEM_CGROUP_CHARGE_TYPE_ANON);
}
-int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
+int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
{
- struct mem_cgroup *memcg = NULL;
enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
+ struct mem_cgroup *memcg;
int ret;
if (mem_cgroup_disabled())
@@ -4058,15 +3949,28 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
if (PageCompound(page))
return 0;
- if (!PageSwapCache(page))
- ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
- else { /* page is swapcache/shmem */
+ if (PageSwapCache(page)) { /* shmem */
ret = __mem_cgroup_try_charge_swapin(mm, page,
gfp_mask, &memcg);
- if (!ret)
- __mem_cgroup_commit_charge_swapin(page, memcg, type);
+ if (ret)
+ return ret;
+ __mem_cgroup_commit_charge_swapin(page, memcg, type);
+ return 0;
}
- return ret;
+
+ /*
+ * Page cache insertions can happen without an actual mm
+ * context, e.g. during disk probing on boot.
+ */
+ if (unlikely(!mm))
+ memcg = root_mem_cgroup;
+ else {
+ memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
+ if (!memcg)
+ return -ENOMEM;
+ }
+ __mem_cgroup_commit_charge(memcg, page, 1, type, false);
+ return 0;
}
static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
@@ -6678,8 +6582,7 @@ one_by_one:
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
}
- ret = __mem_cgroup_try_charge(NULL,
- GFP_KERNEL, 1, &memcg, false);
+ ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false);
if (ret)
/* mem_cgroup_clear_mc() will do uncharge later */
return ret;
diff --git a/mm/memory.c b/mm/memory.c
index 82c1e4cf00d1..d0f0bef3be48 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -60,6 +60,7 @@
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/dma-debug.h>
+#include <linux/debugfs.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
@@ -1320,9 +1321,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* It is undesirable to test vma->vm_file as it
* should be non-null for valid hugetlb area.
* However, vm_file will be NULL in the error
- * cleanup path of do_mmap_pgoff. When
+ * cleanup path of mmap_region. When
* hugetlbfs ->mmap method fails,
- * do_mmap_pgoff() nullifies vma->vm_file
+ * mmap_region() nullifies vma->vm_file
* before calling this function to clean up.
* Since no pte has actually been setup, it is
* safe to do nothing in this case.
@@ -2781,7 +2782,7 @@ reuse:
*/
if (!page_mkwrite) {
wait_on_page_locked(dirty_page);
- set_page_dirty_balance(dirty_page, page_mkwrite);
+ set_page_dirty_balance(dirty_page);
/* file_update_time outside page_lock */
if (vma->vm_file)
file_update_time(vma->vm_file);
@@ -2827,7 +2828,7 @@ gotten:
}
__SetPageUptodate(new_page);
- if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
+ if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
goto oom_free_new;
mmun_start = address & PAGE_MASK;
@@ -3280,7 +3281,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
__SetPageUptodate(page);
- if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
+ if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL))
goto oom_free_page;
entry = mk_pte(page, vma->vm_page_prot);
@@ -3342,7 +3343,22 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
return ret;
}
-static void do_set_pte(struct vm_area_struct *vma, unsigned long address,
+/**
+ * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
+ *
+ * @vma: virtual memory area
+ * @address: user virtual address
+ * @page: page to map
+ * @pte: pointer to target page table entry
+ * @write: true, if new entry is writable
+ * @anon: true, if it's anonymous page
+ *
+ * Caller must hold page table lock relevant for @pte.
+ *
+ * Target users are page handler itself and implementations of
+ * vm_ops->map_pages.
+ */
+void do_set_pte(struct vm_area_struct *vma, unsigned long address,
struct page *page, pte_t *pte, bool write, bool anon)
{
pte_t entry;
@@ -3366,6 +3382,105 @@ static void do_set_pte(struct vm_area_struct *vma, unsigned long address,
update_mmu_cache(vma, address, pte);
}
+#define FAULT_AROUND_ORDER 4
+
+#ifdef CONFIG_DEBUG_FS
+static unsigned int fault_around_order = FAULT_AROUND_ORDER;
+
+static int fault_around_order_get(void *data, u64 *val)
+{
+ *val = fault_around_order;
+ return 0;
+}
+
+static int fault_around_order_set(void *data, u64 val)
+{
+ BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE);
+ if (1UL << val > PTRS_PER_PTE)
+ return -EINVAL;
+ fault_around_order = val;
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops,
+ fault_around_order_get, fault_around_order_set, "%llu\n");
+
+static int __init fault_around_debugfs(void)
+{
+ void *ret;
+
+ ret = debugfs_create_file("fault_around_order", 0644, NULL, NULL,
+ &fault_around_order_fops);
+ if (!ret)
+ pr_warn("Failed to create fault_around_order in debugfs");
+ return 0;
+}
+late_initcall(fault_around_debugfs);
+
+static inline unsigned long fault_around_pages(void)
+{
+ return 1UL << fault_around_order;
+}
+
+static inline unsigned long fault_around_mask(void)
+{
+ return ~((1UL << (PAGE_SHIFT + fault_around_order)) - 1);
+}
+#else
+static inline unsigned long fault_around_pages(void)
+{
+ unsigned long nr_pages;
+
+ nr_pages = 1UL << FAULT_AROUND_ORDER;
+ BUILD_BUG_ON(nr_pages > PTRS_PER_PTE);
+ return nr_pages;
+}
+
+static inline unsigned long fault_around_mask(void)
+{
+ return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1);
+}
+#endif
+
+static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, pgoff_t pgoff, unsigned int flags)
+{
+ unsigned long start_addr;
+ pgoff_t max_pgoff;
+ struct vm_fault vmf;
+ int off;
+
+ start_addr = max(address & fault_around_mask(), vma->vm_start);
+ off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+ pte -= off;
+ pgoff -= off;
+
+ /*
+ * max_pgoff is either end of page table or end of vma
+ * or fault_around_pages() from pgoff, depending what is neast.
+ */
+ max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+ PTRS_PER_PTE - 1;
+ max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
+ pgoff + fault_around_pages() - 1);
+
+ /* Check if it makes any sense to call ->map_pages */
+ while (!pte_none(*pte)) {
+ if (++pgoff > max_pgoff)
+ return;
+ start_addr += PAGE_SIZE;
+ if (start_addr >= vma->vm_end)
+ return;
+ pte++;
+ }
+
+ vmf.virtual_address = (void __user *) start_addr;
+ vmf.pte = pte;
+ vmf.pgoff = pgoff;
+ vmf.max_pgoff = max_pgoff;
+ vmf.flags = flags;
+ vma->vm_ops->map_pages(vma, &vmf);
+}
+
static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
@@ -3373,7 +3488,20 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *fault_page;
spinlock_t *ptl;
pte_t *pte;
- int ret;
+ int ret = 0;
+
+ /*
+ * Let's call ->map_pages() first and use ->fault() as fallback
+ * if page by the offset is not ready to be mapped (cold cache or
+ * something).
+ */
+ if (vma->vm_ops->map_pages) {
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ do_fault_around(vma, address, pte, pgoff, flags);
+ if (!pte_same(*pte, orig_pte))
+ goto unlock_out;
+ pte_unmap_unlock(pte, ptl);
+ }
ret = __do_fault(vma, address, pgoff, flags, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -3387,8 +3515,9 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return ret;
}
do_set_pte(vma, address, fault_page, pte, false, false);
- pte_unmap_unlock(pte, ptl);
unlock_page(fault_page);
+unlock_out:
+ pte_unmap_unlock(pte, ptl);
return ret;
}
@@ -3408,7 +3537,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (!new_page)
return VM_FAULT_OOM;
- if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) {
+ if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) {
page_cache_release(new_page);
return VM_FAULT_OOM;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e3ab02822799..78e1472933ea 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -795,36 +795,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
return err;
}
-/*
- * Update task->flags PF_MEMPOLICY bit: set iff non-default
- * mempolicy. Allows more rapid checking of this (combined perhaps
- * with other PF_* flag bits) on memory allocation hot code paths.
- *
- * If called from outside this file, the task 'p' should -only- be
- * a newly forked child not yet visible on the task list, because
- * manipulating the task flags of a visible task is not safe.
- *
- * The above limitation is why this routine has the funny name
- * mpol_fix_fork_child_flag().
- *
- * It is also safe to call this with a task pointer of current,
- * which the static wrapper mpol_set_task_struct_flag() does,
- * for use within this file.
- */
-
-void mpol_fix_fork_child_flag(struct task_struct *p)
-{
- if (p->mempolicy)
- p->flags |= PF_MEMPOLICY;
- else
- p->flags &= ~PF_MEMPOLICY;
-}
-
-static void mpol_set_task_struct_flag(void)
-{
- mpol_fix_fork_child_flag(current);
-}
-
/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
@@ -861,7 +831,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
}
old = current->mempolicy;
current->mempolicy = new;
- mpol_set_task_struct_flag();
if (new && new->mode == MPOL_INTERLEAVE &&
nodes_weight(new->v.nodes))
current->il_next = first_node(new->v.nodes);
@@ -1782,21 +1751,18 @@ static unsigned interleave_nodes(struct mempolicy *policy)
/*
* Depending on the memory policy provide a node from which to allocate the
* next slab entry.
- * @policy must be protected by freeing by the caller. If @policy is
- * the current task's mempolicy, this protection is implicit, as only the
- * task can change it's policy. The system default policy requires no
- * such protection.
*/
-unsigned slab_node(void)
+unsigned int mempolicy_slab_node(void)
{
struct mempolicy *policy;
+ int node = numa_mem_id();
if (in_interrupt())
- return numa_node_id();
+ return node;
policy = current->mempolicy;
if (!policy || policy->flags & MPOL_F_LOCAL)
- return numa_node_id();
+ return node;
switch (policy->mode) {
case MPOL_PREFERRED:
@@ -1816,11 +1782,11 @@ unsigned slab_node(void)
struct zonelist *zonelist;
struct zone *zone;
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
- zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
+ zonelist = &NODE_DATA(node)->node_zonelists[0];
(void)first_zones_zonelist(zonelist, highest_zoneidx,
&policy->v.nodes,
&zone);
- return zone ? zone->node : numa_node_id();
+ return zone ? zone->node : node;
}
default:
diff --git a/mm/mempool.c b/mm/mempool.c
index 659aa42bad16..905434f18c97 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -304,9 +304,9 @@ void mempool_free(void *element, mempool_t *pool)
* ensures that there will be frees which return elements to the
* pool waking up the waiters.
*/
- if (pool->curr_nr < pool->min_nr) {
+ if (unlikely(pool->curr_nr < pool->min_nr)) {
spin_lock_irqsave(&pool->lock, flags);
- if (pool->curr_nr < pool->min_nr) {
+ if (likely(pool->curr_nr < pool->min_nr)) {
add_element(pool, element);
spin_unlock_irqrestore(&pool->lock, flags);
wake_up(&pool->wait);
diff --git a/mm/mlock.c b/mm/mlock.c
index 4e1a68162285..b1eb53634005 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -79,6 +79,7 @@ void clear_page_mlock(struct page *page)
*/
void mlock_vma_page(struct page *page)
{
+ /* Serialize with page migration */
BUG_ON(!PageLocked(page));
if (!TestSetPageMlocked(page)) {
@@ -174,6 +175,7 @@ unsigned int munlock_vma_page(struct page *page)
unsigned int nr_pages;
struct zone *zone = page_zone(page);
+ /* For try_to_munlock() and to serialize with page migration */
BUG_ON(!PageLocked(page));
/*
diff --git a/mm/mmap.c b/mm/mmap.c
index 46433e137abc..b1202cf81f4b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -10,6 +10,7 @@
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
@@ -681,8 +682,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
prev->vm_next = next = vma->vm_next;
if (next)
next->vm_prev = prev;
- if (mm->mmap_cache == vma)
- mm->mmap_cache = prev;
+
+ /* Kill the cache */
+ vmacache_invalidate(mm);
}
/*
@@ -1989,34 +1991,33 @@ EXPORT_SYMBOL(get_unmapped_area);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct *vma = NULL;
+ struct rb_node *rb_node;
+ struct vm_area_struct *vma;
/* Check the cache first. */
- /* (Cache hit rate is typically around 35%.) */
- vma = ACCESS_ONCE(mm->mmap_cache);
- if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
- struct rb_node *rb_node;
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
+ return vma;
- rb_node = mm->mm_rb.rb_node;
- vma = NULL;
+ rb_node = mm->mm_rb.rb_node;
+ vma = NULL;
- while (rb_node) {
- struct vm_area_struct *vma_tmp;
-
- vma_tmp = rb_entry(rb_node,
- struct vm_area_struct, vm_rb);
-
- if (vma_tmp->vm_end > addr) {
- vma = vma_tmp;
- if (vma_tmp->vm_start <= addr)
- break;
- rb_node = rb_node->rb_left;
- } else
- rb_node = rb_node->rb_right;
- }
- if (vma)
- mm->mmap_cache = vma;
+ while (rb_node) {
+ struct vm_area_struct *tmp;
+
+ tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+ if (tmp->vm_end > addr) {
+ vma = tmp;
+ if (tmp->vm_start <= addr)
+ break;
+ rb_node = rb_node->rb_left;
+ } else
+ rb_node = rb_node->rb_right;
}
+
+ if (vma)
+ vmacache_update(addr, vma);
return vma;
}
@@ -2388,7 +2389,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
} else
mm->highest_vm_end = prev ? prev->vm_end : 0;
tail_vma->vm_next = NULL;
- mm->mmap_cache = NULL; /* Kill the cache. */
+
+ /* Kill the cache */
+ vmacache_invalidate(mm);
}
/*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 769a67a15803..c43d557941f8 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -36,6 +36,34 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
}
#endif
+/*
+ * For a prot_numa update we only hold mmap_sem for read so there is a
+ * potential race with faulting where a pmd was temporarily none. This
+ * function checks for a transhuge pmd under the appropriate lock. It
+ * returns a pte if it was successfully locked or NULL if it raced with
+ * a transhuge insertion.
+ */
+static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, int prot_numa, spinlock_t **ptl)
+{
+ pte_t *pte;
+ spinlock_t *pmdl;
+
+ /* !prot_numa is protected by mmap_sem held for write */
+ if (!prot_numa)
+ return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
+
+ pmdl = pmd_lock(vma->vm_mm, pmd);
+ if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) {
+ spin_unlock(pmdl);
+ return NULL;
+ }
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
+ spin_unlock(pmdl);
+ return pte;
+}
+
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable, int prot_numa)
@@ -45,7 +73,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
spinlock_t *ptl;
unsigned long pages = 0;
- pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
+ if (!pte)
+ return 0;
+
arch_enter_lazy_mmu_mode();
do {
oldpte = *pte;
@@ -109,15 +140,26 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pgprot_t newprot, int dirty_accountable, int prot_numa)
{
pmd_t *pmd;
+ struct mm_struct *mm = vma->vm_mm;
unsigned long next;
unsigned long pages = 0;
unsigned long nr_huge_updates = 0;
+ unsigned long mni_start = 0;
pmd = pmd_offset(pud, addr);
do {
unsigned long this_pages;
next = pmd_addr_end(addr, end);
+ if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd))
+ continue;
+
+ /* invoke the mmu notifier if the pmd is populated */
+ if (!mni_start) {
+ mni_start = addr;
+ mmu_notifier_invalidate_range_start(mm, mni_start, end);
+ }
+
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
split_huge_page_pmd(vma, addr, pmd);
@@ -130,18 +172,21 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pages += HPAGE_PMD_NR;
nr_huge_updates++;
}
+
+ /* huge pmd was handled */
continue;
}
}
- /* fall through */
+ /* fall through, the trans huge pmd just split */
}
- if (pmd_none_or_clear_bad(pmd))
- continue;
this_pages = change_pte_range(vma, pmd, addr, next, newprot,
dirty_accountable, prot_numa);
pages += this_pages;
} while (pmd++, addr = next, addr != end);
+ if (mni_start)
+ mmu_notifier_invalidate_range_end(mm, mni_start, end);
+
if (nr_huge_updates)
count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
return pages;
@@ -201,15 +246,12 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgprot_t newprot,
int dirty_accountable, int prot_numa)
{
- struct mm_struct *mm = vma->vm_mm;
unsigned long pages;
- mmu_notifier_invalidate_range_start(mm, start, end);
if (is_vm_hugetlb_page(vma))
pages = hugetlb_change_protection(vma, start, end, newprot);
else
pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
- mmu_notifier_invalidate_range_end(mm, start, end);
return pages;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index a554e5a451cd..85f8d6698d48 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -15,6 +15,7 @@
#include <linux/export.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/file.h>
@@ -24,6 +25,7 @@
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/compiler.h>
#include <linux/mount.h>
#include <linux/personality.h>
#include <linux/security.h>
@@ -296,7 +298,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
count = -(unsigned long) addr;
memcpy(addr, buf, count);
- return(count);
+ return count;
}
/*
@@ -459,7 +461,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases);
* Implement a stub for vmalloc_sync_all() if the architecture chose not to
* have one.
*/
-void __attribute__((weak)) vmalloc_sync_all(void)
+void __weak vmalloc_sync_all(void)
{
}
@@ -768,16 +770,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
*/
static void delete_vma_from_mm(struct vm_area_struct *vma)
{
+ int i;
struct address_space *mapping;
struct mm_struct *mm = vma->vm_mm;
+ struct task_struct *curr = current;
kenter("%p", vma);
protect_vma(vma, 0);
mm->map_count--;
- if (mm->mmap_cache == vma)
- mm->mmap_cache = NULL;
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ /* if the vma is cached, invalidate the entire cache */
+ if (curr->vmacache[i] == vma) {
+ vmacache_invalidate(curr->mm);
+ break;
+ }
+ }
/* remove the VMA from the mapping */
if (vma->vm_file) {
@@ -825,8 +834,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
struct vm_area_struct *vma;
/* check the cache first */
- vma = ACCESS_ONCE(mm->mmap_cache);
- if (vma && vma->vm_start <= addr && vma->vm_end > addr)
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
return vma;
/* trawl the list (there may be multiple mappings in which addr
@@ -835,7 +844,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
if (vma->vm_start > addr)
return NULL;
if (vma->vm_end > addr) {
- mm->mmap_cache = vma;
+ vmacache_update(addr, vma);
return vma;
}
}
@@ -874,8 +883,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
unsigned long end = addr + len;
/* check the cache first */
- vma = mm->mmap_cache;
- if (vma && vma->vm_start == addr && vma->vm_end == end)
+ vma = vmacache_find_exact(mm, addr, end);
+ if (vma)
return vma;
/* trawl the list (there may be multiple mappings in which addr
@@ -886,7 +895,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
if (vma->vm_start > addr)
return NULL;
if (vma->vm_end == end) {
- mm->mmap_cache = vma;
+ vmacache_update(addr, vma);
return vma;
}
}
@@ -1003,8 +1012,7 @@ static int validate_mmap_request(struct file *file,
/* we mustn't privatise shared mappings */
capabilities &= ~BDI_CAP_MAP_COPY;
- }
- else {
+ } else {
/* we're going to read the file into private memory we
* allocate */
if (!(capabilities & BDI_CAP_MAP_COPY))
@@ -1035,23 +1043,20 @@ static int validate_mmap_request(struct file *file,
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
if (prot & PROT_EXEC)
return -EPERM;
- }
- else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
+ } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
/* handle implication of PROT_EXEC by PROT_READ */
if (current->personality & READ_IMPLIES_EXEC) {
if (capabilities & BDI_CAP_EXEC_MAP)
prot |= PROT_EXEC;
}
- }
- else if ((prot & PROT_READ) &&
+ } else if ((prot & PROT_READ) &&
(prot & PROT_EXEC) &&
!(capabilities & BDI_CAP_EXEC_MAP)
) {
/* backing file is not executable, try to copy */
capabilities &= ~BDI_CAP_MAP_DIRECT;
}
- }
- else {
+ } else {
/* anonymous mappings are always memory backed and can be
* privately mapped
*/
@@ -1659,7 +1664,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
/* find the first potentially overlapping VMA */
vma = find_vma(mm, start);
if (!vma) {
- static int limit = 0;
+ static int limit;
if (limit < 5) {
printk(KERN_WARNING
"munmap of memory not mmapped by process %d"
@@ -1985,6 +1990,12 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);
+void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ BUG();
+}
+EXPORT_SYMBOL(filemap_map_pages);
+
int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
unsigned long size, pgoff_t pgoff)
{
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7106cb1aca8e..ef413492a149 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1562,9 +1562,9 @@ pause:
bdi_start_background_writeback(bdi);
}
-void set_page_dirty_balance(struct page *page, int page_mkwrite)
+void set_page_dirty_balance(struct page *page)
{
- if (set_page_dirty(page) || page_mkwrite) {
+ if (set_page_dirty(page)) {
struct address_space *mapping = page_mapping(page);
if (mapping)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 979378deccbf..5dba2933c9c0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -295,7 +295,8 @@ static inline int bad_range(struct zone *zone, struct page *page)
}
#endif
-static void bad_page(struct page *page, char *reason, unsigned long bad_flags)
+static void bad_page(struct page *page, const char *reason,
+ unsigned long bad_flags)
{
static unsigned long resume;
static unsigned long nr_shown;
@@ -623,7 +624,7 @@ out:
static inline int free_pages_check(struct page *page)
{
- char *bad_reason = NULL;
+ const char *bad_reason = NULL;
unsigned long bad_flags = 0;
if (unlikely(page_mapcount(page)))
@@ -859,7 +860,7 @@ static inline void expand(struct zone *zone, struct page *page,
*/
static inline int check_new_page(struct page *page)
{
- char *bad_reason = NULL;
+ const char *bad_reason = NULL;
unsigned long bad_flags = 0;
if (unlikely(page_mapcount(page)))
@@ -1238,15 +1239,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
}
local_irq_restore(flags);
}
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
- return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
-}
-#else
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
- return false;
-}
#endif
/*
@@ -1583,12 +1575,7 @@ again:
get_pageblock_migratetype(page));
}
- /*
- * NOTE: GFP_THISNODE allocations do not partake in the kswapd
- * aging protocol, so they can't be fair.
- */
- if (!gfp_thisnode_allocation(gfp_flags))
- __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+ __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1870,7 +1857,7 @@ static void __paginginit init_zone_allows_reclaim(int nid)
{
int i;
- for_each_online_node(i)
+ for_each_node_state(i, N_MEMORY)
if (node_distance(nid, i) <= RECLAIM_DISTANCE)
node_set(i, NODE_DATA(nid)->reclaim_nodes);
else
@@ -1954,23 +1941,12 @@ zonelist_scan:
* zone size to ensure fair page aging. The zone a
* page was allocated in should have no effect on the
* time the page has in memory before being reclaimed.
- *
- * Try to stay in local zones in the fastpath. If
- * that fails, the slowpath is entered, which will do
- * another pass starting with the local zones, but
- * ultimately fall back to remote zones that do not
- * partake in the fairness round-robin cycle of this
- * zonelist.
- *
- * NOTE: GFP_THISNODE allocations do not partake in
- * the kswapd aging protocol, so they can't be fair.
*/
- if ((alloc_flags & ALLOC_WMARK_LOW) &&
- !gfp_thisnode_allocation(gfp_mask)) {
- if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
- continue;
+ if (alloc_flags & ALLOC_FAIR) {
if (!zone_local(preferred_zone, zone))
continue;
+ if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+ continue;
}
/*
* When allocating a page cache page for writing, we
@@ -2408,32 +2384,40 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
return page;
}
-static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist,
- enum zone_type high_zoneidx,
- struct zone *preferred_zone)
+static void reset_alloc_batches(struct zonelist *zonelist,
+ enum zone_type high_zoneidx,
+ struct zone *preferred_zone)
{
struct zoneref *z;
struct zone *zone;
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
- if (!(gfp_mask & __GFP_NO_KSWAPD))
- wakeup_kswapd(zone, order, zone_idx(preferred_zone));
/*
* Only reset the batches of zones that were actually
- * considered in the fast path, we don't want to
- * thrash fairness information for zones that are not
+ * considered in the fairness pass, we don't want to
+ * trash fairness information for zones that are not
* actually part of this zonelist's round-robin cycle.
*/
if (!zone_local(preferred_zone, zone))
continue;
mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) -
- low_wmark_pages(zone) -
- zone_page_state(zone, NR_ALLOC_BATCH));
+ high_wmark_pages(zone) - low_wmark_pages(zone) -
+ atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
}
}
+static void wake_all_kswapds(unsigned int order,
+ struct zonelist *zonelist,
+ enum zone_type high_zoneidx,
+ struct zone *preferred_zone)
+{
+ struct zoneref *z;
+ struct zone *zone;
+
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+ wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+}
+
static inline int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
@@ -2522,12 +2506,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
* allowed per node queues are empty and that nodes are
* over allocated.
*/
- if (gfp_thisnode_allocation(gfp_mask))
+ if (IS_ENABLED(CONFIG_NUMA) &&
+ (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
restart:
- prepare_slowpath(gfp_mask, order, zonelist,
- high_zoneidx, preferred_zone);
+ if (!(gfp_mask & __GFP_NO_KSWAPD))
+ wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -2711,7 +2696,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
- int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
+ int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
struct mem_cgroup *memcg = NULL;
gfp_mask &= gfp_allowed_mask;
@@ -2752,12 +2737,29 @@ retry_cpuset:
if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
+retry:
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, alloc_flags,
preferred_zone, migratetype);
if (unlikely(!page)) {
/*
+ * The first pass makes sure allocations are spread
+ * fairly within the local node. However, the local
+ * node might have free pages left after the fairness
+ * batches are exhausted, and remote zones haven't
+ * even been considered yet. Try once more without
+ * fairness, and include remote zones now, before
+ * entering the slowpath and waking kswapd: prefer
+ * spilling to a remote zone over swapping locally.
+ */
+ if (alloc_flags & ALLOC_FAIR) {
+ reset_alloc_batches(zonelist, high_zoneidx,
+ preferred_zone);
+ alloc_flags &= ~ALLOC_FAIR;
+ goto retry;
+ }
+ /*
* Runtime PM, block IO and its error handling path
* can deadlock because I/O on the device might not
* complete.
@@ -4919,7 +4921,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
- init_zone_allows_reclaim(nid);
+ if (node_state(nid, N_MEMORY))
+ init_zone_allows_reclaim(nid);
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
#endif
@@ -5070,7 +5073,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
nodemask_t saved_node_state = node_states[N_MEMORY];
unsigned long totalpages = early_calculate_totalpages();
int usable_nodes = nodes_weight(node_states[N_MEMORY]);
- struct memblock_type *type = &memblock.memory;
+ struct memblock_region *r;
/* Need to find movable_zone earlier when movable_node is specified. */
find_usable_zone_for_movable();
@@ -5080,13 +5083,13 @@ static void __init find_zone_movable_pfns_for_nodes(void)
* options.
*/
if (movable_node_is_enabled()) {
- for (i = 0; i < type->cnt; i++) {
- if (!memblock_is_hotpluggable(&type->regions[i]))
+ for_each_memblock(memory, r) {
+ if (!memblock_is_hotpluggable(r))
continue;
- nid = type->regions[i].nid;
+ nid = r->nid;
- usable_startpfn = PFN_DOWN(type->regions[i].base);
+ usable_startpfn = PFN_DOWN(r->base);
zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
min(usable_startpfn, zone_movable_pfn[nid]) :
usable_startpfn;
@@ -6544,7 +6547,8 @@ static void dump_page_flags(unsigned long flags)
printk(")\n");
}
-void dump_page_badflags(struct page *page, char *reason, unsigned long badflags)
+void dump_page_badflags(struct page *page, const char *reason,
+ unsigned long badflags)
{
printk(KERN_ALERT
"page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
@@ -6560,8 +6564,8 @@ void dump_page_badflags(struct page *page, char *reason, unsigned long badflags)
mem_cgroup_print_bad_page(page);
}
-void dump_page(struct page *page, char *reason)
+void dump_page(struct page *page, const char *reason)
{
dump_page_badflags(page, reason, 0);
}
-EXPORT_SYMBOL_GPL(dump_page);
+EXPORT_SYMBOL(dump_page);
diff --git a/mm/readahead.c b/mm/readahead.c
index 29c5e1af5a0c..0ca36a7770b1 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -8,9 +8,7 @@
*/
#include <linux/kernel.h>
-#include <linux/fs.h>
#include <linux/gfp.h>
-#include <linux/mm.h>
#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
@@ -20,6 +18,8 @@
#include <linux/syscalls.h>
#include <linux/file.h>
+#include "internal.h"
+
/*
* Initialise a struct file's readahead state. Assumes that the caller has
* memset *ra to zero.
@@ -149,8 +149,7 @@ out:
*
* Returns the number of pages requested, or the maximum amount of I/O allowed.
*/
-static int
-__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read,
unsigned long lookahead_size)
{
@@ -244,20 +243,6 @@ unsigned long max_sane_readahead(unsigned long nr)
}
/*
- * Submit IO for the read-ahead request in file_ra_state.
- */
-unsigned long ra_submit(struct file_ra_state *ra,
- struct address_space *mapping, struct file *filp)
-{
- int actual;
-
- actual = __do_page_cache_readahead(mapping, filp,
- ra->start, ra->size, ra->async_size);
-
- return actual;
-}
-
-/*
* Set the initial window size, round to next power of 2 and square
* for small size, x 4 for medium, and x 2 for large
* for 128k (32 page) max ra
diff --git a/mm/rmap.c b/mm/rmap.c
index 11cf322f8133..9c3e77396d1a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1332,9 +1332,19 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
BUG_ON(!page || PageAnon(page));
if (locked_vma) {
- mlock_vma_page(page); /* no-op if already mlocked */
- if (page == check_page)
+ if (page == check_page) {
+ /* we know we have check_page locked */
+ mlock_vma_page(page);
ret = SWAP_MLOCK;
+ } else if (trylock_page(page)) {
+ /*
+ * If we can lock the page, perform mlock.
+ * Otherwise leave the page alone, it will be
+ * eventually encountered again later.
+ */
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
continue; /* don't unmap */
}
diff --git a/mm/shmem.c b/mm/shmem.c
index a3ba988ec946..70273f8df586 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -683,7 +683,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
*/
- error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
+ error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL);
if (error)
goto out;
/* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -1080,7 +1080,7 @@ repeat:
goto failed;
}
- error = mem_cgroup_cache_charge(page, current->mm,
+ error = mem_cgroup_charge_file(page, current->mm,
gfp & GFP_RECLAIM_MASK);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
@@ -1134,7 +1134,7 @@ repeat:
SetPageSwapBacked(page);
__set_page_locked(page);
- error = mem_cgroup_cache_charge(page, current->mm,
+ error = mem_cgroup_charge_file(page, current->mm,
gfp & GFP_RECLAIM_MASK);
if (error)
goto decused;
@@ -2723,6 +2723,7 @@ static const struct super_operations shmem_ops = {
static const struct vm_operations_struct shmem_vm_ops = {
.fault = shmem_fault,
+ .map_pages = filemap_map_pages,
#ifdef CONFIG_NUMA
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
diff --git a/mm/slab.c b/mm/slab.c
index 9153c802e2fe..3db4cb06e32e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3027,7 +3027,7 @@ out:
#ifdef CONFIG_NUMA
/*
- * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
+ * Try allocating on another node if PF_SPREAD_SLAB is a mempolicy is set.
*
* If we are in_interrupt, then process context, including cpusets and
* mempolicy, may not apply and should not be used for allocation policy.
@@ -3042,7 +3042,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
nid_alloc = cpuset_slab_spread_node();
else if (current->mempolicy)
- nid_alloc = slab_node();
+ nid_alloc = mempolicy_slab_node();
if (nid_alloc != nid_here)
return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
@@ -3074,7 +3074,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
- zonelist = node_zonelist(slab_node(), flags);
+ zonelist = node_zonelist(mempolicy_slab_node(), flags);
retry:
/*
@@ -3259,7 +3259,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
{
void *objp;
- if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
+ if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) {
objp = alternate_node_alloc(cache, flags);
if (objp)
goto out;
diff --git a/mm/slab.h b/mm/slab.h
index 8184a7cde272..3045316b7c9d 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -55,12 +55,12 @@ extern void create_boot_cache(struct kmem_cache *, const char *name,
struct mem_cgroup;
#ifdef CONFIG_SLUB
struct kmem_cache *
-__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *));
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *));
#else
static inline struct kmem_cache *
-__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *))
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *))
{ return NULL; }
#endif
@@ -119,13 +119,6 @@ static inline bool is_root_cache(struct kmem_cache *s)
return !s->memcg_params || s->memcg_params->is_root_cache;
}
-static inline bool cache_match_memcg(struct kmem_cache *cachep,
- struct mem_cgroup *memcg)
-{
- return (is_root_cache(cachep) && !memcg) ||
- (cachep->memcg_params->memcg == memcg);
-}
-
static inline void memcg_bind_pages(struct kmem_cache *s, int order)
{
if (!is_root_cache(s))
@@ -204,12 +197,6 @@ static inline bool is_root_cache(struct kmem_cache *s)
return true;
}
-static inline bool cache_match_memcg(struct kmem_cache *cachep,
- struct mem_cgroup *memcg)
-{
- return true;
-}
-
static inline void memcg_bind_pages(struct kmem_cache *s, int order)
{
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 1ec3c619ba04..f3cfccf76dda 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -29,8 +29,7 @@ DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
#ifdef CONFIG_DEBUG_VM
-static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
- size_t size)
+static int kmem_cache_sanity_check(const char *name, size_t size)
{
struct kmem_cache *s = NULL;
@@ -57,13 +56,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
}
#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
- /*
- * For simplicity, we won't check this in the list of memcg
- * caches. We have control over memcg naming, and if there
- * aren't duplicates in the global list, there won't be any
- * duplicates in the memcg lists as well.
- */
- if (!memcg && !strcmp(s->name, name)) {
+ if (!strcmp(s->name, name)) {
pr_err("%s (%s): Cache name already exists.\n",
__func__, name);
dump_stack();
@@ -77,8 +70,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
return 0;
}
#else
-static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
- const char *name, size_t size)
+static inline int kmem_cache_sanity_check(const char *name, size_t size)
{
return 0;
}
@@ -139,6 +131,46 @@ unsigned long calculate_alignment(unsigned long flags,
return ALIGN(align, sizeof(void *));
}
+static struct kmem_cache *
+do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *),
+ struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+{
+ struct kmem_cache *s;
+ int err;
+
+ err = -ENOMEM;
+ s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
+ if (!s)
+ goto out;
+
+ s->name = name;
+ s->object_size = object_size;
+ s->size = size;
+ s->align = align;
+ s->ctor = ctor;
+
+ err = memcg_alloc_cache_params(memcg, s, root_cache);
+ if (err)
+ goto out_free_cache;
+
+ err = __kmem_cache_create(s, flags);
+ if (err)
+ goto out_free_cache;
+
+ s->refcount = 1;
+ list_add(&s->list, &slab_caches);
+ memcg_register_cache(s);
+out:
+ if (err)
+ return ERR_PTR(err);
+ return s;
+
+out_free_cache:
+ memcg_free_cache_params(s);
+ kfree(s);
+ goto out;
+}
/*
* kmem_cache_create - Create a cache.
@@ -164,34 +196,21 @@ unsigned long calculate_alignment(unsigned long flags,
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
*/
-
struct kmem_cache *
-kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *),
- struct kmem_cache *parent_cache)
+kmem_cache_create(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *))
{
- struct kmem_cache *s = NULL;
+ struct kmem_cache *s;
+ char *cache_name;
int err;
get_online_cpus();
mutex_lock(&slab_mutex);
- err = kmem_cache_sanity_check(memcg, name, size);
+ err = kmem_cache_sanity_check(name, size);
if (err)
goto out_unlock;
- if (memcg) {
- /*
- * Since per-memcg caches are created asynchronously on first
- * allocation (see memcg_kmem_get_cache()), several threads can
- * try to create the same cache, but only one of them may
- * succeed. Therefore if we get here and see the cache has
- * already been created, we silently return NULL.
- */
- if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg)))
- goto out_unlock;
- }
-
/*
* Some allocators will constraint the set of valid flags to a subset
* of all flags. We expect them to define CACHE_CREATE_MASK in this
@@ -200,50 +219,29 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
*/
flags &= CACHE_CREATE_MASK;
- s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
+ s = __kmem_cache_alias(name, size, align, flags, ctor);
if (s)
goto out_unlock;
- err = -ENOMEM;
- s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
- if (!s)
+ cache_name = kstrdup(name, GFP_KERNEL);
+ if (!cache_name) {
+ err = -ENOMEM;
goto out_unlock;
+ }
- s->object_size = s->size = size;
- s->align = calculate_alignment(flags, align, size);
- s->ctor = ctor;
-
- s->name = kstrdup(name, GFP_KERNEL);
- if (!s->name)
- goto out_free_cache;
-
- err = memcg_alloc_cache_params(memcg, s, parent_cache);
- if (err)
- goto out_free_cache;
-
- err = __kmem_cache_create(s, flags);
- if (err)
- goto out_free_cache;
-
- s->refcount = 1;
- list_add(&s->list, &slab_caches);
- memcg_register_cache(s);
+ s = do_kmem_cache_create(cache_name, size, size,
+ calculate_alignment(flags, align, size),
+ flags, ctor, NULL, NULL);
+ if (IS_ERR(s)) {
+ err = PTR_ERR(s);
+ kfree(cache_name);
+ }
out_unlock:
mutex_unlock(&slab_mutex);
put_online_cpus();
if (err) {
- /*
- * There is no point in flooding logs with warnings or
- * especially crashing the system if we fail to create a cache
- * for a memcg. In this case we will be accounting the memcg
- * allocation to the root cgroup until we succeed to create its
- * own cache, but it isn't that critical.
- */
- if (!memcg)
- return NULL;
-
if (flags & SLAB_PANIC)
panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
name, err);
@@ -255,52 +253,112 @@ out_unlock:
return NULL;
}
return s;
+}
+EXPORT_SYMBOL(kmem_cache_create);
-out_free_cache:
- memcg_free_cache_params(s);
- kfree(s->name);
- kmem_cache_free(kmem_cache, s);
- goto out_unlock;
+#ifdef CONFIG_MEMCG_KMEM
+/*
+ * kmem_cache_create_memcg - Create a cache for a memory cgroup.
+ * @memcg: The memory cgroup the new cache is for.
+ * @root_cache: The parent of the new cache.
+ *
+ * This function attempts to create a kmem cache that will serve allocation
+ * requests going from @memcg to @root_cache. The new cache inherits properties
+ * from its parent.
+ */
+void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+{
+ struct kmem_cache *s;
+ char *cache_name;
+
+ get_online_cpus();
+ mutex_lock(&slab_mutex);
+
+ /*
+ * Since per-memcg caches are created asynchronously on first
+ * allocation (see memcg_kmem_get_cache()), several threads can try to
+ * create the same cache, but only one of them may succeed.
+ */
+ if (cache_from_memcg_idx(root_cache, memcg_cache_id(memcg)))
+ goto out_unlock;
+
+ cache_name = memcg_create_cache_name(memcg, root_cache);
+ if (!cache_name)
+ goto out_unlock;
+
+ s = do_kmem_cache_create(cache_name, root_cache->object_size,
+ root_cache->size, root_cache->align,
+ root_cache->flags, root_cache->ctor,
+ memcg, root_cache);
+ if (IS_ERR(s)) {
+ kfree(cache_name);
+ goto out_unlock;
+ }
+
+ s->allocflags |= __GFP_KMEMCG;
+
+out_unlock:
+ mutex_unlock(&slab_mutex);
+ put_online_cpus();
}
-struct kmem_cache *
-kmem_cache_create(const char *name, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *))
+static int kmem_cache_destroy_memcg_children(struct kmem_cache *s)
{
- return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
+ int rc;
+
+ if (!s->memcg_params ||
+ !s->memcg_params->is_root_cache)
+ return 0;
+
+ mutex_unlock(&slab_mutex);
+ rc = __kmem_cache_destroy_memcg_children(s);
+ mutex_lock(&slab_mutex);
+
+ return rc;
}
-EXPORT_SYMBOL(kmem_cache_create);
+#else
+static int kmem_cache_destroy_memcg_children(struct kmem_cache *s)
+{
+ return 0;
+}
+#endif /* CONFIG_MEMCG_KMEM */
void kmem_cache_destroy(struct kmem_cache *s)
{
- /* Destroy all the children caches if we aren't a memcg cache */
- kmem_cache_destroy_memcg_children(s);
-
get_online_cpus();
mutex_lock(&slab_mutex);
+
s->refcount--;
- if (!s->refcount) {
- list_del(&s->list);
-
- if (!__kmem_cache_shutdown(s)) {
- memcg_unregister_cache(s);
- mutex_unlock(&slab_mutex);
- if (s->flags & SLAB_DESTROY_BY_RCU)
- rcu_barrier();
-
- memcg_free_cache_params(s);
- kfree(s->name);
- kmem_cache_free(kmem_cache, s);
- } else {
- list_add(&s->list, &slab_caches);
- mutex_unlock(&slab_mutex);
- printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
- s->name);
- dump_stack();
- }
- } else {
- mutex_unlock(&slab_mutex);
+ if (s->refcount)
+ goto out_unlock;
+
+ if (kmem_cache_destroy_memcg_children(s) != 0)
+ goto out_unlock;
+
+ list_del(&s->list);
+ memcg_unregister_cache(s);
+
+ if (__kmem_cache_shutdown(s) != 0) {
+ list_add(&s->list, &slab_caches);
+ memcg_register_cache(s);
+ printk(KERN_ERR "kmem_cache_destroy %s: "
+ "Slab cache still has objects\n", s->name);
+ dump_stack();
+ goto out_unlock;
}
+
+ mutex_unlock(&slab_mutex);
+ if (s->flags & SLAB_DESTROY_BY_RCU)
+ rcu_barrier();
+
+ memcg_free_cache_params(s);
+ kfree(s->name);
+ kmem_cache_free(kmem_cache, s);
+ goto out_put_cpus;
+
+out_unlock:
+ mutex_unlock(&slab_mutex);
+out_put_cpus:
put_online_cpus();
}
EXPORT_SYMBOL(kmem_cache_destroy);
diff --git a/mm/slub.c b/mm/slub.c
index fe6d7be22ef0..f620bbf4054a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -224,7 +224,11 @@ static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
- __this_cpu_inc(s->cpu_slab->stat[si]);
+ /*
+ * The rmw is racy on a preemptible kernel but this is acceptable, so
+ * avoid this_cpu_add()'s irq-disable overhead.
+ */
+ raw_cpu_inc(s->cpu_slab->stat[si]);
#endif
}
@@ -1685,7 +1689,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
do {
cpuset_mems_cookie = read_mems_allowed_begin();
- zonelist = node_zonelist(slab_node(), flags);
+ zonelist = node_zonelist(mempolicy_slab_node(), flags);
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
struct kmem_cache_node *n;
@@ -3685,6 +3689,9 @@ static int slab_unmergeable(struct kmem_cache *s)
if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
return 1;
+ if (!is_root_cache(s))
+ return 1;
+
if (s->ctor)
return 1;
@@ -3697,9 +3704,8 @@ static int slab_unmergeable(struct kmem_cache *s)
return 0;
}
-static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
- size_t align, unsigned long flags, const char *name,
- void (*ctor)(void *))
+static struct kmem_cache *find_mergeable(size_t size, size_t align,
+ unsigned long flags, const char *name, void (*ctor)(void *))
{
struct kmem_cache *s;
@@ -3722,7 +3728,7 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
continue;
if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
- continue;
+ continue;
/*
* Check if alignment is compatible.
* Courtesy of Adrian Drzewiecki
@@ -3733,23 +3739,24 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
if (s->size - size >= sizeof(void *))
continue;
- if (!cache_match_memcg(s, memcg))
- continue;
-
return s;
}
return NULL;
}
struct kmem_cache *
-__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *))
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *s;
- s = find_mergeable(memcg, size, align, flags, name, ctor);
+ s = find_mergeable(size, align, flags, name, ctor);
if (s) {
+ int i;
+ struct kmem_cache *c;
+
s->refcount++;
+
/*
* Adjust the object sizes so that we clear
* the complete object on kzalloc.
@@ -3757,6 +3764,15 @@ __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
s->object_size = max(s->object_size, (int)size);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
+ for_each_memcg_cache_index(i) {
+ c = cache_from_memcg_idx(s, i);
+ if (!c)
+ continue;
+ c->object_size = s->object_size;
+ c->inuse = max_t(int, c->inuse,
+ ALIGN(size, sizeof(void *)));
+ }
+
if (sysfs_slab_alias(s, name)) {
s->refcount--;
s = NULL;
@@ -5126,6 +5142,15 @@ static const struct kset_uevent_ops slab_uevent_ops = {
static struct kset *slab_kset;
+static inline struct kset *cache_kset(struct kmem_cache *s)
+{
+#ifdef CONFIG_MEMCG_KMEM
+ if (!is_root_cache(s))
+ return s->memcg_params->root_cache->memcg_kset;
+#endif
+ return slab_kset;
+}
+
#define ID_STR_LENGTH 64
/* Create a unique string id for a slab cache:
@@ -5191,26 +5216,39 @@ static int sysfs_slab_add(struct kmem_cache *s)
name = create_unique_id(s);
}
- s->kobj.kset = slab_kset;
+ s->kobj.kset = cache_kset(s);
err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
- if (err) {
- kobject_put(&s->kobj);
- return err;
- }
+ if (err)
+ goto out_put_kobj;
err = sysfs_create_group(&s->kobj, &slab_attr_group);
- if (err) {
- kobject_del(&s->kobj);
- kobject_put(&s->kobj);
- return err;
+ if (err)
+ goto out_del_kobj;
+
+#ifdef CONFIG_MEMCG_KMEM
+ if (is_root_cache(s)) {
+ s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
+ if (!s->memcg_kset) {
+ err = -ENOMEM;
+ goto out_del_kobj;
+ }
}
+#endif
+
kobject_uevent(&s->kobj, KOBJ_ADD);
if (!unmergeable) {
/* Setup first alias */
sysfs_slab_alias(s, s->name);
- kfree(name);
}
- return 0;
+out:
+ if (!unmergeable)
+ kfree(name);
+ return err;
+out_del_kobj:
+ kobject_del(&s->kobj);
+out_put_kobj:
+ kobject_put(&s->kobj);
+ goto out;
}
static void sysfs_slab_remove(struct kmem_cache *s)
@@ -5222,6 +5260,9 @@ static void sysfs_slab_remove(struct kmem_cache *s)
*/
return;
+#ifdef CONFIG_MEMCG_KMEM
+ kset_unregister(s->memcg_kset);
+#endif
kobject_uevent(&s->kobj, KOBJ_REMOVE);
kobject_del(&s->kobj);
kobject_put(&s->kobj);
diff --git a/mm/sparse.c b/mm/sparse.c
index 38cad8fd7397..d1b48b691ac8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -5,10 +5,12 @@
#include <linux/slab.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
+#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
+
#include "internal.h"
#include <asm/dma.h>
#include <asm/pgalloc.h>
@@ -461,7 +463,7 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
}
#endif
-void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
+void __weak __meminit vmemmap_populate_print_last(void)
{
}
diff --git a/mm/util.c b/mm/util.c
index a24aa22f2473..d7813e6d4cc7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,6 +1,7 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
@@ -307,7 +308,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
* If the architecture not support this function, simply return with no
* page pinned
*/
-int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
+int __weak __get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
return 0;
@@ -338,7 +339,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
* callers need to carefully consider what to use. On many architectures,
* get_user_pages_fast simply falls back to get_user_pages.
*/
-int __attribute__((weak)) get_user_pages_fast(unsigned long start,
+int __weak get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
struct mm_struct *mm = current->mm;
diff --git a/mm/vmacache.c b/mm/vmacache.c
new file mode 100644
index 000000000000..d4224b397c0e
--- /dev/null
+++ b/mm/vmacache.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C) 2014 Davidlohr Bueso.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
+
+/*
+ * Flush vma caches for threads that share a given mm.
+ *
+ * The operation is safe because the caller holds the mmap_sem
+ * exclusively and other threads accessing the vma cache will
+ * have mmap_sem held at least for read, so no extra locking
+ * is required to maintain the vma cache.
+ */
+void vmacache_flush_all(struct mm_struct *mm)
+{
+ struct task_struct *g, *p;
+
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
+ /*
+ * Only flush the vmacache pointers as the
+ * mm seqnum is already set and curr's will
+ * be set upon invalidation when the next
+ * lookup is done.
+ */
+ if (mm == p->mm)
+ vmacache_flush(p);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * This task may be accessing a foreign mm via (for example)
+ * get_user_pages()->find_vma(). The vmacache is task-local and this
+ * task's vmacache pertains to a different mm (ie, its own). There is
+ * nothing we can do here.
+ *
+ * Also handle the case where a kernel thread has adopted this mm via use_mm().
+ * That kernel thread's vmacache is not applicable to this mm.
+ */
+static bool vmacache_valid_mm(struct mm_struct *mm)
+{
+ return current->mm == mm && !(current->flags & PF_KTHREAD);
+}
+
+void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
+{
+ if (vmacache_valid_mm(newvma->vm_mm))
+ current->vmacache[VMACACHE_HASH(addr)] = newvma;
+}
+
+static bool vmacache_valid(struct mm_struct *mm)
+{
+ struct task_struct *curr;
+
+ if (!vmacache_valid_mm(mm))
+ return false;
+
+ curr = current;
+ if (mm->vmacache_seqnum != curr->vmacache_seqnum) {
+ /*
+ * First attempt will always be invalid, initialize
+ * the new cache for this task here.
+ */
+ curr->vmacache_seqnum = mm->vmacache_seqnum;
+ vmacache_flush(curr);
+ return false;
+ }
+ return true;
+}
+
+struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
+{
+ int i;
+
+ if (!vmacache_valid(mm))
+ return NULL;
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ struct vm_area_struct *vma = current->vmacache[i];
+
+ if (vma && vma->vm_start <= addr && vma->vm_end > addr) {
+ BUG_ON(vma->vm_mm != mm);
+ return vma;
+ }
+ }
+
+ return NULL;
+}
+
+#ifndef CONFIG_MMU
+struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ int i;
+
+ if (!vmacache_valid(mm))
+ return NULL;
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ struct vm_area_struct *vma = current->vmacache[i];
+
+ if (vma && vma->vm_start == start && vma->vm_end == end)
+ return vma;
+ }
+
+ return NULL;
+}
+#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0fdf96803c5b..bf233b283319 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -27,7 +27,9 @@
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
+#include <linux/compiler.h>
#include <linux/llist.h>
+
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
@@ -1083,6 +1085,12 @@ EXPORT_SYMBOL(vm_unmap_ram);
* @node: prefer to allocate data structures on this node
* @prot: memory protection to use. PAGE_KERNEL for regular RAM
*
+ * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
+ * faster than vmap so it's good. But if you mix long-life and short-life
+ * objects with vm_map_ram(), it could consume lots of address space through
+ * fragmentation (especially on a 32bit machine). You could see failures in
+ * the end. Please use this function for short-lived objects.
+ *
* Returns: a pointer to the address that has been mapped, or %NULL on failure
*/
void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
@@ -2181,7 +2189,7 @@ EXPORT_SYMBOL(remap_vmalloc_range);
* Implement a stub for vmalloc_sync_all() if the architecture chose not to
* have one.
*/
-void __attribute__((weak)) vmalloc_sync_all(void)
+void __weak vmalloc_sync_all(void)
{
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1f56a80a7c41..06879ead7380 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2314,15 +2314,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
unsigned long lru_pages = 0;
bool aborted_reclaim = false;
struct reclaim_state *reclaim_state = current->reclaim_state;
+ gfp_t orig_mask;
struct shrink_control shrink = {
.gfp_mask = sc->gfp_mask,
};
+ enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
/*
* If the number of buffer_heads in the machine exceeds the maximum
* allowed level, force direct reclaim to scan the highmem zone as
* highmem pages could be pinning lowmem pages storing buffer_heads
*/
+ orig_mask = sc->gfp_mask;
if (buffer_heads_over_limit)
sc->gfp_mask |= __GFP_HIGHMEM;
@@ -2356,7 +2359,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
* noticeable problem, like transparent huge
* page allocations.
*/
- if (compaction_ready(zone, sc)) {
+ if ((zonelist_zone_idx(z) <= requested_highidx)
+ && compaction_ready(zone, sc)) {
aborted_reclaim = true;
continue;
}
@@ -2393,6 +2397,12 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
}
}
+ /*
+ * Restore to original mask to avoid the impact on the caller if we
+ * promoted it to __GFP_HIGHMEM.
+ */
+ sc->gfp_mask = orig_mask;
+
return aborted_reclaim;
}
diff --git a/mm/zswap.c b/mm/zswap.c
index d7337fbf6605..aeaef0fb5624 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -89,6 +89,9 @@ static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent,
zswap_max_pool_percent, uint, 0644);
+/* zbud_pool is shared by all of zswap backend */
+static struct zbud_pool *zswap_pool;
+
/*********************************
* compression functions
**********************************/
@@ -160,14 +163,14 @@ static void zswap_comp_exit(void)
* rbnode - links the entry into red-black tree for the appropriate swap type
* refcount - the number of outstanding reference to the entry. This is needed
* to protect against premature freeing of the entry by code
- * concurent calls to load, invalidate, and writeback. The lock
+ * concurrent calls to load, invalidate, and writeback. The lock
* for the zswap_tree structure that contains the entry must
* be held while changing the refcount. Since the lock must
* be held, there is no reason to also make refcount atomic.
* offset - the swap offset for the entry. Index into the red-black tree.
- * handle - zsmalloc allocation handle that stores the compressed page data
+ * handle - zbud allocation handle that stores the compressed page data
* length - the length in bytes of the compressed page data. Needed during
- * decompression
+ * decompression
*/
struct zswap_entry {
struct rb_node rbnode;
@@ -189,7 +192,6 @@ struct zswap_header {
struct zswap_tree {
struct rb_root rbroot;
spinlock_t lock;
- struct zbud_pool *pool;
};
static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
@@ -202,7 +204,7 @@ static struct kmem_cache *zswap_entry_cache;
static int zswap_entry_cache_create(void)
{
zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
- return (zswap_entry_cache == NULL);
+ return zswap_entry_cache == NULL;
}
static void zswap_entry_cache_destory(void)
@@ -282,16 +284,15 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
}
/*
- * Carries out the common pattern of freeing and entry's zsmalloc allocation,
+ * Carries out the common pattern of freeing and entry's zbud allocation,
* freeing the entry itself, and decrementing the number of stored pages.
*/
-static void zswap_free_entry(struct zswap_tree *tree,
- struct zswap_entry *entry)
+static void zswap_free_entry(struct zswap_entry *entry)
{
- zbud_free(tree->pool, entry->handle);
+ zbud_free(zswap_pool, entry->handle);
zswap_entry_cache_free(entry);
atomic_dec(&zswap_stored_pages);
- zswap_pool_pages = zbud_get_pool_size(tree->pool);
+ zswap_pool_pages = zbud_get_pool_size(zswap_pool);
}
/* caller must hold the tree lock */
@@ -311,7 +312,7 @@ static void zswap_entry_put(struct zswap_tree *tree,
BUG_ON(refcount < 0);
if (refcount == 0) {
zswap_rb_erase(&tree->rbroot, entry);
- zswap_free_entry(tree, entry);
+ zswap_free_entry(entry);
}
}
@@ -407,8 +408,8 @@ cleanup:
**********************************/
static bool zswap_is_full(void)
{
- return (totalram_pages * zswap_max_pool_percent / 100 <
- zswap_pool_pages);
+ return totalram_pages * zswap_max_pool_percent / 100 <
+ zswap_pool_pages;
}
/*********************************
@@ -545,7 +546,6 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
zbud_unmap(pool, handle);
tree = zswap_trees[swp_type(swpentry)];
offset = swp_offset(swpentry);
- BUG_ON(pool != tree->pool);
/* find and ref zswap entry */
spin_lock(&tree->lock);
@@ -573,13 +573,13 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
case ZSWAP_SWAPCACHE_NEW: /* page is locked */
/* decompress */
dlen = PAGE_SIZE;
- src = (u8 *)zbud_map(tree->pool, entry->handle) +
+ src = (u8 *)zbud_map(zswap_pool, entry->handle) +
sizeof(struct zswap_header);
dst = kmap_atomic(page);
ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
entry->length, dst, &dlen);
kunmap_atomic(dst);
- zbud_unmap(tree->pool, entry->handle);
+ zbud_unmap(zswap_pool, entry->handle);
BUG_ON(ret);
BUG_ON(dlen != PAGE_SIZE);
@@ -652,7 +652,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
/* reclaim space if needed */
if (zswap_is_full()) {
zswap_pool_limit_hit++;
- if (zbud_reclaim_page(tree->pool, 8)) {
+ if (zbud_reclaim_page(zswap_pool, 8)) {
zswap_reject_reclaim_fail++;
ret = -ENOMEM;
goto reject;
@@ -679,7 +679,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
/* store */
len = dlen + sizeof(struct zswap_header);
- ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN,
+ ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN,
&handle);
if (ret == -ENOSPC) {
zswap_reject_compress_poor++;
@@ -689,11 +689,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
zswap_reject_alloc_fail++;
goto freepage;
}
- zhdr = zbud_map(tree->pool, handle);
+ zhdr = zbud_map(zswap_pool, handle);
zhdr->swpentry = swp_entry(type, offset);
buf = (u8 *)(zhdr + 1);
memcpy(buf, dst, dlen);
- zbud_unmap(tree->pool, handle);
+ zbud_unmap(zswap_pool, handle);
put_cpu_var(zswap_dstmem);
/* populate entry */
@@ -716,7 +716,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
/* update stats */
atomic_inc(&zswap_stored_pages);
- zswap_pool_pages = zbud_get_pool_size(tree->pool);
+ zswap_pool_pages = zbud_get_pool_size(zswap_pool);
return 0;
@@ -752,13 +752,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
/* decompress */
dlen = PAGE_SIZE;
- src = (u8 *)zbud_map(tree->pool, entry->handle) +
+ src = (u8 *)zbud_map(zswap_pool, entry->handle) +
sizeof(struct zswap_header);
dst = kmap_atomic(page);
ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
dst, &dlen);
kunmap_atomic(dst);
- zbud_unmap(tree->pool, entry->handle);
+ zbud_unmap(zswap_pool, entry->handle);
BUG_ON(ret);
spin_lock(&tree->lock);
@@ -804,11 +804,9 @@ static void zswap_frontswap_invalidate_area(unsigned type)
/* walk the tree and free everything */
spin_lock(&tree->lock);
rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
- zswap_free_entry(tree, entry);
+ zswap_free_entry(entry);
tree->rbroot = RB_ROOT;
spin_unlock(&tree->lock);
-
- zbud_destroy_pool(tree->pool);
kfree(tree);
zswap_trees[type] = NULL;
}
@@ -822,20 +820,14 @@ static void zswap_frontswap_init(unsigned type)
struct zswap_tree *tree;
tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
- if (!tree)
- goto err;
- tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
- if (!tree->pool)
- goto freetree;
+ if (!tree) {
+ pr_err("alloc failed, zswap disabled for swap type %d\n", type);
+ return;
+ }
+
tree->rbroot = RB_ROOT;
spin_lock_init(&tree->lock);
zswap_trees[type] = tree;
- return;
-
-freetree:
- kfree(tree);
-err:
- pr_err("alloc failed, zswap disabled for swap type %d\n", type);
}
static struct frontswap_ops zswap_frontswap_ops = {
@@ -907,9 +899,16 @@ static int __init init_zswap(void)
return 0;
pr_info("loading zswap\n");
+
+ zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
+ if (!zswap_pool) {
+ pr_err("zbud pool creation failed\n");
+ goto error;
+ }
+
if (zswap_entry_cache_create()) {
pr_err("entry cache creation failed\n");
- goto error;
+ goto cachefail;
}
if (zswap_comp_init()) {
pr_err("compressor initialization failed\n");
@@ -919,6 +918,7 @@ static int __init init_zswap(void)
pr_err("per-cpu initialization failed\n");
goto pcpufail;
}
+
frontswap_register_ops(&zswap_frontswap_ops);
if (zswap_debugfs_init())
pr_warn("debugfs initialization failed\n");
@@ -927,6 +927,8 @@ pcpufail:
zswap_comp_exit();
compfail:
zswap_entry_cache_destory();
+cachefail:
+ zbud_destroy_pool(zswap_pool);
error:
return -ENOMEM;
}