diff options
author | Linus Torvalds | 2020-01-31 12:16:36 -0800 |
---|---|---|
committer | Linus Torvalds | 2020-01-31 12:16:36 -0800 |
commit | 7eec11d3a784a283f916590e5aa30b855c2ccfd7 (patch) | |
tree | e1bafb0d159b787684e392ae613933f9211c7d7a /mm | |
parent | ddaefe8947b48b638f726cf89730ecc1000ebcc3 (diff) | |
parent | 43e76af85fa7e75ac9b71fc2fcc250abb1889bff (diff) |
Merge branch 'akpm' (patches from Andrew)
Pull updates from Andrew Morton:
"Most of -mm and quite a number of other subsystems: hotfixes, scripts,
ocfs2, misc, lib, binfmt, init, reiserfs, exec, dma-mapping, kcov.
MM is fairly quiet this time. Holidays, I assume"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (118 commits)
kcov: ignore fault-inject and stacktrace
include/linux/io-mapping.h-mapping: use PHYS_PFN() macro in io_mapping_map_atomic_wc()
execve: warn if process starts with executable stack
reiserfs: prevent NULL pointer dereference in reiserfs_insert_item()
init/main.c: fix misleading "This architecture does not have kernel memory protection" message
init/main.c: fix quoted value handling in unknown_bootoption
init/main.c: remove unnecessary repair_env_string in do_initcall_level
init/main.c: log arguments and environment passed to init
fs/binfmt_elf.c: coredump: allow process with empty address space to coredump
fs/binfmt_elf.c: coredump: delete duplicated overflow check
fs/binfmt_elf.c: coredump: allocate core ELF header on stack
fs/binfmt_elf.c: make BAD_ADDR() unlikely
fs/binfmt_elf.c: better codegen around current->mm
fs/binfmt_elf.c: don't copy ELF header around
fs/binfmt_elf.c: fix ->start_code calculation
fs/binfmt_elf.c: smaller code generation around auxv vector fill
lib/find_bit.c: uninline helper _find_next_bit()
lib/find_bit.c: join _find_next_bit{_le}
uapi: rename ext2_swab() to swab() and share globally in swab.h
lib/scatterlist.c: adjust indentation in __sg_alloc_table
...
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Makefile | 1 | ||||
-rw-r--r-- | mm/backing-dev.c | 1 | ||||
-rw-r--r-- | mm/debug.c | 16 | ||||
-rw-r--r-- | mm/early_ioremap.c | 8 | ||||
-rw-r--r-- | mm/filemap.c | 34 | ||||
-rw-r--r-- | mm/gup.c | 501 | ||||
-rw-r--r-- | mm/gup_benchmark.c | 9 | ||||
-rw-r--r-- | mm/huge_memory.c | 44 | ||||
-rw-r--r-- | mm/kmemleak.c | 112 | ||||
-rw-r--r-- | mm/memblock.c | 22 | ||||
-rw-r--r-- | mm/memcontrol.c | 25 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 24 | ||||
-rw-r--r-- | mm/mempolicy.c | 6 | ||||
-rw-r--r-- | mm/memremap.c | 75 | ||||
-rw-r--r-- | mm/migrate.c | 77 | ||||
-rw-r--r-- | mm/mmap.c | 36 | ||||
-rw-r--r-- | mm/oom_kill.c | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 75 | ||||
-rw-r--r-- | mm/page_isolation.c | 53 | ||||
-rw-r--r-- | mm/page_vma_mapped.c | 12 | ||||
-rw-r--r-- | mm/process_vm_access.c | 28 | ||||
-rw-r--r-- | mm/slub.c | 88 | ||||
-rw-r--r-- | mm/sparse.c | 2 | ||||
-rw-r--r-- | mm/swap.c | 27 | ||||
-rw-r--r-- | mm/swapfile.c | 2 | ||||
-rw-r--r-- | mm/vmscan.c | 24 | ||||
-rw-r--r-- | mm/zswap.c | 86 |
27 files changed, 769 insertions, 621 deletions
diff --git a/mm/Makefile b/mm/Makefile index 1937cc251883..32f08e22e824 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -20,6 +20,7 @@ KCOV_INSTRUMENT_kmemleak.o := n KCOV_INSTRUMENT_memcontrol.o := n KCOV_INSTRUMENT_mmzone.o := n KCOV_INSTRUMENT_vmstat.o := n +KCOV_INSTRUMENT_failslab.o := n CFLAGS_init-mm.o += $(call cc-disable-warning, override-init) CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c360f6a6c844..62f05f605fb5 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -21,6 +21,7 @@ struct backing_dev_info noop_backing_dev_info = { EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; +const char *bdi_unknown_name = "(unknown)"; /* * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU diff --git a/mm/debug.c b/mm/debug.c index 74ee73cf7079..ecccd9f17801 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -46,7 +46,15 @@ void __dump_page(struct page *page, const char *reason) { struct address_space *mapping; bool page_poisoned = PagePoisoned(page); + /* + * Accessing the pageblock without the zone lock. It could change to + * "isolate" again in the meantime, but since we are just dumping the + * state for debugging, it should be fine to accept a bit of + * inaccuracy here due to racing. + */ + bool page_cma = is_migrate_cma_page(page); int mapcount; + char *type = ""; /* * If struct page is poisoned don't access Page*() functions as that @@ -78,9 +86,9 @@ void __dump_page(struct page *page, const char *reason) page, page_ref_count(page), mapcount, page->mapping, page_to_pgoff(page)); if (PageKsm(page)) - pr_warn("ksm flags: %#lx(%pGp)\n", page->flags, &page->flags); + type = "ksm "; else if (PageAnon(page)) - pr_warn("anon flags: %#lx(%pGp)\n", page->flags, &page->flags); + type = "anon "; else if (mapping) { if (mapping->host && mapping->host->i_dentry.first) { struct dentry *dentry; @@ -88,10 +96,12 @@ void __dump_page(struct page *page, const char *reason) pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry); } else pr_warn("%ps\n", mapping->a_ops); - pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags); } BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); + pr_warn("%sflags: %#lx(%pGp)%s\n", type, page->flags, &page->flags, + page_cma ? " CMA" : ""); + hex_only: print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32, sizeof(unsigned long), page, diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index 1826f191e72c..a0018ad1a1f6 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -121,8 +121,8 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) } } - if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n", - __func__, (u64)phys_addr, size)) + if (WARN(slot < 0, "%s(%pa, %08lx) not found slot\n", + __func__, &phys_addr, size)) return NULL; /* Don't allow wraparound or zero size */ @@ -158,8 +158,8 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) --idx; --nrpages; } - WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n", - __func__, (u64)phys_addr, size, slot, offset, slot_virt[slot]); + WARN(early_ioremap_debug, "%s(%pa, %08lx) [%d] => %08lx + %08lx\n", + __func__, &phys_addr, size, slot, offset, slot_virt[slot]); prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); return prev_map[slot]; diff --git a/mm/filemap.c b/mm/filemap.c index bf6aa30be58d..1784478270e1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -632,33 +632,6 @@ static bool mapping_needs_writeback(struct address_space *mapping) return mapping->nrpages; } -int filemap_write_and_wait(struct address_space *mapping) -{ - int err = 0; - - if (mapping_needs_writeback(mapping)) { - err = filemap_fdatawrite(mapping); - /* - * Even if the above returned error, the pages may be - * written partially (e.g. -ENOSPC), so we wait for it. - * But the -EIO is special case, it may indicate the worst - * thing (e.g. bug) happened, so we avoid waiting for it. - */ - if (err != -EIO) { - int err2 = filemap_fdatawait(mapping); - if (!err) - err = err2; - } else { - /* Clear any previously stored errors */ - filemap_check_errors(mapping); - } - } else { - err = filemap_check_errors(mapping); - } - return err; -} -EXPORT_SYMBOL(filemap_write_and_wait); - /** * filemap_write_and_wait_range - write out & wait on a file range * @mapping: the address_space for the pages @@ -680,7 +653,12 @@ int filemap_write_and_wait_range(struct address_space *mapping, if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); - /* See comment of filemap_write_and_wait() */ + /* + * Even if the above returned error, the pages may be + * written partially (e.g. -ENOSPC), so we wait for it. + * But the -EIO is special case, it may indicate the worst + * thing (e.g. bug) happened, so we avoid waiting for it. + */ if (err != -EIO) { int err2 = filemap_fdatawait_range(mapping, lstart, lend); @@ -29,8 +29,23 @@ struct follow_page_context { unsigned int page_mask; }; +/* + * Return the compound head page with ref appropriately incremented, + * or NULL if that failed. + */ +static inline struct page *try_get_compound_head(struct page *page, int refs) +{ + struct page *head = compound_head(page); + + if (WARN_ON_ONCE(page_ref_count(head) < 0)) + return NULL; + if (unlikely(!page_cache_add_speculative(head, refs))) + return NULL; + return head; +} + /** - * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages + * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages * @pages: array of pages to be maybe marked dirty, and definitely released. * @npages: number of pages in the @pages array. * @make_dirty: whether to mark the pages dirty @@ -40,19 +55,19 @@ struct follow_page_context { * * For each page in the @pages array, make that page (or its head page, if a * compound page) dirty, if @make_dirty is true, and if the page was previously - * listed as clean. In any case, releases all pages using put_user_page(), - * possibly via put_user_pages(), for the non-dirty case. + * listed as clean. In any case, releases all pages using unpin_user_page(), + * possibly via unpin_user_pages(), for the non-dirty case. * - * Please see the put_user_page() documentation for details. + * Please see the unpin_user_page() documentation for details. * * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is * required, then the caller should a) verify that this is really correct, * because _lock() is usually required, and b) hand code it: - * set_page_dirty_lock(), put_user_page(). + * set_page_dirty_lock(), unpin_user_page(). * */ -void put_user_pages_dirty_lock(struct page **pages, unsigned long npages, - bool make_dirty) +void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, + bool make_dirty) { unsigned long index; @@ -63,7 +78,7 @@ void put_user_pages_dirty_lock(struct page **pages, unsigned long npages, */ if (!make_dirty) { - put_user_pages(pages, npages); + unpin_user_pages(pages, npages); return; } @@ -91,21 +106,21 @@ void put_user_pages_dirty_lock(struct page **pages, unsigned long npages, */ if (!PageDirty(page)) set_page_dirty_lock(page); - put_user_page(page); + unpin_user_page(page); } } -EXPORT_SYMBOL(put_user_pages_dirty_lock); +EXPORT_SYMBOL(unpin_user_pages_dirty_lock); /** - * put_user_pages() - release an array of gup-pinned pages. + * unpin_user_pages() - release an array of gup-pinned pages. * @pages: array of pages to be marked dirty and released. * @npages: number of pages in the @pages array. * - * For each page in the @pages array, release the page using put_user_page(). + * For each page in the @pages array, release the page using unpin_user_page(). * - * Please see the put_user_page() documentation for details. + * Please see the unpin_user_page() documentation for details. */ -void put_user_pages(struct page **pages, unsigned long npages) +void unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long index; @@ -115,9 +130,9 @@ void put_user_pages(struct page **pages, unsigned long npages) * single operation to the head page should suffice. */ for (index = 0; index < npages; index++) - put_user_page(pages[index]); + unpin_user_page(pages[index]); } -EXPORT_SYMBOL(put_user_pages); +EXPORT_SYMBOL(unpin_user_pages); #ifdef CONFIG_MMU static struct page *no_page_table(struct vm_area_struct *vma, @@ -179,6 +194,10 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, spinlock_t *ptl; pte_t *ptep, pte; + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == + (FOLL_PIN | FOLL_GET))) + return ERR_PTR(-EINVAL); retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags); @@ -323,7 +342,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, pmdval = READ_ONCE(*pmd); if (pmd_none(pmdval)) return no_page_table(vma, flags); - if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) { + if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) { page = follow_huge_pmd(mm, address, pmd, flags); if (page) return page; @@ -433,7 +452,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, pud = pud_offset(p4dp, address); if (pud_none(*pud)) return no_page_table(vma, flags); - if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { + if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) { page = follow_huge_pud(mm, address, pud, flags); if (page) return page; @@ -796,7 +815,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, start = untagged_addr(start); - VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); + VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); /* * If FOLL_FORCE is set then do not force a full fault as the hinting @@ -1020,7 +1039,16 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, BUG_ON(*locked != 1); } - if (pages) + /* + * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior + * is to set FOLL_GET if the caller wants pages[] filled in (but has + * carelessly failed to specify FOLL_GET), so keep doing that, but only + * for FOLL_GET, not for the newer FOLL_PIN. + * + * FOLL_PIN always expects pages to be non-null, but no need to assert + * that here, as any failures will be obvious enough. + */ + if (pages && !(flags & FOLL_PIN)) flags |= FOLL_GET; pages_done = 0; @@ -1096,88 +1124,6 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, return pages_done; } -/* - * get_user_pages_remote() - pin user pages in memory - * @tsk: the task_struct to use for page fault accounting, or - * NULL if faults are not to be recorded. - * @mm: mm_struct of target mm - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying lookup behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. - * @locked: pointer to lock flag indicating whether lock is held and - * subsequently whether VM_FAULT_RETRY functionality can be - * utilised. Lock must initially be held. - * - * Returns either number of pages pinned (which may be less than the - * number requested), or an error. Details about the return value: - * - * -- If nr_pages is 0, returns 0. - * -- If nr_pages is >0, but no pages were pinned, returns -errno. - * -- If nr_pages is >0, and some pages were pinned, returns the number of - * pages pinned. Again, this may be less than nr_pages. - * - * The caller is responsible for releasing returned @pages, via put_page(). - * - * @vmas are valid only as long as mmap_sem is held. - * - * Must be called with mmap_sem held for read or write. - * - * get_user_pages walks a process's page tables and takes a reference to - * each struct page that each user address corresponds to at a given - * instant. That is, it takes the page that would be accessed if a user - * thread accesses the given user virtual address at that instant. - * - * This does not guarantee that the page exists in the user mappings when - * get_user_pages returns, and there may even be a completely different - * page there in some cases (eg. if mmapped pagecache has been invalidated - * and subsequently re faulted). However it does guarantee that the page - * won't be freed completely. And mostly callers simply care that the page - * contains data that was valid *at some point in time*. Typically, an IO - * or similar operation cannot guarantee anything stronger anyway because - * locks can't be held over the syscall boundary. - * - * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page - * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must - * be called after the page is finished with, and before put_page is called. - * - * get_user_pages is typically used for fewer-copy IO operations, to get a - * handle on the memory by some means other than accesses via the user virtual - * addresses. The pages may be submitted for DMA to devices or accessed via - * their kernel linear mapping (via the kmap APIs). Care should be taken to - * use the correct cache flushing APIs. - * - * See also get_user_pages_fast, for performance critical applications. - * - * get_user_pages should be phased out in favor of - * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing - * should use get_user_pages because it cannot pass - * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. - */ -long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) -{ - /* - * FIXME: Current FOLL_LONGTERM behavior is incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. As there are no users of this flag in this call we simply - * disallow this option for now. - */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) - return -EINVAL; - - return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, - locked, - gup_flags | FOLL_TOUCH | FOLL_REMOTE); -} -EXPORT_SYMBOL(get_user_pages_remote); - /** * populate_vma_page_range() - populate a range of pages in the vma. * @vma: target vma @@ -1612,6 +1558,116 @@ static __always_inline long __gup_longterm_locked(struct task_struct *tsk, #endif /* CONFIG_FS_DAX || CONFIG_CMA */ /* + * get_user_pages_remote() - pin user pages in memory + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. + * + * Returns either number of pages pinned (which may be less than the + * number requested), or an error. Details about the return value: + * + * -- If nr_pages is 0, returns 0. + * -- If nr_pages is >0, but no pages were pinned, returns -errno. + * -- If nr_pages is >0, and some pages were pinned, returns the number of + * pages pinned. Again, this may be less than nr_pages. + * + * The caller is responsible for releasing returned @pages, via put_page(). + * + * @vmas are valid only as long as mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page + * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must + * be called after the page is finished with, and before put_page is called. + * + * get_user_pages is typically used for fewer-copy IO operations, to get a + * handle on the memory by some means other than accesses via the user virtual + * addresses. The pages may be submitted for DMA to devices or accessed via + * their kernel linear mapping (via the kmap APIs). Care should be taken to + * use the correct cache flushing APIs. + * + * See also get_user_pages_fast, for performance critical applications. + * + * get_user_pages should be phased out in favor of + * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing + * should use get_user_pages because it cannot pass + * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. + */ +#ifdef CONFIG_MMU +long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *locked) +{ + /* + * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, + * never directly by the caller, so enforce that with an assertion: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + return -EINVAL; + + /* + * Parts of FOLL_LONGTERM behavior are incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. However, this only comes up if locked is set, and there are + * callers that do request FOLL_LONGTERM, but do not set locked. So, + * allow what we can. + */ + if (gup_flags & FOLL_LONGTERM) { + if (WARN_ON_ONCE(locked)) + return -EINVAL; + /* + * This will check the vmas (even if our vmas arg is NULL) + * and return -ENOTSUPP if DAX isn't allowed in this case: + */ + return __gup_longterm_locked(tsk, mm, start, nr_pages, pages, + vmas, gup_flags | FOLL_TOUCH | + FOLL_REMOTE); + } + + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, + locked, + gup_flags | FOLL_TOUCH | FOLL_REMOTE); +} +EXPORT_SYMBOL(get_user_pages_remote); + +#else /* CONFIG_MMU */ +long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *locked) +{ + return 0; +} +#endif /* !CONFIG_MMU */ + +/* * This is the same as get_user_pages_remote(), just with a * less-flexible calling convention where we assume that the task * and mm being operated on are the current task's and don't allow @@ -1622,6 +1678,13 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { + /* + * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, + * never directly by the caller, so enforce that with an assertion: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + return -EINVAL; + return __gup_longterm_locked(current, current->mm, start, nr_pages, pages, vmas, gup_flags | FOLL_TOUCH); } @@ -1807,20 +1870,6 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, } } -/* - * Return the compund head page with ref appropriately incremented, - * or NULL if that failed. - */ -static inline struct page *try_get_compound_head(struct page *page, int refs) -{ - struct page *head = compound_head(page); - if (WARN_ON_ONCE(page_ref_count(head) < 0)) - return NULL; - if (unlikely(!page_cache_add_speculative(head, refs))) - return NULL; - return head; -} - #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) @@ -1978,6 +2027,29 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, } #endif +static int record_subpages(struct page *page, unsigned long addr, + unsigned long end, struct page **pages) +{ + int nr; + + for (nr = 0; addr != end; addr += PAGE_SIZE) + pages[nr++] = page++; + + return nr; +} + +static void put_compound_head(struct page *page, int refs) +{ + VM_BUG_ON_PAGE(page_ref_count(page) < refs, page); + /* + * Calling put_page() for each ref is unnecessarily slow. Only the last + * ref needs a put_page(). + */ + if (refs > 1) + page_ref_sub(page, refs - 1); + put_page(page); +} + #ifdef CONFIG_ARCH_HAS_HUGEPD static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, unsigned long sz) @@ -2007,32 +2079,20 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, /* hugepages are never "special" */ VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - refs = 0; head = pte_page(pte); - page = head + ((addr & (sz-1)) >> PAGE_SHIFT); - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); + refs = record_subpages(page, addr, end, pages + *nr); head = try_get_compound_head(head, refs); - if (!head) { - *nr -= refs; + if (!head) return 0; - } if (unlikely(pte_val(pte) != pte_val(*ptep))) { - /* Could be optimized better */ - *nr -= refs; - while (refs--) - put_page(head); + put_compound_head(head, refs); return 0; } + *nr += refs; SetPageReferenced(head); return 1; } @@ -2079,28 +2139,19 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr); } - refs = 0; page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - do { - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); + refs = record_subpages(page, addr, end, pages + *nr); head = try_get_compound_head(pmd_page(orig), refs); - if (!head) { - *nr -= refs; + if (!head) return 0; - } if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { - *nr -= refs; - while (refs--) - put_page(head); + put_compound_head(head, refs); return 0; } + *nr += refs; SetPageReferenced(head); return 1; } @@ -2120,28 +2171,19 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr); } - refs = 0; page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - do { - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); + refs = record_subpages(page, addr, end, pages + *nr); head = try_get_compound_head(pud_page(orig), refs); - if (!head) { - *nr -= refs; + if (!head) return 0; - } if (unlikely(pud_val(orig) != pud_val(*pudp))) { - *nr -= refs; - while (refs--) - put_page(head); + put_compound_head(head, refs); return 0; } + *nr += refs; SetPageReferenced(head); return 1; } @@ -2157,28 +2199,20 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 0; BUILD_BUG_ON(pgd_devmap(orig)); - refs = 0; + page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); - do { - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); + refs = record_subpages(page, addr, end, pages + *nr); head = try_get_compound_head(pgd_page(orig), refs); - if (!head) { - *nr -= refs; + if (!head) return 0; - } if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { - *nr -= refs; - while (refs--) - put_page(head); + put_compound_head(head, refs); return 0; } + *nr += refs; SetPageReferenced(head); return 1; } @@ -2237,7 +2271,7 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, pud_t pud = READ_ONCE(*pudp); next = pud_addr_end(addr, end); - if (pud_none(pud)) + if (unlikely(!pud_present(pud))) return 0; if (unlikely(pud_huge(pud))) { if (!gup_huge_pud(pud, pudp, addr, next, flags, @@ -2393,29 +2427,15 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages, return ret; } -/** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying pin behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. - * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - */ -int get_user_pages_fast(unsigned long start, int nr_pages, - unsigned int gup_flags, struct page **pages) +static int internal_get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, + struct page **pages) { unsigned long addr, len, end; int nr = 0, ret = 0; - if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM))) + if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | + FOLL_FORCE | FOLL_PIN))) return -EINVAL; start = untagged_addr(start) & PAGE_MASK; @@ -2455,4 +2475,103 @@ int get_user_pages_fast(unsigned long start, int nr_pages, return ret; } + +/** + * get_user_pages_fast() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * + * Attempt to pin user pages in memory without taking mm->mmap_sem. + * If not successful, it will fall back to taking the lock and + * calling get_user_pages(). + * + * Returns number of pages pinned. This may be fewer than the number requested. + * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns + * -errno. + */ +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) +{ + /* + * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, + * never directly by the caller, so enforce that: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + return -EINVAL; + + return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); +} EXPORT_SYMBOL_GPL(get_user_pages_fast); + +/** + * pin_user_pages_fast() - pin user pages in memory without taking locks + * + * For now, this is a placeholder function, until various call sites are + * converted to use the correct get_user_pages*() or pin_user_pages*() API. So, + * this is identical to get_user_pages_fast(). + * + * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * is NOT intended for Case 2 (RDMA: long-term pins). + */ +int pin_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) +{ + /* + * This is a placeholder, until the pin functionality is activated. + * Until then, just behave like the corresponding get_user_pages*() + * routine. + */ + return get_user_pages_fast(start, nr_pages, gup_flags, pages); +} +EXPORT_SYMBOL_GPL(pin_user_pages_fast); + +/** + * pin_user_pages_remote() - pin pages of a remote process (task != current) + * + * For now, this is a placeholder function, until various call sites are + * converted to use the correct get_user_pages*() or pin_user_pages*() API. So, + * this is identical to get_user_pages_remote(). + * + * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * is NOT intended for Case 2 (RDMA: long-term pins). + */ +long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *locked) +{ + /* + * This is a placeholder, until the pin functionality is activated. + * Until then, just behave like the corresponding get_user_pages*() + * routine. + */ + return get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, pages, + vmas, locked); +} +EXPORT_SYMBOL(pin_user_pages_remote); + +/** + * pin_user_pages() - pin user pages in memory for use by other devices + * + * For now, this is a placeholder function, until various call sites are + * converted to use the correct get_user_pages*() or pin_user_pages*() API. So, + * this is identical to get_user_pages(). + * + * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * is NOT intended for Case 2 (RDMA: long-term pins). + */ +long pin_user_pages(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas) +{ + /* + * This is a placeholder, until the pin functionality is activated. + * Until then, just behave like the corresponding get_user_pages*() + * routine. + */ + return get_user_pages(start, nr_pages, gup_flags, pages, vmas); +} +EXPORT_SYMBOL(pin_user_pages); diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index ad9d5b1c4473..8dba38e79a9f 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -49,18 +49,21 @@ static int __gup_benchmark_ioctl(unsigned int cmd, nr = (next - addr) / PAGE_SIZE; } + /* Filter out most gup flags: only allow a tiny subset here: */ + gup->flags &= FOLL_WRITE; + switch (cmd) { case GUP_FAST_BENCHMARK: - nr = get_user_pages_fast(addr, nr, gup->flags & 1, + nr = get_user_pages_fast(addr, nr, gup->flags, pages + i); break; case GUP_LONGTERM_BENCHMARK: nr = get_user_pages(addr, nr, - (gup->flags & 1) | FOLL_LONGTERM, + gup->flags | FOLL_LONGTERM, pages + i, NULL); break; case GUP_BENCHMARK: - nr = get_user_pages(addr, nr, gup->flags & 1, pages + i, + nr = get_user_pages(addr, nr, gup->flags, pages + i, NULL); break; default: diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f39689a29128..b08b199f9a11 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -177,16 +177,13 @@ static ssize_t enabled_store(struct kobject *kobj, { ssize_t ret = count; - if (!memcmp("always", buf, - min(sizeof("always")-1, count))) { + if (sysfs_streq(buf, "always")) { clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); - } else if (!memcmp("madvise", buf, - min(sizeof("madvise")-1, count))) { + } else if (sysfs_streq(buf, "madvise")) { clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); - } else if (!memcmp("never", buf, - min(sizeof("never")-1, count))) { + } else if (sysfs_streq(buf, "never")) { clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); } else @@ -250,32 +247,27 @@ static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - if (!memcmp("always", buf, - min(sizeof("always")-1, count))) { + if (sysfs_streq(buf, "always")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); - } else if (!memcmp("defer+madvise", buf, - min(sizeof("defer+madvise")-1, count))) { + } else if (sysfs_streq(buf, "defer+madvise")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); - } else if (!memcmp("defer", buf, - min(sizeof("defer")-1, count))) { + } else if (sysfs_streq(buf, "defer")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); - } else if (!memcmp("madvise", buf, - min(sizeof("madvise")-1, count))) { + } else if (sysfs_streq(buf, "madvise")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); - } else if (!memcmp("never", buf, - min(sizeof("never")-1, count))) { + } else if (sysfs_streq(buf, "never")) { clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); @@ -2715,7 +2707,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) { struct page *head = compound_head(page); struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); - struct deferred_split *ds_queue = get_deferred_split_queue(page); + struct deferred_split *ds_queue = get_deferred_split_queue(head); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int count, mapcount, extra_pins, ret; @@ -2723,11 +2715,11 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) unsigned long flags; pgoff_t end; - VM_BUG_ON_PAGE(is_huge_zero_page(page), page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageCompound(page), page); + VM_BUG_ON_PAGE(is_huge_zero_page(head), head); + VM_BUG_ON_PAGE(!PageLocked(head), head); + VM_BUG_ON_PAGE(!PageCompound(head), head); - if (PageWriteback(page)) + if (PageWriteback(head)) return -EBUSY; if (PageAnon(head)) { @@ -2778,7 +2770,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out_unlock; } - mlocked = PageMlocked(page); + mlocked = PageMlocked(head); unmap_page(head); VM_BUG_ON_PAGE(compound_mapcount(head), head); @@ -2810,14 +2802,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) ds_queue->split_queue_len--; list_del(page_deferred_list(head)); } + spin_unlock(&ds_queue->split_queue_lock); if (mapping) { - if (PageSwapBacked(page)) - __dec_node_page_state(page, NR_SHMEM_THPS); + if (PageSwapBacked(head)) + __dec_node_page_state(head, NR_SHMEM_THPS); else - __dec_node_page_state(page, NR_FILE_THPS); + __dec_node_page_state(head, NR_FILE_THPS); } - spin_unlock(&ds_queue->split_queue_lock); __split_huge_page(page, list, end, flags); if (PageSwapCache(head)) { swp_entry_t entry = { .val = page_private(head) }; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 244607663363..3a4259eeb5a0 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -13,7 +13,7 @@ * * The following locks and mutexes are used by kmemleak: * - * - kmemleak_lock (rwlock): protects the object_list modifications and + * - kmemleak_lock (raw_spinlock_t): protects the object_list modifications and * accesses to the object_tree_root. The object_list is the main list * holding the metadata (struct kmemleak_object) for the allocated memory * blocks. The object_tree_root is a red black tree used to look-up @@ -22,13 +22,13 @@ * object_tree_root in the create_object() function called from the * kmemleak_alloc() callback and removed in delete_object() called from the * kmemleak_free() callback - * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to - * the metadata (e.g. count) are protected by this lock. Note that some - * members of this structure may be protected by other means (atomic or - * kmemleak_lock). This lock is also held when scanning the corresponding - * memory block to avoid the kernel freeing it via the kmemleak_free() - * callback. This is less heavyweight than holding a global lock like - * kmemleak_lock during scanning + * - kmemleak_object.lock (raw_spinlock_t): protects a kmemleak_object. + * Accesses to the metadata (e.g. count) are protected by this lock. Note + * that some members of this structure may be protected by other means + * (atomic or kmemleak_lock). This lock is also held when scanning the + * corresponding memory block to avoid the kernel freeing it via the + * kmemleak_free() callback. This is less heavyweight than holding a global + * lock like kmemleak_lock during scanning. * - scan_mutex (mutex): ensures that only one thread may scan the memory for * unreferenced objects at a time. The gray_list contains the objects which * are already referenced or marked as false positives and need to be @@ -135,7 +135,7 @@ struct kmemleak_scan_area { * (use_count) and freed using the RCU mechanism. */ struct kmemleak_object { - spinlock_t lock; + raw_spinlock_t lock; unsigned int flags; /* object status flags */ struct list_head object_list; struct list_head gray_list; @@ -191,8 +191,8 @@ static int mem_pool_free_count = ARRAY_SIZE(mem_pool); static LIST_HEAD(mem_pool_free_list); /* search tree for object boundaries */ static struct rb_root object_tree_root = RB_ROOT; -/* rw_lock protecting the access to object_list and object_tree_root */ -static DEFINE_RWLOCK(kmemleak_lock); +/* protecting the access to object_list and object_tree_root */ +static DEFINE_RAW_SPINLOCK(kmemleak_lock); /* allocation caches for kmemleak internal data */ static struct kmem_cache *object_cache; @@ -426,7 +426,7 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp) } /* slab allocation failed, try the memory pool */ - write_lock_irqsave(&kmemleak_lock, flags); + raw_spin_lock_irqsave(&kmemleak_lock, flags); object = list_first_entry_or_null(&mem_pool_free_list, typeof(*object), object_list); if (object) @@ -435,7 +435,7 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp) object = &mem_pool[--mem_pool_free_count]; else pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n"); - write_unlock_irqrestore(&kmemleak_lock, flags); + raw_spin_unlock_irqrestore(&kmemleak_lock, flags); return object; } @@ -453,9 +453,9 @@ static void mem_pool_free(struct kmemleak_object *object) } /* add the object to the memory pool free list */ - write_lock_irqsave(&kmemleak_lock, flags); + raw_spin_lock_irqsave(&kmemleak_lock, flags); list_add(&object->object_list, &mem_pool_free_list); - write_unlock_irqrestore(&kmemleak_lock, flags); + raw_spin_unlock_irqrestore(&kmemleak_lock, flags); } /* @@ -514,9 +514,9 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) struct kmemleak_object *object; rcu_read_lock(); - read_lock_irqsave(&kmemleak_lock, flags); + raw_spin_lock_irqsave(&kmemleak_lock, flags); object = lookup_object(ptr, alias); - read_unlock_irqrestore(&kmemleak_lock, flags); + raw_spin_unlock_irqrestore(&kmemleak_lock, flags); /* check whether the object is still available */ if (object && !get_object(object)) @@ -546,11 +546,11 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali unsigned long flags; struct kmemleak_object *object; - write_lock_irqsave(&kmemleak_lock, flags); + raw_spin_lock_irqsave(&kmemleak_lock, flags); object = lookup_object(ptr, alias); if (object) __remove_object(object); - write_unlock_irqrestore(&kmemleak_lock, flags); + raw_spin_unlock_irqrestore(&kmemleak_lock, flags); return object; } @@ -585,7 +585,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, INIT_LIST_HEAD(&object->object_list); INIT_LIST_HEAD(&object->gray_list); INIT_HLIST_HEAD(&object->area_list); - spin_lock_init(&object->lock); + raw_spin_lock_init(&object->lock); atomic_set(&object->use_count, 1); object->flags = OBJECT_ALLOCATED; object->pointer = ptr; @@ -617,7 +617,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, /* kernel backtrace */ object->trace_len = __save_stack_trace(object->trace); - write_lock_irqsave(&kmemleak_lock, flags); + raw_spin_lock_irqsave(&kmemleak_lock, flags); untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); min_addr = min(min_addr, untagged_ptr); @@ -649,7 +649,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, list_add_tail_rcu(&object->object_list, &object_list); out: - write_unlock_irqrestore(&kmemleak_lock, flags); + raw_spin_unlock_irqrestore(&kmemleak_lock, flags); return object; } @@ -667,9 +667,9 @@ static void __delete_object(struct kmemleak_object *object) * Locking here also ensures that the corresponding memory block * cannot be freed when it is being scanned. */ - spin_lock_irqsave(&object->lock, flags); + raw_spin_lock_irqsave(&object->lock, flags); object->flags &= ~OBJECT_ALLOCATED; - spin_unlock_irqrestore(&object->lock, flags); + raw_spin_unlock_irqrestore(&object->lock, flags); put_object(object); } @@ -739,9 +739,9 @@ static void paint_it(struct kmemleak_object *object, int color) { unsigned long flags; - spin_lock_irqsave(&object->lock, flags); + raw_spin_lock_irqsave(&object->lock, flags); __paint_it(object, color); - spin_unlock_irqrestore(&object->lock, flags); + raw_spin_unlock_irqrestore(&object->lock, flags); } static void paint_ptr(unsigned long ptr, int color) @@ -798,7 +798,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) if (scan_area_cache) area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); - spin_lock_irqsave(&object->lock, flags); + raw_spin_lock_irqsave(&object->lock, flags); if (!area) { pr_warn_once("Cannot allocate a scan area, scanning the full object\n"); /* mark the object for full scan to avoid false positives */ @@ -820,7 +820,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) hlist_add_head(&area->node, &object->area_list); out_unlock: - spin_unlock_irqrestore(&object->lock, flags); + raw_spin_unlock_irqrestore(&object->lock, flags); put_object(object); } @@ -842,9 +842,9 @@ static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref) return; } - spin_lock_irqsave(&object->lock, flags); + raw_spin_lock_irqsave(&object->lock, flags); object->excess_ref = excess_ref; - spin_unlock_irqrestore(&object->lock, flags); + raw_spin_unlock_irqrestore(&object->lock, flags); put_object(object); } @@ -864,9 +864,9 @@ static void object_no_scan(unsigned long ptr) return; } - spin_lock_irqsave(&object->lock, flags); + raw_spin_lock_irqsave(&object->lock, flags); object->flags |= OBJECT_NO_SCAN; - spin_unlock_irqrestore(&object->lock, flags); + raw_spin_unlock_irqrestore(&object->lock, flags); put_object(object); } @@ -1026,9 +1026,9 @@ void __ref kmemleak_update_trace(const void *ptr) return; } - spin_lock_irqsave(&object->lock, flags); + raw_spin_lock_irqsave(&object->lock, flags); object->trace_len = __save_stack_trace(object->trace); - spin_unlock_irqrestore(&object->lock, flags); + raw_spin_unlock_irqrestore(&object->lock, flags); put_object(object); } @@ -1233,7 +1233,7 @@ static void scan_block(void *_start, void *_end, unsigned long flags; unsigned long untagged_ptr; - read_lock_irqsave(&kmemleak_lock, flags); + raw_spin_lock_irqsave(&kmemleak_lock, flags); for (ptr = start; ptr < end; ptr++) { struct kmemleak_object *object; unsigned long pointer; @@ -1268,7 +1268,7 @@ static void scan_block(void *_start, void *_end, * previously acquired in scan_object(). These locks are * enclosed by scan_mutex. */ - spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); /* only pass surplus references (object already gray) */ if (color_gray(object)) { excess_ref = object->excess_ref; @@ -1277,7 +1277,7 @@ static void scan_block(void *_start, void *_end, excess_ref = 0; update_refs(object); } - spin_unlock(&object->lock); + raw_spin_unlock(&object->lock); if (excess_ref) { object = lookup_object(excess_ref, 0); @@ -1286,12 +1286,12 @@ static void scan_block(void *_start, void *_end, if (object == scanned) /* circular reference, ignore */ continue; - spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); update_refs(object); - spin_unlock(&object->lock); + raw_spin_unlock(&object->lock); } } - read_unlock_irqrestore(&kmemleak_lock, flags); + raw_spin_unlock_irqrestore(&kmemleak_lock, flags); } /* @@ -1324,7 +1324,7 @@ static void scan_object(struct kmemleak_object *object) * Once the object->lock is acquired, the corresponding memory block * cannot be freed (the same lock is acquired in delete_object). */ - spin_lock_irqsave(&object->lock, flags); + raw_spin_lock_irqsave(&object->lock, flags); if (object->flags & OBJECT_NO_SCAN) goto out; if (!(object->flags & OBJECT_ALLOCATED)) @@ -1344,9 +1344,9 @@ static void scan_object(struct kmemleak_object *object) if (start >= end) break; - spin_unlock_irqrestore(&object->lock, flags); + raw_spin_unlock_irqrestore(&object->lock, flags); cond_resched(); - spin_lock_irqsave(&object->lock, flags); + raw_spin_lock_irqsave(&object->lock, flags); } while (object->flags & OBJECT_ALLOCATED); } else hlist_for_each_entry(area, &object->area_list, node) @@ -1354,7 +1354,7 @@ static void scan_object(struct kmemleak_object *object) (void *)(area->start + area->size), object); out: - spin_unlock_irqrestore(&object->lock, flags); + raw_spin_unlock_irqrestore(&object->lock, flags); } /* @@ -1407,7 +1407,7 @@ static void kmemleak_scan(void) /* prepare the kmemleak_object's */ rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { - spin_lock_irqsave(&object->lock, flags); + raw_spin_lock_irqsave(&object->lock, flags); #ifdef DEBUG /* * With a few exceptions there should be a maximum of @@ -1424,7 +1424,7 @@ static void kmemleak_scan(void) if (color_gray(object) && get_object(object)) list_add_tail(&object->gray_list, &gray_list); - spin_unlock_irqrestore(&object->lock, flags); + raw_spin_unlock_irqrestore(&object->lock, flags); } rcu_read_unlock(); @@ -1492,14 +1492,14 @@ static void kmemleak_scan(void) */ rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { - spin_lock_irqsave(&object->lock, flags); + raw_spin_lock_irqsave(&object->lock, flags); if (color_white(object) && (object->flags & OBJECT_ALLOCATED) && update_checksum(object) && get_object(object)) { /* color it gray temporarily */ object->count = object->min_count; list_add_tail(&object->gray_list, &gray_list); } - spin_unlock_irqrestore(&object->lock, flags); + raw_spin_unlock_irqrestore(&object->lock, flags); } rcu_read_unlock(); @@ -1519,7 +1519,7 @@ static void kmemleak_scan(void) */ rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { - spin_lock_irqsave(&object->lock, flags); + raw_spin_lock_irqsave(&object->lock, flags); if (unreferenced_object(object) && !(object->flags & OBJECT_REPORTED)) { object->flags |= OBJECT_REPORTED; @@ -1529,7 +1529,7 @@ static void kmemleak_scan(void) new_leaks++; } - spin_unlock_irqrestore(&object->lock, flags); + raw_spin_unlock_irqrestore(&object->lock, flags); } rcu_read_unlock(); @@ -1681,10 +1681,10 @@ static int kmemleak_seq_show(struct seq_file *seq, void *v) struct kmemleak_object *object = v; unsigned long flags; - spin_lock_irqsave(&object->lock, flags); + raw_spin_lock_irqsave(&object->lock, flags); if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object)) print_unreferenced(seq, object); - spin_unlock_irqrestore(&object->lock, flags); + raw_spin_unlock_irqrestore(&object->lock, flags); return 0; } @@ -1714,9 +1714,9 @@ static int dump_str_object_info(const char *str) return -EINVAL; } - spin_lock_irqsave(&object->lock, flags); + raw_spin_lock_irqsave(&object->lock, flags); dump_object_info(object); - spin_unlock_irqrestore(&object->lock, flags); + raw_spin_unlock_irqrestore(&object->lock, flags); put_object(object); return 0; @@ -1735,11 +1735,11 @@ static void kmemleak_clear(void) rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { - spin_lock_irqsave(&object->lock, flags); + raw_spin_lock_irqsave(&object->lock, flags); if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object)) __paint_it(object, KMEMLEAK_GREY); - spin_unlock_irqrestore(&object->lock, flags); + raw_spin_unlock_irqrestore(&object->lock, flags); } rcu_read_unlock(); diff --git a/mm/memblock.c b/mm/memblock.c index 4bc2c7d8bf42..eba94ee3de0b 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -575,7 +575,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, * Return: * 0 on success, -errno on failure. */ -int __init_memblock memblock_add_range(struct memblock_type *type, +static int __init_memblock memblock_add_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size, int nid, enum memblock_flags flags) { @@ -694,7 +694,7 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; - memblock_dbg("memblock_add: [%pa-%pa] %pS\n", + memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, &base, &end, (void *)_RET_IP_); return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0); @@ -795,7 +795,7 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; - memblock_dbg("memblock_remove: [%pa-%pa] %pS\n", + memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, &base, &end, (void *)_RET_IP_); return memblock_remove_range(&memblock.memory, base, size); @@ -813,7 +813,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; - memblock_dbg(" memblock_free: [%pa-%pa] %pS\n", + memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); @@ -824,12 +824,24 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; - memblock_dbg("memblock_reserve: [%pa-%pa] %pS\n", + memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, &base, &end, (void *)_RET_IP_); return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0); } +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP +int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size) +{ + phys_addr_t end = base + size - 1; + + memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, + &base, &end, (void *)_RET_IP_); + + return memblock_add_range(&memblock.physmem, base, size, MAX_NUMNODES, 0); +} +#endif + /** * memblock_setclr_flag - set or clear flag for a memory region * @base: base address of the region diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6c83cf4ed970..6f6dc8712e39 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5340,14 +5340,6 @@ static int mem_cgroup_move_account(struct page *page, __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (compound && !list_empty(page_deferred_list(page))) { - spin_lock(&from->deferred_split_queue.split_queue_lock); - list_del_init(page_deferred_list(page)); - from->deferred_split_queue.split_queue_len--; - spin_unlock(&from->deferred_split_queue.split_queue_lock); - } -#endif /* * It is safe to change page->mem_cgroup here because the page * is referenced, charged, and isolated - we can't race with @@ -5357,16 +5349,6 @@ static int mem_cgroup_move_account(struct page *page, /* caller should have done css_get */ page->mem_cgroup = to; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (compound && list_empty(page_deferred_list(page))) { - spin_lock(&to->deferred_split_queue.split_queue_lock); - list_add_tail(page_deferred_list(page), - &to->deferred_split_queue.split_queue); - to->deferred_split_queue.split_queue_len++; - spin_unlock(&to->deferred_split_queue.split_queue_lock); - } -#endif - spin_unlock_irqrestore(&from->move_lock, flags); ret = 0; @@ -6651,7 +6633,6 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) { struct mem_cgroup *memcg; unsigned int nr_pages; - bool compound; unsigned long flags; VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); @@ -6673,8 +6654,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) return; /* Force-charge the new page. The old one will be freed soon */ - compound = PageTransHuge(newpage); - nr_pages = compound ? hpage_nr_pages(newpage) : 1; + nr_pages = hpage_nr_pages(newpage); page_counter_charge(&memcg->memory, nr_pages); if (do_memsw_account()) @@ -6684,7 +6664,8 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) commit_charge(newpage, memcg, false); local_irq_save(flags); - mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); + mem_cgroup_charge_statistics(memcg, newpage, PageTransHuge(newpage), + nr_pages); memcg_check_events(memcg, newpage); local_irq_restore(flags); } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a91a072f2b2c..36d80915ddc2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -783,27 +783,18 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, return default_zone_for_pfn(nid, start_pfn, nr_pages); } -int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, + int online_type, int nid) { unsigned long flags; unsigned long onlined_pages = 0; struct zone *zone; int need_zonelists_rebuild = 0; - int nid; int ret; struct memory_notify arg; - struct memory_block *mem; mem_hotplug_begin(); - /* - * We can't use pfn_to_nid() because nid might be stored in struct page - * which is not yet initialized. Instead, we find nid from memory block. - */ - mem = find_memory_block(__pfn_to_section(pfn)); - nid = mem->nid; - put_device(&mem->dev); - /* associate pfn range with the zone */ zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); move_pfn_range_to_zone(zone, pfn, nr_pages, NULL); @@ -1182,7 +1173,7 @@ static bool is_pageblock_removable_nolock(unsigned long pfn) if (!zone_spans_pfn(zone, pfn)) return false; - return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, + return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE, MEMORY_OFFLINE); } @@ -1764,8 +1755,6 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) BUG_ON(check_hotplug_memory_range(start, size)); - mem_hotplug_begin(); - /* * All memory blocks must be offlined before removing memory. Check * whether all memory blocks in question are offline and return error @@ -1778,9 +1767,14 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); - /* remove memory block devices before removing memory */ + /* + * Memory block device removal under the device_hotplug_lock is + * a barrier against racing online attempts. + */ remove_memory_block_devices(start, size); + mem_hotplug_begin(); + arch_remove_memory(nid, start, size, NULL); memblock_free(start, size); memblock_remove(start, size); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b2920ae87a61..977c641f78cf 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2821,6 +2821,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) char *flags = strchr(str, '='); int err = 1, mode; + if (flags) + *flags++ = '\0'; /* terminate mode string */ + if (nodelist) { /* NUL-terminate mode or flags string */ *nodelist++ = '\0'; @@ -2831,9 +2834,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) } else nodes_clear(nodes); - if (flags) - *flags++ = '\0'; /* terminate mode string */ - mode = match_string(policy_modes, MPOL_MAX, str); if (mode < 0) goto out; diff --git a/mm/memremap.c b/mm/memremap.c index c51c6bd2fe34..4c723d2049d5 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -27,7 +27,8 @@ static void devmap_managed_enable_put(void) static int devmap_managed_enable_get(struct dev_pagemap *pgmap) { - if (!pgmap->ops || !pgmap->ops->page_free) { + if (pgmap->type == MEMORY_DEVICE_PRIVATE && + (!pgmap->ops || !pgmap->ops->page_free)) { WARN(1, "Missing page_free method\n"); return -EINVAL; } @@ -410,48 +411,42 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, EXPORT_SYMBOL_GPL(get_dev_pagemap); #ifdef CONFIG_DEV_PAGEMAP_OPS -void __put_devmap_managed_page(struct page *page) +void free_devmap_managed_page(struct page *page) { - int count = page_ref_dec_return(page); - - /* - * If refcount is 1 then page is freed and refcount is stable as nobody - * holds a reference on the page. - */ - if (count == 1) { - /* Clear Active bit in case of parallel mark_page_accessed */ - __ClearPageActive(page); - __ClearPageWaiters(page); + /* notify page idle for dax */ + if (!is_device_private_page(page)) { + wake_up_var(&page->_refcount); + return; + } - mem_cgroup_uncharge(page); + /* Clear Active bit in case of parallel mark_page_accessed */ + __ClearPageActive(page); + __ClearPageWaiters(page); - /* - * When a device_private page is freed, the page->mapping field - * may still contain a (stale) mapping value. For example, the - * lower bits of page->mapping may still identify the page as - * an anonymous page. Ultimately, this entire field is just - * stale and wrong, and it will cause errors if not cleared. - * One example is: - * - * migrate_vma_pages() - * migrate_vma_insert_page() - * page_add_new_anon_rmap() - * __page_set_anon_rmap() - * ...checks page->mapping, via PageAnon(page) call, - * and incorrectly concludes that the page is an - * anonymous page. Therefore, it incorrectly, - * silently fails to set up the new anon rmap. - * - * For other types of ZONE_DEVICE pages, migration is either - * handled differently or not done at all, so there is no need - * to clear page->mapping. - */ - if (is_device_private_page(page)) - page->mapping = NULL; + mem_cgroup_uncharge(page); - page->pgmap->ops->page_free(page); - } else if (!count) - __put_page(page); + /* + * When a device_private page is freed, the page->mapping field + * may still contain a (stale) mapping value. For example, the + * lower bits of page->mapping may still identify the page as an + * anonymous page. Ultimately, this entire field is just stale + * and wrong, and it will cause errors if not cleared. One + * example is: + * + * migrate_vma_pages() + * migrate_vma_insert_page() + * page_add_new_anon_rmap() + * __page_set_anon_rmap() + * ...checks page->mapping, via PageAnon(page) call, + * and incorrectly concludes that the page is an + * anonymous page. Therefore, it incorrectly, + * silently fails to set up the new anon rmap. + * + * For other types of ZONE_DEVICE pages, migration is either + * handled differently or not done at all, so there is no need + * to clear page->mapping. + */ + page->mapping = NULL; + page->pgmap->ops->page_free(page); } -EXPORT_SYMBOL(__put_devmap_managed_page); #endif /* CONFIG_DEV_PAGEMAP_OPS */ diff --git a/mm/migrate.c b/mm/migrate.c index 86873b6f38a7..edf42ed90030 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -48,6 +48,7 @@ #include <linux/page_owner.h> #include <linux/sched/mm.h> #include <linux/ptrace.h> +#include <linux/oom.h> #include <asm/tlbflush.h> @@ -986,7 +987,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, } /* - * Anonymous and movable page->mapping will be cleard by + * Anonymous and movable page->mapping will be cleared by * free_pages_prepare so don't reset it here for keeping * the type to work PageAnon, for example. */ @@ -1199,8 +1200,7 @@ out: /* * A page that has been migrated has all references * removed and will be freed. A page that has not been - * migrated will have kepts its references and be - * restored. + * migrated will have kept its references and be restored. */ list_del(&page->lru); @@ -1627,8 +1627,19 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, start = i; } else if (node != current_node) { err = do_move_pages_to_node(mm, &pagelist, current_node); - if (err) + if (err) { + /* + * Positive err means the number of failed + * pages to migrate. Since we are going to + * abort and return the number of non-migrated + * pages, so need to incude the rest of the + * nr_pages that have not been attempted as + * well. + */ + if (err > 0) + err += nr_pages - i - 1; goto out; + } err = store_status(status, start, current_node, i - start); if (err) goto out; @@ -1659,8 +1670,11 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, goto out_flush; err = do_move_pages_to_node(mm, &pagelist, current_node); - if (err) + if (err) { + if (err > 0) + err += nr_pages - i - 1; goto out; + } if (i > start) { err = store_status(status, start, current_node, i - start); if (err) @@ -1674,9 +1688,16 @@ out_flush: /* Make sure we do not overwrite the existing error */ err1 = do_move_pages_to_node(mm, &pagelist, current_node); + /* + * Don't have to report non-attempted pages here since: + * - If the above loop is done gracefully all pages have been + * attempted. + * - If the above loop is aborted it means a fatal error + * happened, should return ret. + */ if (!err1) err1 = store_status(status, start, current_node, i - start); - if (!err) + if (err >= 0) err = err1; out: return err; @@ -2135,7 +2156,7 @@ static int migrate_vma_collect_hole(unsigned long start, struct migrate_vma *migrate = walk->private; unsigned long addr; - for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { + for (addr = start; addr < end; addr += PAGE_SIZE) { migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; migrate->dst[migrate->npages] = 0; migrate->npages++; @@ -2152,7 +2173,7 @@ static int migrate_vma_collect_skip(unsigned long start, struct migrate_vma *migrate = walk->private; unsigned long addr; - for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { + for (addr = start; addr < end; addr += PAGE_SIZE) { migrate->dst[migrate->npages] = 0; migrate->src[migrate->npages++] = 0; } @@ -2675,6 +2696,14 @@ int migrate_vma_setup(struct migrate_vma *args) } EXPORT_SYMBOL(migrate_vma_setup); +/* + * This code closely matches the code in: + * __handle_mm_fault() + * handle_pte_fault() + * do_anonymous_page() + * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE + * private page. + */ static void migrate_vma_insert_page(struct migrate_vma *migrate, unsigned long addr, struct page *page, @@ -2755,30 +2784,24 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (check_stable_address_space(mm)) + goto unlock_abort; + if (pte_present(*ptep)) { unsigned long pfn = pte_pfn(*ptep); - if (!is_zero_pfn(pfn)) { - pte_unmap_unlock(ptep, ptl); - mem_cgroup_cancel_charge(page, memcg, false); - goto abort; - } + if (!is_zero_pfn(pfn)) + goto unlock_abort; flush = true; - } else if (!pte_none(*ptep)) { - pte_unmap_unlock(ptep, ptl); - mem_cgroup_cancel_charge(page, memcg, false); - goto abort; - } + } else if (!pte_none(*ptep)) + goto unlock_abort; /* - * Check for usefaultfd but do not deliver the fault. Instead, + * Check for userfaultfd but do not deliver the fault. Instead, * just back off. */ - if (userfaultfd_missing(vma)) { - pte_unmap_unlock(ptep, ptl); - mem_cgroup_cancel_charge(page, memcg, false); - goto abort; - } + if (userfaultfd_missing(vma)) + goto unlock_abort; inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, addr, false); @@ -2802,6 +2825,9 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, *src = MIGRATE_PFN_MIGRATE; return; +unlock_abort: + pte_unmap_unlock(ptep, ptl); + mem_cgroup_cancel_charge(page, memcg, false); abort: *src &= ~MIGRATE_PFN_MIGRATE; } @@ -2834,9 +2860,8 @@ void migrate_vma_pages(struct migrate_vma *migrate) } if (!page) { - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) { + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) continue; - } if (!notified) { notified = true; diff --git a/mm/mmap.c b/mm/mmap.c index bc788548c4e5..6756b8bb0033 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1270,26 +1270,22 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_ */ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { - struct anon_vma *anon_vma; - struct vm_area_struct *near; - - near = vma->vm_next; - if (!near) - goto try_prev; - - anon_vma = reusable_anon_vma(near, vma, near); - if (anon_vma) - return anon_vma; -try_prev: - near = vma->vm_prev; - if (!near) - goto none; - - anon_vma = reusable_anon_vma(near, near, vma); - if (anon_vma) - return anon_vma; -none: + struct anon_vma *anon_vma = NULL; + + /* Try next first. */ + if (vma->vm_next) { + anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next); + if (anon_vma) + return anon_vma; + } + + /* Try prev next. */ + if (vma->vm_prev) + anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma); + /* + * We might reach here with anon_vma == NULL if we can't find + * any reusable anon_vma. * There's no absolute need to look only at touching neighbours: * we could search further afield for "compatible" anon_vmas. * But it would probably just be a waste of time searching, @@ -1297,7 +1293,7 @@ none: * We're trying to allow mprotect remerging later on, * not trying to minimize memory used for anon_vmas. */ - return NULL; + return anon_vma; } /* diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d58c481b3df8..dfc357614e56 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -26,6 +26,7 @@ #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/task.h> +#include <linux/sched/debug.h> #include <linux/swap.h> #include <linux/timex.h> #include <linux/jiffies.h> @@ -620,6 +621,7 @@ static void oom_reap_task(struct task_struct *tsk) pr_info("oom_reaper: unable to reap pid:%d (%s)\n", task_pid_nr(tsk), tsk->comm); + sched_show_task(tsk); debug_show_all_locks(); done: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d047bf7d8fd4..15e908ad933b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5848,6 +5848,30 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) return false; } +#ifdef CONFIG_SPARSEMEM +/* Skip PFNs that belong to non-present sections */ +static inline __meminit unsigned long next_pfn(unsigned long pfn) +{ + unsigned long section_nr; + + section_nr = pfn_to_section_nr(++pfn); + if (present_section_nr(section_nr)) + return pfn; + + while (++section_nr <= __highest_present_section_nr) { + if (present_section_nr(section_nr)) + return section_nr_to_pfn(section_nr); + } + + return -1; +} +#else +static inline __meminit unsigned long next_pfn(unsigned long pfn) +{ + return pfn++; +} +#endif + /* * Initially all pages are reserved - free ones are freed * up by memblock_free_all() once the early boot process is @@ -5887,8 +5911,10 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * function. They do not exist on hotplugged memory. */ if (context == MEMMAP_EARLY) { - if (!early_pfn_valid(pfn)) + if (!early_pfn_valid(pfn)) { + pfn = next_pfn(pfn) - 1; continue; + } if (!early_pfn_in_nid(pfn, nid)) continue; if (overlap_memmap_init(zone, &pfn)) @@ -8154,20 +8180,22 @@ void *__init alloc_large_system_hash(const char *tablename, /* * This function checks whether pageblock includes unmovable pages or not. - * If @count is not zero, it is okay to include less @count unmovable pages * * PageLRU check without isolation or lru_lock could race so that * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable * check without lock_page also may miss some movable non-lru pages at * race condition. So you can't expect this function should be exact. + * + * Returns a page without holding a reference. If the caller wants to + * dereference that page (e.g., dumping), it has to make sure that that it + * cannot get removed (e.g., via memory unplug) concurrently. + * */ -bool has_unmovable_pages(struct zone *zone, struct page *page, int count, - int migratetype, int flags) +struct page *has_unmovable_pages(struct zone *zone, struct page *page, + int migratetype, int flags) { - unsigned long found; unsigned long iter = 0; unsigned long pfn = page_to_pfn(page); - const char *reason = "unmovable page"; /* * TODO we could make this much more efficient by not checking every @@ -8184,22 +8212,19 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * so consider them movable here. */ if (is_migrate_cma(migratetype)) - return false; + return NULL; - reason = "CMA page"; - goto unmovable; + return page; } - for (found = 0; iter < pageblock_nr_pages; iter++) { - unsigned long check = pfn + iter; - - if (!pfn_valid_within(check)) + for (; iter < pageblock_nr_pages; iter++) { + if (!pfn_valid_within(pfn + iter)) continue; - page = pfn_to_page(check); + page = pfn_to_page(pfn + iter); if (PageReserved(page)) - goto unmovable; + return page; /* * If the zone is movable and we have ruled out all reserved @@ -8219,7 +8244,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, unsigned int skip_pages; if (!hugepage_migration_supported(page_hstate(head))) - goto unmovable; + return page; skip_pages = compound_nr(head) - (page - head); iter += skip_pages - 1; @@ -8245,11 +8270,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) continue; - if (__PageMovable(page)) + if (__PageMovable(page) || PageLRU(page)) continue; - if (!PageLRU(page)) - found++; /* * If there are RECLAIMABLE pages, we need to check * it. But now, memory offline itself doesn't call @@ -8263,15 +8286,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * is set to both of a memory hole page and a _used_ kernel * page at boot. */ - if (found > count) - goto unmovable; + return page; } - return false; -unmovable: - WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); - if (flags & REPORT_FAILURE) - dump_page(pfn_to_page(pfn + iter), reason); - return true; + return NULL; } #ifdef CONFIG_CONTIG_ALLOC @@ -8675,10 +8692,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(!PageBuddy(page)); order = page_order(page); offlined_pages += 1 << order; -#ifdef CONFIG_DEBUG_VM - pr_info("remove from free list %lx %d %lx\n", - pfn, 1 << order, end_pfn); -#endif del_page_from_free_area(page, &zone->free_area[order]); pfn += (1 << order); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 04ee1663cdbe..a9fd7c740c23 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -17,10 +17,9 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags) { + struct page *unmovable = NULL; struct zone *zone; - unsigned long flags, pfn; - struct memory_isolate_notify arg; - int notifier_ret; + unsigned long flags; int ret = -EBUSY; zone = page_zone(page); @@ -35,41 +34,12 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ if (is_migrate_isolate_page(page)) goto out; - pfn = page_to_pfn(page); - arg.start_pfn = pfn; - arg.nr_pages = pageblock_nr_pages; - arg.pages_found = 0; - - /* - * It may be possible to isolate a pageblock even if the - * migratetype is not MIGRATE_MOVABLE. The memory isolation - * notifier chain is used by balloon drivers to return the - * number of pages in a range that are held by the balloon - * driver to shrink memory. If all the pages are accounted for - * by balloons, are free, or on the LRU, isolation can continue. - * Later, for example, when memory hotplug notifier runs, these - * pages reported as "can be isolated" should be isolated(freed) - * by the balloon driver through the memory notifier chain. - */ - notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); - notifier_ret = notifier_to_errno(notifier_ret); - if (notifier_ret) - goto out; /* * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. * We just check MOVABLE pages. */ - if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, - isol_flags)) - ret = 0; - - /* - * immobile means "not-on-lru" pages. If immobile is larger than - * removable-by-driver pages reported by notifier, we'll fail. - */ - -out: - if (!ret) { + unmovable = has_unmovable_pages(zone, page, migratetype, isol_flags); + if (!unmovable) { unsigned long nr_pages; int mt = get_pageblock_migratetype(page); @@ -79,11 +49,24 @@ out: NULL); __mod_zone_freepage_state(zone, -nr_pages, mt); + ret = 0; } +out: spin_unlock_irqrestore(&zone->lock, flags); - if (!ret) + if (!ret) { drain_all_pages(zone); + } else { + WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); + + if ((isol_flags & REPORT_FAILURE) && unmovable) + /* + * printk() with zone->lock held will likely trigger a + * lockdep splat, so defer it here. + */ + dump_page(unmovable, "unmovable page"); + } + return ret; } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index eff4b4520c8d..719c35246cfa 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -52,12 +52,16 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw) return true; } -static inline bool pfn_in_hpage(struct page *hpage, unsigned long pfn) +static inline bool pfn_is_match(struct page *page, unsigned long pfn) { - unsigned long hpage_pfn = page_to_pfn(hpage); + unsigned long page_pfn = page_to_pfn(page); + + /* normal page and hugetlbfs page */ + if (!PageTransCompound(page) || PageHuge(page)) + return page_pfn == pfn; /* THP can be referenced by any subpage */ - return pfn >= hpage_pfn && pfn - hpage_pfn < hpage_nr_pages(hpage); + return pfn >= page_pfn && pfn - page_pfn < hpage_nr_pages(page); } /** @@ -108,7 +112,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) pfn = pte_pfn(*pvmw->pte); } - return pfn_in_hpage(pvmw->page, pfn); + return pfn_is_match(pvmw->page, pfn); } /** diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 357aa7bef6c0..de41e830cdac 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -42,12 +42,11 @@ static int process_vm_rw_pages(struct page **pages, if (copy > len) copy = len; - if (vm_write) { + if (vm_write) copied = copy_page_from_iter(page, offset, copy, iter); - set_page_dirty_lock(page); - } else { + else copied = copy_page_to_iter(page, offset, copy, iter); - } + len -= copied; if (copied < copy && iov_iter_count(iter)) return -EFAULT; @@ -96,7 +95,7 @@ static int process_vm_rw_single_vec(unsigned long addr, flags |= FOLL_WRITE; while (!rc && nr_pages && iov_iter_count(iter)) { - int pages = min(nr_pages, max_pages_per_loop); + int pinned_pages = min(nr_pages, max_pages_per_loop); int locked = 1; size_t bytes; @@ -106,14 +105,15 @@ static int process_vm_rw_single_vec(unsigned long addr, * current/current->mm */ down_read(&mm->mmap_sem); - pages = get_user_pages_remote(task, mm, pa, pages, flags, - process_pages, NULL, &locked); + pinned_pages = pin_user_pages_remote(task, mm, pa, pinned_pages, + flags, process_pages, + NULL, &locked); if (locked) up_read(&mm->mmap_sem); - if (pages <= 0) + if (pinned_pages <= 0) return -EFAULT; - bytes = pages * PAGE_SIZE - start_offset; + bytes = pinned_pages * PAGE_SIZE - start_offset; if (bytes > len) bytes = len; @@ -122,10 +122,12 @@ static int process_vm_rw_single_vec(unsigned long addr, vm_write); len -= bytes; start_offset = 0; - nr_pages -= pages; - pa += pages * PAGE_SIZE; - while (pages) - put_page(process_pages[--pages]); + nr_pages -= pinned_pages; + pa += pinned_pages * PAGE_SIZE; + + /* If vm_write is set, the pages need to be made dirty: */ + unpin_user_pages_dirty_lock(process_pages, pinned_pages, + vm_write); } return rc; diff --git a/mm/slub.c b/mm/slub.c index 0ab92ec8c2a6..17dc00e33115 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -439,19 +439,38 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, } #ifdef CONFIG_SLUB_DEBUG +static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; +static DEFINE_SPINLOCK(object_map_lock); + /* * Determine a map of object in use on a page. * * Node listlock must be held to guarantee that the page does * not vanish from under us. */ -static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) +static unsigned long *get_map(struct kmem_cache *s, struct page *page) { void *p; void *addr = page_address(page); + VM_BUG_ON(!irqs_disabled()); + + spin_lock(&object_map_lock); + + bitmap_zero(object_map, page->objects); + for (p = page->freelist; p; p = get_freepointer(s, p)) - set_bit(slab_index(p, s, addr), map); + set_bit(slab_index(p, s, addr), object_map); + + return object_map; +} + +static void put_map(unsigned long *map) +{ + VM_BUG_ON(map != object_map); + lockdep_assert_held(&object_map_lock); + + spin_unlock(&object_map_lock); } static inline unsigned int size_from_object(struct kmem_cache *s) @@ -3675,13 +3694,12 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); void *p; - unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC); - if (!map) - return; + unsigned long *map; + slab_err(s, page, text, s->name); slab_lock(page); - get_map(s, page, map); + map = get_map(s, page); for_each_object(p, s, addr, page->objects) { if (!test_bit(slab_index(p, s, addr), map)) { @@ -3689,8 +3707,9 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, print_tracking(s, p); } } + put_map(map); + slab_unlock(page); - bitmap_free(map); #endif } @@ -4384,19 +4403,19 @@ static int count_total(struct page *page) #endif #ifdef CONFIG_SLUB_DEBUG -static void validate_slab(struct kmem_cache *s, struct page *page, - unsigned long *map) +static void validate_slab(struct kmem_cache *s, struct page *page) { void *p; void *addr = page_address(page); + unsigned long *map; + + slab_lock(page); if (!check_slab(s, page) || !on_freelist(s, page, NULL)) - return; + goto unlock; /* Now we know that a valid freelist exists */ - bitmap_zero(map, page->objects); - - get_map(s, page, map); + map = get_map(s, page); for_each_object(p, s, addr, page->objects) { u8 val = test_bit(slab_index(p, s, addr), map) ? SLUB_RED_INACTIVE : SLUB_RED_ACTIVE; @@ -4404,18 +4423,13 @@ static void validate_slab(struct kmem_cache *s, struct page *page, if (!check_object(s, page, p, val)) break; } -} - -static void validate_slab_slab(struct kmem_cache *s, struct page *page, - unsigned long *map) -{ - slab_lock(page); - validate_slab(s, page, map); + put_map(map); +unlock: slab_unlock(page); } static int validate_slab_node(struct kmem_cache *s, - struct kmem_cache_node *n, unsigned long *map) + struct kmem_cache_node *n) { unsigned long count = 0; struct page *page; @@ -4424,7 +4438,7 @@ static int validate_slab_node(struct kmem_cache *s, spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, slab_list) { - validate_slab_slab(s, page, map); + validate_slab(s, page); count++; } if (count != n->nr_partial) @@ -4435,7 +4449,7 @@ static int validate_slab_node(struct kmem_cache *s, goto out; list_for_each_entry(page, &n->full, slab_list) { - validate_slab_slab(s, page, map); + validate_slab(s, page); count++; } if (count != atomic_long_read(&n->nr_slabs)) @@ -4452,15 +4466,11 @@ static long validate_slab_cache(struct kmem_cache *s) int node; unsigned long count = 0; struct kmem_cache_node *n; - unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); - - if (!map) - return -ENOMEM; flush_all(s); for_each_kmem_cache_node(s, node, n) - count += validate_slab_node(s, n, map); - bitmap_free(map); + count += validate_slab_node(s, n); + return count; } /* @@ -4590,18 +4600,17 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, } static void process_slab(struct loc_track *t, struct kmem_cache *s, - struct page *page, enum track_item alloc, - unsigned long *map) + struct page *page, enum track_item alloc) { void *addr = page_address(page); void *p; + unsigned long *map; - bitmap_zero(map, page->objects); - get_map(s, page, map); - + map = get_map(s, page); for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) add_location(t, s, get_track(s, p, alloc)); + put_map(map); } static int list_locations(struct kmem_cache *s, char *buf, @@ -4612,11 +4621,9 @@ static int list_locations(struct kmem_cache *s, char *buf, struct loc_track t = { 0, 0, NULL }; int node; struct kmem_cache_node *n; - unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); - if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), - GFP_KERNEL)) { - bitmap_free(map); + if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), + GFP_KERNEL)) { return sprintf(buf, "Out of memory\n"); } /* Push back cpu slabs */ @@ -4631,9 +4638,9 @@ static int list_locations(struct kmem_cache *s, char *buf, spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, slab_list) - process_slab(&t, s, page, alloc, map); + process_slab(&t, s, page, alloc); list_for_each_entry(page, &n->full, slab_list) - process_slab(&t, s, page, alloc, map); + process_slab(&t, s, page, alloc); spin_unlock_irqrestore(&n->list_lock, flags); } @@ -4682,7 +4689,6 @@ static int list_locations(struct kmem_cache *s, char *buf, } free_loc_track(&t); - bitmap_free(map); if (!t.count) len += sprintf(buf, "No data\n"); return len; diff --git a/mm/sparse.c b/mm/sparse.c index 3822ecbd8a1f..3918fc3eaef1 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -789,7 +789,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, ms->usage = NULL; } memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); - ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr); + ms->section_mem_map = (unsigned long)NULL; } if (section_is_early && memmap) diff --git a/mm/swap.c b/mm/swap.c index 5341ae93861f..cf39d24ada2a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -813,8 +813,10 @@ void release_pages(struct page **pages, int nr) * processing, and instead, expect a call to * put_page_testzero(). */ - if (put_devmap_managed_page(page)) + if (page_is_devmap_managed(page)) { + put_devmap_managed_page(page); continue; + } } page = compound_head(page); @@ -1102,3 +1104,26 @@ void __init swap_setup(void) * _really_ don't want to cluster much more */ } + +#ifdef CONFIG_DEV_PAGEMAP_OPS +void put_devmap_managed_page(struct page *page) +{ + int count; + + if (WARN_ON_ONCE(!page_is_devmap_managed(page))) + return; + + count = page_ref_dec_return(page); + + /* + * devmap page refcounts are 1-based, rather than 0-based: if + * refcount is 1, then the page is free and the refcount is + * stable because nobody holds a reference on the page. + */ + if (count == 1) + free_devmap_managed_page(page); + else if (!count) + __put_page(page); +} +EXPORT_SYMBOL(put_devmap_managed_page); +#endif diff --git a/mm/swapfile.c b/mm/swapfile.c index bb3261d45b6a..6febae9ad3cd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2737,10 +2737,10 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) else type = si->type + 1; + ++(*pos); for (; (si = swap_type_to_swap_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; - ++*pos; return si; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 572fb17c6273..c05eb9efec07 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -146,20 +146,6 @@ struct scan_control { struct reclaim_state reclaim_state; }; -#ifdef ARCH_HAS_PREFETCH -#define prefetch_prev_lru_page(_page, _base, _field) \ - do { \ - if ((_page)->lru.prev != _base) { \ - struct page *prev; \ - \ - prev = lru_to_page(&(_page->lru)); \ - prefetch(&prev->_field); \ - } \ - } while (0) -#else -#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) -#endif - #ifdef ARCH_HAS_PREFETCHW #define prefetchw_prev_lru_page(_page, _base, _field) \ do { \ @@ -2695,7 +2681,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); } -static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) +static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long nr_reclaimed, nr_scanned; @@ -2874,8 +2860,6 @@ again: */ if (reclaimable) pgdat->kswapd_failures = 0; - - return reclaimable; } /* @@ -4126,10 +4110,8 @@ module_init(kswapd_init) */ int node_reclaim_mode __read_mostly; -#define RECLAIM_OFF 0 -#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ -#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ -#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ +#define RECLAIM_WRITE (1<<0) /* Writeout pages during reclaim */ +#define RECLAIM_UNMAP (1<<1) /* Unmap pages during reclaim */ /* * Priority for NODE_RECLAIM. This determines the fraction of pages diff --git a/mm/zswap.c b/mm/zswap.c index 46a322316e52..55094e63b72d 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -32,6 +32,7 @@ #include <linux/swapops.h> #include <linux/writeback.h> #include <linux/pagemap.h> +#include <linux/workqueue.h> /********************************* * statistics @@ -65,6 +66,11 @@ static u64 zswap_reject_kmemcache_fail; /* Duplicate store was encountered (rare) */ static u64 zswap_duplicate_entry; +/* Shrinker work queue */ +static struct workqueue_struct *shrink_wq; +/* Pool limit was hit, we need to calm down */ +static bool zswap_pool_reached_full; + /********************************* * tunables **********************************/ @@ -109,6 +115,11 @@ module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); +/* The threshold for accepting new pages after the max_pool_percent was hit */ +static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */ +module_param_named(accept_threshold_percent, zswap_accept_thr_percent, + uint, 0644); + /* Enable/disable handling same-value filled pages (enabled by default) */ static bool zswap_same_filled_pages_enabled = true; module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, @@ -123,7 +134,8 @@ struct zswap_pool { struct crypto_comp * __percpu *tfm; struct kref kref; struct list_head list; - struct work_struct work; + struct work_struct release_work; + struct work_struct shrink_work; struct hlist_node node; char tfm_name[CRYPTO_MAX_ALG_NAME]; }; @@ -214,6 +226,13 @@ static bool zswap_is_full(void) DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } +static bool zswap_can_accept(void) +{ + return totalram_pages() * zswap_accept_thr_percent / 100 * + zswap_max_pool_percent / 100 > + DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); +} + static void zswap_update_total_size(void) { struct zswap_pool *pool; @@ -501,6 +520,16 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) return NULL; } +static void shrink_worker(struct work_struct *w) +{ + struct zswap_pool *pool = container_of(w, typeof(*pool), + shrink_work); + + if (zpool_shrink(pool->zpool, 1, NULL)) + zswap_reject_reclaim_fail++; + zswap_pool_put(pool); +} + static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { struct zswap_pool *pool; @@ -551,6 +580,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) */ kref_init(&pool->kref); INIT_LIST_HEAD(&pool->list); + INIT_WORK(&pool->shrink_work, shrink_worker); zswap_pool_debug("created", pool); @@ -624,7 +654,8 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool) static void __zswap_pool_release(struct work_struct *work) { - struct zswap_pool *pool = container_of(work, typeof(*pool), work); + struct zswap_pool *pool = container_of(work, typeof(*pool), + release_work); synchronize_rcu(); @@ -647,8 +678,8 @@ static void __zswap_pool_empty(struct kref *kref) list_del_rcu(&pool->list); - INIT_WORK(&pool->work, __zswap_pool_release); - schedule_work(&pool->work); + INIT_WORK(&pool->release_work, __zswap_pool_release); + schedule_work(&pool->release_work); spin_unlock(&zswap_pools_lock); } @@ -942,22 +973,6 @@ end: return ret; } -static int zswap_shrink(void) -{ - struct zswap_pool *pool; - int ret; - - pool = zswap_pool_last_get(); - if (!pool) - return -ENOENT; - - ret = zpool_shrink(pool->zpool, 1, NULL); - - zswap_pool_put(pool); - - return ret; -} - static int zswap_is_page_same_filled(void *ptr, unsigned long *value) { unsigned int pos; @@ -1011,21 +1026,23 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* reclaim space if needed */ if (zswap_is_full()) { + struct zswap_pool *pool; + zswap_pool_limit_hit++; - if (zswap_shrink()) { - zswap_reject_reclaim_fail++; - ret = -ENOMEM; - goto reject; - } + zswap_pool_reached_full = true; + pool = zswap_pool_last_get(); + if (pool) + queue_work(shrink_wq, &pool->shrink_work); + ret = -ENOMEM; + goto reject; + } - /* A second zswap_is_full() check after - * zswap_shrink() to make sure it's now - * under the max_pool_percent - */ - if (zswap_is_full()) { + if (zswap_pool_reached_full) { + if (!zswap_can_accept()) { ret = -ENOMEM; goto reject; - } + } else + zswap_pool_reached_full = false; } /* allocate entry */ @@ -1332,11 +1349,18 @@ static int __init init_zswap(void) zswap_enabled = false; } + shrink_wq = create_workqueue("zswap-shrink"); + if (!shrink_wq) + goto fallback_fail; + frontswap_register_ops(&zswap_frontswap_ops); if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); return 0; +fallback_fail: + if (pool) + zswap_pool_destroy(pool); hp_fail: cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE); dstmem_fail: |