From 264a0ae164bc0e9144bebcd25ff030d067b1a878 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 21 Apr 2016 19:09:02 -0400 Subject: memcg: relocate charge moving from ->attach to ->post_attach Hello, So, this ended up a lot simpler than I originally expected. I tested it lightly and it seems to work fine. Petr, can you please test these two patches w/o the lru drain drop patch and see whether the problem is gone? Thanks. ------ 8< ------ If charge moving is used, memcg performs relabeling of the affected pages from its ->attach callback which is called under both cgroup_threadgroup_rwsem and thus can't create new kthreads. This is fragile as various operations may depend on workqueues making forward progress which relies on the ability to create new kthreads. There's no reason to perform charge moving from ->attach which is deep in the task migration path. Move it to ->post_attach which is called after the actual migration is finished and cgroup_threadgroup_rwsem is dropped. * move_charge_struct->mm is added and ->can_attach is now responsible for pinning and recording the target mm. mem_cgroup_clear_mc() is updated accordingly. This also simplifies mem_cgroup_move_task(). * mem_cgroup_move_task() is now called from ->post_attach instead of ->attach. Signed-off-by: Tejun Heo Cc: Johannes Weiner Acked-by: Michal Hocko Debugged-and-tested-by: Petr Mladek Reported-by: Cyril Hrubis Reported-by: Johannes Weiner Fixes: 1ed1328792ff ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem") Cc: # 4.4+ --- mm/memcontrol.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 36db05fa8acb..fe787f5c41bd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -207,6 +207,7 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); /* "mc" and its members are protected by cgroup_mutex */ static struct move_charge_struct { spinlock_t lock; /* for from, to */ + struct mm_struct *mm; struct mem_cgroup *from; struct mem_cgroup *to; unsigned long flags; @@ -4667,6 +4668,8 @@ static void __mem_cgroup_clear_mc(void) static void mem_cgroup_clear_mc(void) { + struct mm_struct *mm = mc.mm; + /* * we must clear moving_task before waking up waiters at the end of * task migration. 
@@ -4676,7 +4679,10 @@ static void mem_cgroup_clear_mc(void) spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; + mc.mm = NULL; spin_unlock(&mc.lock); + + mmput(mm); } static int mem_cgroup_can_attach(struct cgroup_taskset *tset) @@ -4733,6 +4739,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) VM_BUG_ON(mc.moved_swap); spin_lock(&mc.lock); + mc.mm = mm; mc.from = from; mc.to = memcg; mc.flags = move_flags; @@ -4742,8 +4749,9 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) ret = mem_cgroup_precharge_mc(mm); if (ret) mem_cgroup_clear_mc(); + } else { + mmput(mm); } - mmput(mm); return ret; } @@ -4852,11 +4860,11 @@ put: /* get_mctgt_type() gets the page */ return ret; } -static void mem_cgroup_move_charge(struct mm_struct *mm) +static void mem_cgroup_move_charge(void) { struct mm_walk mem_cgroup_move_charge_walk = { .pmd_entry = mem_cgroup_move_charge_pte_range, - .mm = mm, + .mm = mc.mm, }; lru_add_drain_all(); @@ -4868,7 +4876,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) atomic_inc(&mc.from->moving_account); synchronize_rcu(); retry: - if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) { /* * Someone who are holding the mmap_sem might be waiting in * waitq. So we cancel all extra charges, wake up all waiters, @@ -4885,23 +4893,16 @@ retry: * additional charge, the page walk just aborts. */ walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); - up_read(&mm->mmap_sem); + up_read(&mc.mm->mmap_sem); atomic_dec(&mc.from->moving_account); } -static void mem_cgroup_move_task(struct cgroup_taskset *tset) +static void mem_cgroup_move_task(void) { - struct cgroup_subsys_state *css; - struct task_struct *p = cgroup_taskset_first(tset, &css); - struct mm_struct *mm = get_task_mm(p); - - if (mm) { - if (mc.to) - mem_cgroup_move_charge(mm); - mmput(mm); - } - if (mc.to) + if (mc.to) { + mem_cgroup_move_charge(); mem_cgroup_clear_mc(); + } } #else /* !CONFIG_MMU */ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) @@ -4911,7 +4912,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup_taskset *tset) +static void mem_cgroup_move_task(void) { } #endif @@ -5195,7 +5196,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, - .attach = mem_cgroup_move_task, + .post_attach = mem_cgroup_move_task, .bind = mem_cgroup_bind, .dfl_cftypes = memory_files, .legacy_cftypes = mem_cgroup_legacy_files, -- cgit v1.2.3 From aa88b68c3b1dce8bc3fd54c8a7372a777ff265cd Mon Sep 17 00:00:00 2001 From: Kirill A. Shutemov Date: Thu, 28 Apr 2016 16:18:27 -0700 Subject: thp: keep huge zero page pinned until tlb flush Andrea has found[1] a race condition on MMU-gather based TLB flush vs split_huge_page() or shrinker which frees huge zero under us (patch 1/2 and 2/2 respectively). With new THP refcounting, we don't need patch 1/2: mmu_gather keeps the page pinned until flush is complete and the pin prevents the page from being split under us. We still need patch 2/2. This is simplified version of Andrea's patch. We don't need fancy encoding. [1] http://lkml.kernel.org/r/1447938052-22165-1-git-send-email-aarcange@redhat.com Signed-off-by: Kirill A. 
Shutemov Reported-by: Andrea Arcangeli Reviewed-by: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Mel Gorman Cc: Hugh Dickins Cc: Johannes Weiner Cc: Dave Hansen Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 5 +++++ mm/huge_memory.c | 6 +++--- mm/swap.c | 5 +++++ 3 files changed, 13 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7008623e24b1..d7b9e5346fba 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -152,6 +152,7 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) } struct page *get_huge_zero_page(void); +void put_huge_zero_page(void); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) @@ -208,6 +209,10 @@ static inline bool is_huge_zero_page(struct page *page) return false; } +static inline void put_huge_zero_page(void) +{ + BUILD_BUG(); +} static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 86f9f8b82f8e..5346de05f471 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -232,7 +232,7 @@ retry: return READ_ONCE(huge_zero_page); } -static void put_huge_zero_page(void) +void put_huge_zero_page(void) { /* * Counter should never go to zero here. Only shrinker can put @@ -1684,12 +1684,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (vma_is_dax(vma)) { spin_unlock(ptl); if (is_huge_zero_pmd(orig_pmd)) - put_huge_zero_page(); + tlb_remove_page(tlb, pmd_page(orig_pmd)); } else if (is_huge_zero_pmd(orig_pmd)) { pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); atomic_long_dec(&tlb->mm->nr_ptes); spin_unlock(ptl); - put_huge_zero_page(); + tlb_remove_page(tlb, pmd_page(orig_pmd)); } else { struct page *page = pmd_page(orig_pmd); page_remove_rmap(page, true); diff --git a/mm/swap.c b/mm/swap.c index a0bc206b4ac6..03aacbcb013f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -728,6 +728,11 @@ void release_pages(struct page **pages, int nr, bool cold) zone = NULL; } + if (is_huge_zero_page(page)) { + put_huge_zero_page(); + continue; + } + page = compound_head(page); if (!put_page_testzero(page)) continue; -- cgit v1.2.3 From 3486b85a29c1741db99d0c522211c82d2b7a56d0 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 28 Apr 2016 16:18:32 -0700 Subject: mm/huge_memory: replace VM_NO_THP VM_BUG_ON with actual VMA check Khugepaged detects its own VMAs by checking vm_file and vm_ops, but this way it cannot distinguish private /dev/zero mappings from other special mappings like /dev/hpet, which has no vm_ops and populates PTEs in mmap. This fixes the false-positive VM_BUG_ON and prevents THP from being installed where it is not expected. Link: http://lkml.kernel.org/r/CACT4Y+ZmuZMV5CjSFOeXviwQdABAgT7T+StKfTqan9YDtgEi5g@mail.gmail.com Fixes: 78f11a255749 ("mm: thp: fix /dev/zero MAP_PRIVATE and vm_flags cleanups") Signed-off-by: Konstantin Khlebnikov Reported-by: Dmitry Vyukov Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Dmitry Vyukov Cc: Andrea Arcangeli Cc: stable Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5346de05f471..df67b53ae3c5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1960,10 +1960,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma, * page fault if needed.
*/ return 0; - if (vma->vm_ops) + if (vma->vm_ops || (vm_flags & VM_NO_THP)) /* khugepaged not yet working on file or special mappings */ return 0; - VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) @@ -2352,8 +2351,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) return false; if (is_vma_temporary_stack(vma)) return false; - VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); - return true; + return !(vma->vm_flags & VM_NO_THP); } static void collapse_huge_page(struct mm_struct *mm, -- cgit v1.2.3 From 28093f9f34cedeaea0f481c58446d9dac6dd620f Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 28 Apr 2016 16:18:35 -0700 Subject: numa: fix /proc//numa_maps for THP In gather_pte_stats() a THP pmd is cast into a pte, which is wrong because the layouts may differ depending on the architecture. On s390 this will lead to inaccurate numa_maps accounting in /proc because of misguided pte_present() and pte_dirty() checks on the fake pte. On other architectures pte_present() and pte_dirty() may work by chance, but there may be an issue with direct-access (dax) mappings w/o underlying struct pages when HAVE_PTE_SPECIAL is set and THP is available. In vm_normal_page() the fake pte will be checked with pte_special() and because there is no "special" bit in a pmd, this will always return false and the VM_PFNMAP | VM_MIXEDMAP checking will be skipped. On dax mappings w/o struct pages, an invalid struct page pointer would then be returned that can crash the kernel. This patch fixes the numa_maps THP handling by introducing new "_pmd" variants of the can_gather_numa_stats() and vm_normal_page() functions. Signed-off-by: Gerald Schaefer Cc: Naoya Horiguchi Cc: "Kirill A . 
Shutemov" Cc: Konstantin Khlebnikov Cc: Michal Hocko Cc: Vlastimil Babka Cc: Jerome Marchand Cc: Johannes Weiner Cc: Dave Hansen Cc: Mel Gorman Cc: Dan Williams Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Michael Holzheu Cc: [4.3+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 33 ++++++++++++++++++++++++++++++--- include/linux/mm.h | 2 ++ mm/memory.c | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 229cb546bee0..541583510cfb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1518,6 +1518,32 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, return page; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static struct page *can_gather_numa_stats_pmd(pmd_t pmd, + struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + int nid; + + if (!pmd_present(pmd)) + return NULL; + + page = vm_normal_page_pmd(vma, addr, pmd); + if (!page) + return NULL; + + if (PageReserved(page)) + return NULL; + + nid = page_to_nid(page); + if (!node_isset(nid, node_states[N_MEMORY])) + return NULL; + + return page; +} +#endif + static int gather_pte_stats(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -1527,14 +1553,14 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, pte_t *orig_pte; pte_t *pte; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { - pte_t huge_pte = *(pte_t *)pmd; struct page *page; - page = can_gather_numa_stats(huge_pte, vma, addr); + page = can_gather_numa_stats_pmd(*pmd, vma, addr); if (page) - gather_stats(page, md, pte_dirty(huge_pte), + gather_stats(page, md, pmd_dirty(*pmd), HPAGE_PMD_SIZE/PAGE_SIZE); spin_unlock(ptl); return 0; @@ -1542,6 +1568,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; +#endif orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); do { struct page *page = can_gather_numa_stats(*pte, vma, addr); diff --git a/include/linux/mm.h b/include/linux/mm.h index 79b6c18d0a38..864d7221de84 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1140,6 +1140,8 @@ struct zap_details { struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); +struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t pmd); int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); diff --git a/mm/memory.c b/mm/memory.c index 93897f23cc11..305537fc8640 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -789,6 +789,46 @@ out: return pfn_to_page(pfn); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t pmd) +{ + unsigned long pfn = pmd_pfn(pmd); + + /* + * There is no pmd_special() but there may be special pmds, e.g. + * in a direct-access (dax) mapping, so let's just replicate the + * !HAVE_PTE_SPECIAL case from vm_normal_page() here. + */ + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + if (!pfn_valid(pfn)) + return NULL; + goto out; + } else { + unsigned long off; + off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } + } + + if (is_zero_pfn(pfn)) + return NULL; + if (unlikely(pfn > highest_memmap_pfn)) + return NULL; + + /* + * NOTE! 
We still have PageReserved() pages in the page tables. + * eg. VDSO mappings can cause them to exist. + */ +out: + return pfn_to_page(pfn); +} +#endif + /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range -- cgit v1.2.3 From 7bf52fb891b64b8d61caf0b82060adb9db761aec Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 28 Apr 2016 16:18:38 -0700 Subject: mm: vmscan: reclaim highmem zone if buffer_heads is over limit We used to reclaim the highmem zone when buffer_heads was over its limit, but commit 6b4f7799c6a5 ("mm: vmscan: invoke slab shrinkers from shrink_zone()") changed the behavior so that the highmem zone is no longer reclaimed even though buffer_heads is over the limit. This patch restores the logic. Fixes: 6b4f7799c6a5 ("mm: vmscan: invoke slab shrinkers from shrink_zone()") Signed-off-by: Minchan Kim Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index b934223eaa45..c638b28310fc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2553,7 +2553,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) sc->gfp_mask |= __GFP_HIGHMEM; for_each_zone_zonelist_nodemask(zone, z, zonelist, - requested_highidx, sc->nodemask) { + gfp_zone(sc->gfp_mask), sc->nodemask) { enum zone_type classzone_idx; if (!populated_zone(zone)) -- cgit v1.2.3 From b06bad17c7435b600a1d7a35b56eff25e1d3dbc0 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 28 Apr 2016 16:18:41 -0700 Subject: mm: call swap_slot_free_notify() with page lock held Kyeongdon reported the error below, which is the BUG_ON(!PageSwapCache(page)) in page_swap_info(). The reason is that page_endio() in rw_page() unlocks the page once the read I/O is completed, so we need to take the page lock again before checking PageSwapCache. Otherwise, the page can be removed from the swapcache. Kernel BUG at c00f9040 [verbose debug info unavailable] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP ARM Modules linked in: CPU: 4 PID: 13446 Comm: RenderThread Tainted: G W 3.10.84-g9f14aec-dirty #73 task: c3b73200 ti: dd192000 task.ti: dd192000 PC is at page_swap_info+0x10/0x2c LR is at swap_slot_free_notify+0x18/0x6c pc : [] lr : [] psr: 400f0113 sp : dd193d78 ip : c2deb1e4 fp : da015180 r10: 00000000 r9 : 000200da r8 : c120fe08 r7 : 00000000 r6 : 00000000 r5 : c249a6c0 r4 : c249a6c0 r3 : 00000000 r2 : 40080009 r1 : 200f0113 r0 : c249a6c0 .. ..
Call Trace: page_swap_info+0x10/0x2c swap_slot_free_notify+0x18/0x6c swap_readpage+0x90/0x11c read_swap_cache_async+0x134/0x1ac swapin_readahead+0x70/0xb0 handle_pte_fault+0x320/0x6fc handle_mm_fault+0xc0/0xf0 do_page_fault+0x11c/0x36c do_DataAbort+0x34/0x118 Fixes: 3f2b1a04f44933f2 ("zram: revive swap_slot_free_notify") Signed-off-by: Minchan Kim Tested-by: Kyeongdon Kim Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_io.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index cd92e3d67a32..985f23cfa79b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -353,7 +353,11 @@ int swap_readpage(struct page *page) ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); if (!ret) { - swap_slot_free_notify(page); + if (trylock_page(page)) { + swap_slot_free_notify(page); + unlock_page(page); + } + count_vm_event(PSWPIN); return 0; } -- cgit v1.2.3 From d7e69488bd04de165667f6bc741c1c0ec6042ab9 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 28 Apr 2016 16:18:44 -0700 Subject: mm/hwpoison: fix wrong num_poisoned_pages accounting Currently, the migration code increases num_poisoned_pages for a *failed* migration page as well as for a successfully migrated one at the trial of memory-failure. This makes the stat wrong. As well, it marks the page as PG_HWPoison even if the migration trial failed. That would mean we cannot recover the corrupted page using the memory-failure facility. This patch fixes it. Signed-off-by: Minchan Kim Reported-by: Vlastimil Babka Acked-by: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 6c822a7b27e0..f9dfb18a4eba 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -975,7 +975,13 @@ out: dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); /* Soft-offlined page shouldn't go through lru cache list */ - if (reason == MR_MEMORY_FAILURE) { + if (reason == MR_MEMORY_FAILURE && rc == MIGRATEPAGE_SUCCESS) { + /* + * With this release, we free successfully migrated + * page and set PG_HWPoison on just freed page + * intentionally. Although it's rather weird, it's how + * HWPoison flag works at the moment. + */ put_page(page); if (!test_set_page_hwpoison(page)) num_poisoned_pages_inc(); -- cgit v1.2.3 From fd901c95388b3bd5a6f749ed1d677a672b992298 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 28 Apr 2016 16:18:49 -0700 Subject: mm: wake kcompactd before kswapd's short sleep When kswapd goes to sleep it checks if the node is balanced and at first it sleeps only for HZ/10 time, then rechecks if the node is still balanced and nobody has woken it during the initial sleep. Only then does it go fully to sleep until an allocation slowpath wakes it up again. For higher-order allocations, waking up kcompactd is done only before the full sleep. This turns out to be an issue in case another high-order allocation fails during the initial sleep. It will wake kswapd up, however kswapd considers the zone balanced from the order-0 perspective, and will just quickly try to sleep again. So if there's a longer stream of high-order allocations hitting the slowpath and waking up kswapd, it might never actually wake up kcompactd, which may be considered a regression from kswapd-based compaction.
In the worst case, it might be that a single allocation that cannot direct reclaim/compact itself is waking kswapd in the retry loop and preventing kcompactd from being woken up and unblocking it. This patch makes sure kcompactd is woken up in such situations by simply moving the wakeup before the short initial sleep. More efficient solution would be to wake kcompactd immediately instead of kswapd if the node is already order-0 balanced, but in that case we should also move reset_isolation_suitable() call to kcompactd so it's not adding to the allocator's latency. Since it's late in the 4.6 cycle, let's go with the simpler change for now. Fixes: accf62422b3a ("mm, kswapd: replace kswapd compaction with waking up kcompactd") Signed-off-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index c638b28310fc..142cb61f4822 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3318,6 +3318,20 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, /* Try to sleep for a short interval */ if (prepare_kswapd_sleep(pgdat, order, remaining, balanced_classzone_idx)) { + /* + * Compaction records what page blocks it recently failed to + * isolate pages from and skips them in the future scanning. + * When kswapd is going to sleep, it is reasonable to assume + * that pages and compaction may succeed so reset the cache. + */ + reset_isolation_suitable(pgdat); + + /* + * We have freed the memory, now we should compact it to make + * allocation of the requested order possible. + */ + wakeup_kcompactd(pgdat, order, classzone_idx); + remaining = schedule_timeout(HZ/10); finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); @@ -3341,20 +3355,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, */ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); - /* - * Compaction records what page blocks it recently failed to - * isolate pages from and skips them in the future scanning. - * When kswapd is going to sleep, it is reasonable to assume - * that pages and compaction may succeed so reset the cache. - */ - reset_isolation_suitable(pgdat); - - /* - * We have freed the memory, now we should compact it to make - * allocation of the requested order possible. - */ - wakeup_kcompactd(pgdat, order, classzone_idx); - if (!kthread_should_stop()) schedule(); -- cgit v1.2.3 From c2e7e00b715d3c65f301bec8559d6af4ef8098ab Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 28 Apr 2016 16:19:03 -0700 Subject: mm/memory-failure: fix race with compound page split/merge get_hwpoison_page() must recheck relation between head and tail pages. n-horiguchi said: without this recheck, the race causes kernel to pin an irrelevant page, and finally makes kernel crash for refcount mismatch. 
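For illustration, the race being closed is roughly the following interleaving (a sketch using the functions named in this patch, not code taken from the tree):

    /* CPU A: get_hwpoison_page(page)         CPU B                                */
    head = compound_head(page);
    /*                                        split_huge_page(head), so page       */
    /*                                        stops being a tail of head           */
    get_page_unless_zero(head);               /* pins a now-irrelevant page        */

Rechecking head == compound_head(page) after the pin, as the hunk below does, detects this and drops the bogus reference instead of reporting success.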
Signed-off-by: Konstantin Khlebnikov Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 78f5f2641b91..ca5acee53b7a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -888,7 +888,15 @@ int get_hwpoison_page(struct page *page) } } - return get_page_unless_zero(head); + if (get_page_unless_zero(head)) { + if (head == compound_head(page)) + return 1; + + pr_info("MCE: %#lx cannot catch tail\n", page_to_pfn(page)); + put_page(head); + } + + return 0; } EXPORT_SYMBOL_GPL(get_hwpoison_page); -- cgit v1.2.3 From 74d369443325063a5f0260e63971decb950fd8fa Mon Sep 17 00:00:00 2001 From: Howard Cochran Date: Thu, 10 Mar 2016 01:12:39 -0500 Subject: writeback: Fix performance regression in wb_over_bg_thresh() Commit 947e9762a8dd ("writeback: update wb_over_bg_thresh() to use wb_domain aware operations") unintentionally changed this function's meaning from "are there more dirty pages than the background writeback threshold" to "are there more dirty pages than the writeback threshold". The background writeback threshold is typically half of the writeback threshold, so this had the effect of raising the number of dirty pages required to cause a writeback worker to perform background writeout. This can cause a very severe performance regression when a BDI uses BDI_CAP_STRICTLIMIT because balance_dirty_pages() and the writeback worker can now disagree on whether writeback should be initiated. For example, in a system having 1GB of RAM, a single spinning disk, and a "pass-through" FUSE filesystem mounted over the disk, application code mmapped a 128MB file on the disk and was randomly dirtying pages in that mapping. Because FUSE uses strictlimit and has a default max_ratio of only 1%, in balance_dirty_pages, thresh is ~200, bg_thresh is ~100, and the dirty_freerun_ceiling is the average of those, ~150. So, it pauses the dirtying processes when we have 151 dirty pages and wakes up a background writeback worker. But the worker tests the wrong threshold (200 instead of 100), so it does not initiate writeback and just returns. Thus, balance_dirty_pages keeps looping, sleeping and then waking up the worker who will do nothing. It remains stuck in this state until the few dirty pages that we have finally expire and we write them back for that reason. Then the whole process repeats, resulting in near-zero throughput through the FUSE BDI. The fix is to call the parameterized variant of wb_calc_thresh, so that the worker will do writeback if the bg_thresh is exceeded which was the behavior before the referenced commit. 
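The disagreement is easy to see with the numbers from the example above; the following is a standalone userspace illustration (the page counts are the hypothetical ones quoted above, not measured values):

    #include <stdio.h>

    int main(void)
    {
            unsigned long thresh = 200, bg_thresh = 100;        /* ~1% strictlimit BDI on a 1GB box */
            unsigned long freerun = (thresh + bg_thresh) / 2;   /* the dirty_freerun_ceiling, ~150 */
            unsigned long dirty = 151;                          /* just above the ceiling */

            printf("dirtier throttled:  %d\n", dirty > freerun);    /* 1: balance_dirty_pages() pauses */
            printf("old worker check:   %d\n", dirty > thresh);     /* 0: worker finds nothing to do */
            printf("fixed worker check: %d\n", dirty > bg_thresh);  /* 1: background writeback proceeds */
            return 0;
    }

With the fix, both sides agree that writeback should run once the background threshold is crossed, which is the behavior the change below restores.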
Fixes: 947e9762a8dd ("writeback: update wb_over_bg_thresh() to use wb_domain aware operations") Signed-off-by: Howard Cochran Acked-by: Tejun Heo Signed-off-by: Miklos Szeredi Cc: # v4.2+ Tested-by Sedat Dilek Signed-off-by: Jens Axboe --- mm/page-writeback.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 11ff8f758631..eeaa431ee4ec 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1910,7 +1910,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) if (gdtc->dirty > gdtc->bg_thresh) return true; - if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(gdtc)) + if (wb_stat(wb, WB_RECLAIMABLE) > + wb_calc_thresh(gdtc->wb, gdtc->bg_thresh)) return true; if (mdtc) { @@ -1924,7 +1925,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) if (mdtc->dirty > mdtc->bg_thresh) return true; - if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(mdtc)) + if (wb_stat(wb, WB_RECLAIMABLE) > + wb_calc_thresh(mdtc->wb, mdtc->bg_thresh)) return true; } -- cgit v1.2.3 From 145bdaa1501bf1c8a6cfa8ea5e347b9a46aad1b7 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 5 May 2016 16:22:00 -0700 Subject: mm: thp: correct split_huge_pages file permission split_huge_pages doesn't support get method at all, so the read permission sounds confusing, change the permission to write only. And, add "\n" to the output of set method to make it more readable. Signed-off-by: Yang Shi Acked-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index df67b53ae3c5..f7daa7de8f48 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3452,7 +3452,7 @@ next: } } - pr_info("%lu of %lu THP split", split, total); + pr_info("%lu of %lu THP split\n", split, total); return 0; } @@ -3463,7 +3463,7 @@ static int __init split_huge_pages_debugfs(void) { void *ret; - ret = debugfs_create_file("split_huge_pages", 0644, NULL, NULL, + ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL, &split_huge_pages_fops); if (!ret) pr_warn("Failed to create split_huge_pages in debugfs"); -- cgit v1.2.3 From 684283988f703811b8a05136d0d54f1c31025ad3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 May 2016 16:22:09 -0700 Subject: huge pagecache: mmap_sem is unlocked when truncation splits pmd zap_pmd_range()'s CONFIG_DEBUG_VM !rwsem_is_locked(&mmap_sem) BUG() will be invalid with huge pagecache, in whatever way it is implemented: truncation of a hugely-mapped file to an unhugely-aligned size would easily hit it. (Although anon THP could in principle apply khugepaged to private file mappings, which are not excluded by the MADV_HUGEPAGE restrictions, in practice there's a vm_ops check which excludes them, so it never hits this BUG() - there's no interface to "truncate" an anonymous mapping.) We could complicate the test, to check i_mmap_rwsem also when there's a vm_file; but my inclination was to make zap_pmd_range() more readable by simply deleting this check. A search has shown no report of the issue in the years since commit e0897d75f0b2 ("mm, thp: print useful information when mmap_sem is unlocked in zap_pmd_range") expanded it from VM_BUG_ON() - though I cannot point to what commit I would say then fixed the issue. 
But there are a couple of other patches now floating around, neither yet in the tree: let's agree to retain the check as a VM_BUG_ON_VMA(), as Matthew Wilcox has done; but subject to a vma_is_anonymous() check, as Kirill Shutemov has done. And let's get this in, without waiting for any particular huge pagecache implementation to reach the tree. Matthew said "We can reproduce this BUG() in the current Linus tree with DAX PMDs". Signed-off-by: Hugh Dickins Tested-by: Matthew Wilcox Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Andres Lagar-Cavilla Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 305537fc8640..52c218e2b724 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1222,15 +1222,8 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { -#ifdef CONFIG_DEBUG_VM - if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { - pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", - __func__, addr, end, - vma->vm_start, - vma->vm_end); - BUG(); - } -#endif + VM_BUG_ON_VMA(vma_is_anonymous(vma) && + !rwsem_is_locked(&tlb->mm->mmap_sem), vma); split_huge_pmd(vma, pmd, addr); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; -- cgit v1.2.3 From bc22af74f271ef76b2e6f72f3941f91f0da3f5f8 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 5 May 2016 16:22:12 -0700 Subject: mm: update min_free_kbytes from khugepaged after core initialization Khugepaged attempts to raise min_free_kbytes if its set too low. However, on boot khugepaged sets min_free_kbytes first from subsys_initcall(), and then the mm 'core' over-rides min_free_kbytes after from init_per_zone_wmark_min(), via a module_init() call. Khugepaged used to use a late_initcall() to set min_free_kbytes (such that it occurred after the core initialization), however this was removed when the initialization of min_free_kbytes was integrated into the starting of the khugepaged thread. The fix here is simply to invoke the core initialization using a core_initcall() instead of module_init(), such that the previous initialization ordering is restored. I didn't restore the late_initcall() since start_stop_khugepaged() already sets min_free_kbytes via set_recommended_min_free_kbytes(). This was noticed when we had a number of page allocation failures when moving a workload to a kernel with this new initialization ordering. On an 8GB system this restores min_free_kbytes back to 67584 from 11365 when CONFIG_TRANSPARENT_HUGEPAGE=y is set and either CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y or CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y. Fixes: 79553da293d3 ("thp: cleanup khugepaged startup") Signed-off-by: Jason Baron Acked-by: Kirill A. 
Shutemov Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 59de90d5d3a3..c1069efcc4d7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6485,7 +6485,7 @@ int __meminit init_per_zone_wmark_min(void) setup_per_zone_inactive_ratio(); return 0; } -module_init(init_per_zone_wmark_min) +core_initcall(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so -- cgit v1.2.3 From 14af4a5e9b26ad251f81c174e8a43f3e179434a5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 5 May 2016 16:22:15 -0700 Subject: mm, cma: prevent nr_isolated_* counters from going negative /proc/sys/vm/stat_refresh warns nr_isolated_anon and nr_isolated_file go increasingly negative under compaction: which would add delay when should be none, or no delay when should delay. The bug in compaction was due to a recent mmotm patch, but much older instance of the bug was also noticed in isolate_migratepages_range() which is used for CMA and gigantic hugepage allocations. The bug is caused by putback_movable_pages() in an error path decrementing the isolated counters without them being previously incremented by acct_isolated(). Fix isolate_migratepages_range() by removing the error-path putback, thus reaching acct_isolated() with migratepages still isolated, and leaving putback to caller like most other places do. Fixes: edc2ca612496 ("mm, compaction: move pageblock checks up from isolate_migratepages_range()") [vbabka@suse.cz: expanded the changelog] Signed-off-by: Hugh Dickins Signed-off-by: Vlastimil Babka Acked-by: Joonsoo Kim Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index ccf97b02b85f..4a0d4b8ccde8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -852,16 +852,8 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, ISOLATE_UNEVICTABLE); - /* - * In case of fatal failure, release everything that might - * have been isolated in the previous iteration, and signal - * the failure back to caller. - */ - if (!pfn) { - putback_movable_pages(&cc->migratepages); - cc->nr_migratepages = 0; + if (!pfn) break; - } if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) break; -- cgit v1.2.3 From 32a4e169039927bfb6ee9f0ccbbe3a8aaf13a4bc Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 5 May 2016 16:22:23 -0700 Subject: mm/zswap: provide unique zpool name Instead of using "zswap" as the name for all zpools created, add an atomic counter and use "zswap%x" with the counter number for each zpool created, to provide a unique name for each new zpool. As zsmalloc, one of the zpool implementations, requires/expects a unique name for each pool created, zswap should provide a unique name. The zsmalloc pool creation does not fail if a new pool with a conflicting name is created, unless CONFIG_ZSMALLOC_STAT is enabled; in that case, zsmalloc pool creation fails with -ENOMEM. Then zswap will be unable to change its compressor parameter if its zpool is zsmalloc; it also will be unable to change its zpool parameter back to zsmalloc, if it has any existing old zpool using zsmalloc with page(s) in it. 
Attempts to change the parameters will result in failure to create the zpool. This changes zswap to provide a unique name for each zpool creation. Fixes: f1c54846ee45 ("zswap: dynamic pool creation") Signed-off-by: Dan Streetman Reported-by: Sergey Senozhatsky Reviewed-by: Sergey Senozhatsky Cc: Dan Streetman Cc: Minchan Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index 91dad80d068b..de0f119b1780 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -170,6 +170,8 @@ static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; static LIST_HEAD(zswap_pools); /* protects zswap_pools list modification */ static DEFINE_SPINLOCK(zswap_pools_lock); +/* pool counter to provide unique names to zpool */ +static atomic_t zswap_pools_count = ATOMIC_INIT(0); /* used by param callback function */ static bool zswap_init_started; @@ -565,6 +567,7 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { struct zswap_pool *pool; + char name[38]; /* 'zswap' + 32 char (max) num + \0 */ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; pool = kzalloc(sizeof(*pool), GFP_KERNEL); @@ -573,7 +576,10 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) return NULL; } - pool->zpool = zpool_create_pool(type, "zswap", gfp, &zswap_zpool_ops); + /* unique name for each pool specifically required by zsmalloc */ + snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); + + pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops); if (!pool->zpool) { pr_err("%s zpool not available\n", type); goto error; -- cgit v1.2.3 From 172400c69cb0d0d684b7cd75ac75872b3d7c61a1 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 5 May 2016 16:22:32 -0700 Subject: mm: fix kcompactd hang during memory offlining Assume memory47 is the last online block left in node1. This will hang: # echo offline > /sys/devices/system/node/node1/memory47/state After a couple of minutes, the following pops up in dmesg: INFO: task bash:957 blocked for more than 120 seconds. Not tainted 4.6.0-rc6+ #6 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. bash D ffff8800b7adbaf8 0 957 951 0x00000000 Call Trace: schedule+0x35/0x80 schedule_timeout+0x1ac/0x270 wait_for_completion+0xe1/0x120 kthread_stop+0x4f/0x110 kcompactd_stop+0x26/0x40 __offline_pages.constprop.28+0x7e6/0x840 offline_pages+0x11/0x20 memory_block_action+0x73/0x1d0 memory_subsys_offline+0x47/0x60 device_offline+0x86/0xb0 store_mem_state+0xda/0xf0 dev_attr_store+0x18/0x30 sysfs_kf_write+0x37/0x40 kernfs_fop_write+0x11d/0x170 __vfs_write+0x37/0x120 vfs_write+0xa9/0x1a0 SyS_write+0x55/0xc0 entry_SYSCALL_64_fastpath+0x1a/0xa4 kcompactd is waiting for kcompactd_max_order > 0 when it's woken up to actually exit. Check kthread_should_stop() to break out of the wait. Fixes: 698b1b306 ("mm, compaction: introduce kcompactd"). Reported-by: Reza Arbab Tested-by: Reza Arbab Cc: Andrea Arcangeli Cc: "Kirill A. 
Shutemov" Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 4a0d4b8ccde8..8fa254043801 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1733,7 +1733,7 @@ void compaction_unregister_node(struct node *node) static inline bool kcompactd_work_requested(pg_data_t *pgdat) { - return pgdat->kcompactd_max_order > 0; + return pgdat->kcompactd_max_order > 0 || kthread_should_stop(); } static bool kcompactd_node_suitable(pg_data_t *pgdat) @@ -1797,6 +1797,8 @@ static void kcompactd_do_work(pg_data_t *pgdat) INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); + if (kthread_should_stop()) + return; status = compact_zone(zone, &cc); if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone), -- cgit v1.2.3 From 44f43e99fe70833058482d183e99fdfd11220996 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 9 May 2016 16:28:49 -0700 Subject: zsmalloc: fix zs_can_compact() integer overflow zs_can_compact() has two race conditions in its core calculation: unsigned long obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) - zs_stat_get(class, OBJ_USED); 1) classes are not locked, so the numbers of allocated and used objects can change by the concurrent ops happening on other CPUs 2) shrinker invokes it from preemptible context Depending on the circumstances, thus, OBJ_ALLOCATED can become less than OBJ_USED, which can result in either very high or negative `total_scan' value calculated later in do_shrink_slab(). do_shrink_slab() has some logic to prevent those cases: vmscan: shrink_slab: zs_shrinker_scan+0x0/0x28 [zsmalloc] negative objects to delete nr=-62 vmscan: shrink_slab: zs_shrinker_scan+0x0/0x28 [zsmalloc] negative objects to delete nr=-62 vmscan: shrink_slab: zs_shrinker_scan+0x0/0x28 [zsmalloc] negative objects to delete nr=-64 vmscan: shrink_slab: zs_shrinker_scan+0x0/0x28 [zsmalloc] negative objects to delete nr=-62 vmscan: shrink_slab: zs_shrinker_scan+0x0/0x28 [zsmalloc] negative objects to delete nr=-62 vmscan: shrink_slab: zs_shrinker_scan+0x0/0x28 [zsmalloc] negative objects to delete nr=-62 However, due to the way `total_scan' is calculated, not every shrinker->count_objects() overflow can be spotted and handled. To demonstrate the latter, I added some debugging code to do_shrink_slab() (x86_64) and the results were: vmscan: OVERFLOW: shrinker->count_objects() == -1 [18446744073709551615] vmscan: but total_scan > 0: 92679974445502 vmscan: resulting total_scan: 92679974445502 [..] vmscan: OVERFLOW: shrinker->count_objects() == -1 [18446744073709551615] vmscan: but total_scan > 0: 22634041808232578 vmscan: resulting total_scan: 22634041808232578 Even though shrinker->count_objects() has returned an overflowed value, the resulting `total_scan' is positive, and, what is more worrisome, it is insanely huge. 
This value is getting used later on in shrinker->scan_objects() loop: while (total_scan >= batch_size || total_scan >= freeable) { unsigned long ret; unsigned long nr_to_scan = min(batch_size, total_scan); shrinkctl->nr_to_scan = nr_to_scan; ret = shrinker->scan_objects(shrinker, shrinkctl); if (ret == SHRINK_STOP) break; freed += ret; count_vm_events(SLABS_SCANNED, nr_to_scan); total_scan -= nr_to_scan; cond_resched(); } `total_scan >= batch_size' is true for a very-very long time and 'total_scan >= freeable' is also true for quite some time, because `freeable < 0' and `total_scan' is large enough, for example, 22634041808232578. The only break condition, in the given scheme of things, is shrinker->scan_objects() == SHRINK_STOP test, which is a bit too weak to rely on, especially in heavy zsmalloc-usage scenarios. To fix the issue, take a pool stat snapshot and use it instead of racy zs_stat_get() calls. Link: http://lkml.kernel.org/r/20160509140052.3389-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Cc: [4.3+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index e72efb109fde..fe47fbba995a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1735,10 +1735,13 @@ static struct page *isolate_source_page(struct size_class *class) static unsigned long zs_can_compact(struct size_class *class) { unsigned long obj_wasted; + unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); + unsigned long obj_used = zs_stat_get(class, OBJ_USED); - obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) - - zs_stat_get(class, OBJ_USED); + if (obj_allocated <= obj_used) + return 0; + obj_wasted = obj_allocated - obj_used; obj_wasted /= get_maxobj_per_zspage(class->size, class->pages_per_zspage); -- cgit v1.2.3
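The hazard that the zs_can_compact() snapshot guards against is plain unsigned wrap-around; a minimal userspace illustration with made-up object counts (not kernel code):

    #include <stdio.h>

    int main(void)
    {
            unsigned long obj_allocated = 100;   /* racy read #1 */
            unsigned long obj_used = 102;        /* racy read #2, grew concurrently */

            unsigned long obj_wasted = obj_allocated - obj_used;
            printf("racy:    %lu\n", obj_wasted);    /* 18446744073709551614 on 64-bit */

            if (obj_allocated <= obj_used)           /* snapshot plus bail-out, as in the patch */
                    obj_wasted = 0;
            else
                    obj_wasted = obj_allocated - obj_used;
            printf("guarded: %lu\n", obj_wasted);    /* 0 */
            return 0;
    }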