diff options
author | Linus Torvalds | 2020-08-12 11:24:12 -0700 |
---|---|---|
committer | Linus Torvalds | 2020-08-12 11:24:12 -0700 |
commit | 9ad57f6dfc2345ed5d3a8bf4dabac0a34069c54c (patch) | |
tree | 9e12a809a2020178eab234395b0f3e1149cb3c0d /mm | |
parent | 24fb33d40d60bd7d196400e7d5b26ff566fd98b7 (diff) | |
parent | 64019a2e467a288a16b65ab55ddcbf58c1b00187 (diff) |
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
- most of the rest of MM (memcg, hugetlb, vmscan, proc, compaction,
  mempolicy, oom-kill, hugetlbfs, migration, thp, cma, util,
  memory-hotplug, cleanups, uaccess, migration, gup, pagemap),
- various other subsystems (alpha, misc, sparse, bitmap, lib, bitops,
  checkpatch, autofs, minix, nilfs, ufs, fat, signals, kmod, coredump,
  exec, kdump, rapidio, panic, kcov, kgdb, ipc).
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (164 commits)
  mm/gup: remove task_struct pointer for all gup code
  mm: clean up the last pieces of page fault accountings
  mm/xtensa: use general page fault accounting
  mm/x86: use general page fault accounting
  mm/sparc64: use general page fault accounting
  mm/sparc32: use general page fault accounting
  mm/sh: use general page fault accounting
  mm/s390: use general page fault accounting
  mm/riscv: use general page fault accounting
  mm/powerpc: use general page fault accounting
  mm/parisc: use general page fault accounting
  mm/openrisc: use general page fault accounting
  mm/nios2: use general page fault accounting
  mm/nds32: use general page fault accounting
  mm/mips: use general page fault accounting
  mm/microblaze: use general page fault accounting
  mm/m68k: use general page fault accounting
  mm/ia64: use general page fault accounting
  mm/hexagon: use general page fault accounting
  mm/csky: use general page fault accounting
  ...
Diffstat (limited to 'mm')
42 files changed, 1017 insertions, 452 deletions
@@ -52,7 +52,7 @@ unsigned long cma_get_size(const struct cma *cma) const char *cma_get_name(const struct cma *cma) { - return cma->name ? cma->name : "(undefined)"; + return cma->name; } static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, @@ -93,17 +93,15 @@ static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, mutex_unlock(&cma->lock); } -static int __init cma_activate_area(struct cma *cma) +static void __init cma_activate_area(struct cma *cma) { unsigned long base_pfn = cma->base_pfn, pfn = base_pfn; unsigned i = cma->count >> pageblock_order; struct zone *zone; cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL); - if (!cma->bitmap) { - cma->count = 0; - return -ENOMEM; - } + if (!cma->bitmap) + goto out_error; WARN_ON_ONCE(!pfn_valid(pfn)); zone = page_zone(pfn_to_page(pfn)); @@ -133,25 +131,22 @@ static int __init cma_activate_area(struct cma *cma) spin_lock_init(&cma->mem_head_lock); #endif - return 0; + return; not_in_zone: - pr_err("CMA area %s could not be activated\n", cma->name); bitmap_free(cma->bitmap); +out_error: cma->count = 0; - return -EINVAL; + pr_err("CMA area %s could not be activated\n", cma->name); + return; } static int __init cma_init_reserved_areas(void) { int i; - for (i = 0; i < cma_area_count; i++) { - int ret = cma_activate_area(&cma_areas[i]); - - if (ret) - return ret; - } + for (i = 0; i < cma_area_count; i++) + cma_activate_area(&cma_areas[i]); return 0; } @@ -202,13 +197,12 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, * subsystems (like slab allocator) are available. 
*/ cma = &cma_areas[cma_area_count]; - if (name) { - cma->name = name; - } else { - cma->name = kasprintf(GFP_KERNEL, "cma%d\n", cma_area_count); - if (!cma->name) - return -ENOMEM; - } + + if (name) + snprintf(cma->name, CMA_MAX_NAME, name); + else + snprintf(cma->name, CMA_MAX_NAME, "cma%d\n", cma_area_count); + cma->base_pfn = PFN_DOWN(base); cma->count = size >> PAGE_SHIFT; cma->order_per_bit = order_per_bit; @@ -425,7 +419,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, struct page *page = NULL; int ret = -ENOMEM; - if (!cma || !cma->count) + if (!cma || !cma->count || !cma->bitmap) return NULL; pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma, @@ -4,6 +4,8 @@ #include <linux/debugfs.h> +#define CMA_MAX_NAME 64 + struct cma { unsigned long base_pfn; unsigned long count; @@ -15,7 +17,7 @@ struct cma { spinlock_t mem_head_lock; struct debugfs_u32_array dfs_bitmap; #endif - const char *name; + char name[CMA_MAX_NAME]; }; extern struct cma cma_areas[MAX_CMA_AREAS]; diff --git a/mm/compaction.c b/mm/compaction.c index 86375605faa9..b89581bf859c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -50,6 +50,24 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order) #define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order) +/* + * Fragmentation score check interval for proactive compaction purposes. + */ +static const unsigned int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500; + +/* + * Page order with-respect-to which proactive compaction + * calculates external fragmentation, which is used as + * the "fragmentation score" of a node/zone. 
+ */ +#if defined CONFIG_TRANSPARENT_HUGEPAGE +#define COMPACTION_HPAGE_ORDER HPAGE_PMD_ORDER +#elif defined CONFIG_HUGETLBFS +#define COMPACTION_HPAGE_ORDER HUGETLB_PAGE_ORDER +#else +#define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT) +#endif + static unsigned long release_freepages(struct list_head *freelist) { struct page *page, *next; @@ -136,7 +154,7 @@ EXPORT_SYMBOL(__ClearPageMovable); /* * Compaction is deferred when compaction fails to result in a page - * allocation success. 1 << compact_defer_limit compactions are skipped up + * allocation success. 1 << compact_defer_shift, compactions are skipped up * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT */ void defer_compaction(struct zone *zone, int order) @@ -1459,7 +1477,7 @@ static void isolate_freepages(struct compact_control *cc) * this pfn aligned down to the pageblock boundary, because we do * block_start_pfn -= pageblock_nr_pages in the for loop. * For ending point, take care when isolating in last pageblock of a - * a zone which ends in the middle of a pageblock. + * zone which ends in the middle of a pageblock. * The low boundary is the end of the pageblock the migration scanner * is using. */ @@ -1857,6 +1875,76 @@ static inline bool is_via_compact_memory(int order) return order == -1; } +static bool kswapd_is_running(pg_data_t *pgdat) +{ + return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING); +} + +/* + * A zone's fragmentation score is the external fragmentation wrt to the + * COMPACTION_HPAGE_ORDER scaled by the zone's size. It returns a value + * in the range [0, 100]. + * + * The scaling factor ensures that proactive compaction focuses on larger + * zones like ZONE_NORMAL, rather than smaller, specialized zones like + * ZONE_DMA32. For smaller zones, the score value remains close to zero, + * and thus never exceeds the high threshold for proactive compaction. 
+ */ +static unsigned int fragmentation_score_zone(struct zone *zone) +{ + unsigned long score; + + score = zone->present_pages * + extfrag_for_order(zone, COMPACTION_HPAGE_ORDER); + return div64_ul(score, zone->zone_pgdat->node_present_pages + 1); +} + +/* + * The per-node proactive (background) compaction process is started by its + * corresponding kcompactd thread when the node's fragmentation score + * exceeds the high threshold. The compaction process remains active till + * the node's score falls below the low threshold, or one of the back-off + * conditions is met. + */ +static unsigned int fragmentation_score_node(pg_data_t *pgdat) +{ + unsigned int score = 0; + int zoneid; + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct zone *zone; + + zone = &pgdat->node_zones[zoneid]; + score += fragmentation_score_zone(zone); + } + + return score; +} + +static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low) +{ + unsigned int wmark_low; + + /* + * Cap the low watermak to avoid excessive compaction + * activity in case a user sets the proactivess tunable + * close to 100 (maximum). + */ + wmark_low = max(100U - sysctl_compaction_proactiveness, 5U); + return low ? 
wmark_low : min(wmark_low + 10, 100U); +} + +static bool should_proactive_compact_node(pg_data_t *pgdat) +{ + int wmark_high; + + if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat)) + return false; + + wmark_high = fragmentation_score_wmark(pgdat, false); + return fragmentation_score_node(pgdat) > wmark_high; +} + static enum compact_result __compact_finished(struct compact_control *cc) { unsigned int order; @@ -1883,6 +1971,25 @@ static enum compact_result __compact_finished(struct compact_control *cc) return COMPACT_PARTIAL_SKIPPED; } + if (cc->proactive_compaction) { + int score, wmark_low; + pg_data_t *pgdat; + + pgdat = cc->zone->zone_pgdat; + if (kswapd_is_running(pgdat)) + return COMPACT_PARTIAL_SKIPPED; + + score = fragmentation_score_zone(cc->zone); + wmark_low = fragmentation_score_wmark(pgdat, true); + + if (score > wmark_low) + ret = COMPACT_CONTINUE; + else + ret = COMPACT_SUCCESS; + + goto out; + } + if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; @@ -1941,6 +2048,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) } } +out: if (cc->contended || fatal_signal_pending(current)) ret = COMPACT_CONTENDED; @@ -2421,6 +2529,41 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, return rc; } +/* + * Compact all zones within a node till each zone's fragmentation score + * reaches within proactive compaction thresholds (as determined by the + * proactiveness tunable). + * + * It is possible that the function returns before reaching score targets + * due to various back-off conditions, such as, contention on per-node or + * per-zone locks. 
+ */ +static void proactive_compact_node(pg_data_t *pgdat) +{ + int zoneid; + struct zone *zone; + struct compact_control cc = { + .order = -1, + .mode = MIGRATE_SYNC_LIGHT, + .ignore_skip_hint = true, + .whole_zone = true, + .gfp_mask = GFP_KERNEL, + .proactive_compaction = true, + }; + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + cc.zone = zone; + + compact_zone(&cc, NULL); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } +} /* Compact all zones within a node */ static void compact_node(int nid) @@ -2468,6 +2611,13 @@ static void compact_nodes(void) int sysctl_compact_memory; /* + * Tunable for proactive compaction. It determines how + * aggressively the kernel should compact memory in the + * background. It takes values in the range [0, 100]. + */ +unsigned int __read_mostly sysctl_compaction_proactiveness = 20; + +/* * This is the entry point for compacting all nodes via * /proc/sys/vm/compact_memory */ @@ -2646,6 +2796,7 @@ static int kcompactd(void *p) { pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; + unsigned int proactive_defer = 0; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -2661,12 +2812,34 @@ static int kcompactd(void *p) unsigned long pflags; trace_mm_compaction_kcompactd_sleep(pgdat->node_id); - wait_event_freezable(pgdat->kcompactd_wait, - kcompactd_work_requested(pgdat)); + if (wait_event_freezable_timeout(pgdat->kcompactd_wait, + kcompactd_work_requested(pgdat), + msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) { + + psi_memstall_enter(&pflags); + kcompactd_do_work(pgdat); + psi_memstall_leave(&pflags); + continue; + } - psi_memstall_enter(&pflags); - kcompactd_do_work(pgdat); - psi_memstall_leave(&pflags); + /* kcompactd wait timeout */ + if (should_proactive_compact_node(pgdat)) { + unsigned int prev_score, score; + + if (proactive_defer) { + proactive_defer--; + 
continue; + } + prev_score = fragmentation_score_node(pgdat); + proactive_compact_node(pgdat); + score = fragmentation_score_node(pgdat); + /* + * Defer proactive compaction if the fragmentation + * score did not go down i.e. no progress made. + */ + proactive_defer = score < prev_score ? + 0 : 1 << COMPACT_MAX_DEFER_SHIFT; + } } return 0; diff --git a/mm/filemap.c b/mm/filemap.c index f2bb5ff0293d..8e75bce0346d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2885,7 +2885,7 @@ filler: * Case a, the page will be up to date when the page is unlocked. * There is no need to serialise on the page lock here as the page * is pinned so the lock gives no additional protection. Even if the - * the page is truncated, the data is still valid if PageUptodate as + * page is truncated, the data is still valid if PageUptodate as * it's a race vs truncate race. * Case b, the page will not be up to date * Case c, the page may be truncated but in itself, the data may still @@ -859,7 +859,7 @@ unmap: * does not include FOLL_NOWAIT, the mmap_lock may be released. If it * is, *@locked will be set to 0 and -EBUSY returned. 
*/ -static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, +static int faultin_page(struct vm_area_struct *vma, unsigned long address, unsigned int *flags, int *locked) { unsigned int fault_flags = 0; @@ -884,7 +884,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_TRIED; } - ret = handle_mm_fault(vma, address, fault_flags); + ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, *flags); @@ -893,13 +893,6 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, BUG(); } - if (tsk) { - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - } - if (ret & VM_FAULT_RETRY) { if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *locked = 0; @@ -969,7 +962,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) /** * __get_user_pages() - pin user pages in memory - * @tsk: task_struct of target task * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin @@ -1028,7 +1020,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. 
*/ -static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, +static long __get_user_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) @@ -1110,8 +1102,7 @@ retry: page = follow_page_mask(vma, start, foll_flags, &ctx); if (!page) { - ret = faultin_page(tsk, vma, start, &foll_flags, - locked); + ret = faultin_page(vma, start, &foll_flags, locked); switch (ret) { case 0: goto retry; @@ -1185,8 +1176,6 @@ static bool vma_permits_fault(struct vm_area_struct *vma, /** * fixup_user_fault() - manually resolve a user page fault - * @tsk: the task_struct to use for page fault accounting, or - * NULL if faults are not to be recorded. * @mm: mm_struct of target mm * @address: user address * @fault_flags:flags to pass down to handle_mm_fault() @@ -1214,7 +1203,7 @@ static bool vma_permits_fault(struct vm_area_struct *vma, * This function will not return with an unlocked mmap_lock. So it has not the * same semantics wrt the @mm->mmap_lock as does filemap_fault(). 
*/ -int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, +int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { @@ -1238,7 +1227,7 @@ retry: fatal_signal_pending(current)) return -EINTR; - ret = handle_mm_fault(vma, address, fault_flags); + ret = handle_mm_fault(vma, address, fault_flags, NULL); major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); @@ -1255,12 +1244,6 @@ retry: goto retry; } - if (tsk) { - if (major) - tsk->maj_flt++; - else - tsk->min_flt++; - } return 0; } EXPORT_SYMBOL_GPL(fixup_user_fault); @@ -1269,8 +1252,7 @@ EXPORT_SYMBOL_GPL(fixup_user_fault); * Please note that this function, unlike __get_user_pages will not * return 0 for nr_pages > 0 without FOLL_NOWAIT */ -static __always_inline long __get_user_pages_locked(struct task_struct *tsk, - struct mm_struct *mm, +static __always_inline long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, @@ -1303,7 +1285,7 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, pages_done = 0; lock_dropped = false; for (;;) { - ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages, + ret = __get_user_pages(mm, start, nr_pages, flags, pages, vmas, locked); if (!locked) /* VM_FAULT_RETRY couldn't trigger, bypass */ @@ -1363,7 +1345,7 @@ retry: } *locked = 1; - ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED, + ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED, pages, NULL, locked); if (!*locked) { /* Continue to retry until we succeeded */ @@ -1450,7 +1432,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. 
*/ - return __get_user_pages(current, mm, start, nr_pages, gup_flags, + return __get_user_pages(mm, start, nr_pages, gup_flags, NULL, NULL, locked); } @@ -1534,7 +1516,7 @@ struct page *get_dump_page(unsigned long addr) struct vm_area_struct *vma; struct page *page; - if (__get_user_pages(current, current->mm, addr, 1, + if (__get_user_pages(current->mm, addr, 1, FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, NULL) < 1) return NULL; @@ -1543,8 +1525,7 @@ struct page *get_dump_page(unsigned long addr) } #endif /* CONFIG_ELF_CORE */ #else /* CONFIG_MMU */ -static long __get_user_pages_locked(struct task_struct *tsk, - struct mm_struct *mm, unsigned long start, +static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, struct vm_area_struct **vmas, int *locked, unsigned int foll_flags) @@ -1609,59 +1590,7 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) } #ifdef CONFIG_CMA -static struct page *new_non_cma_page(struct page *page, unsigned long private) -{ - /* - * We want to make sure we allocate the new page from the same node - * as the source page. - */ - int nid = page_to_nid(page); - /* - * Trying to allocate a page for migration. Ignore allocation - * failure warnings. We don't force __GFP_THISNODE here because - * this node here is the node where we have CMA reservation and - * in some case these nodes will have really less non movable - * allocation memory. - */ - gfp_t gfp_mask = GFP_USER | __GFP_NOWARN; - - if (PageHighMem(page)) - gfp_mask |= __GFP_HIGHMEM; - -#ifdef CONFIG_HUGETLB_PAGE - if (PageHuge(page)) { - struct hstate *h = page_hstate(page); - /* - * We don't want to dequeue from the pool because pool pages will - * mostly be from the CMA region. 
- */ - return alloc_migrate_huge_page(h, gfp_mask, nid, NULL); - } -#endif - if (PageTransHuge(page)) { - struct page *thp; - /* - * ignore allocation failure warnings - */ - gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN; - - /* - * Remove the movable mask so that we don't allocate from - * CMA area again. - */ - thp_gfpmask &= ~__GFP_MOVABLE; - thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER); - if (!thp) - return NULL; - prep_transhuge_page(thp); - return thp; - } - - return __alloc_pages_node(nid, gfp_mask, 0); -} - -static long check_and_migrate_cma_pages(struct task_struct *tsk, - struct mm_struct *mm, +static long check_and_migrate_cma_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, @@ -1674,6 +1603,10 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk, bool migrate_allow = true; LIST_HEAD(cma_page_list); long ret = nr_pages; + struct migration_target_control mtc = { + .nid = NUMA_NO_NODE, + .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN, + }; check_again: for (i = 0; i < nr_pages;) { @@ -1719,8 +1652,8 @@ check_again: for (i = 0; i < nr_pages; i++) put_page(pages[i]); - if (migrate_pages(&cma_page_list, new_non_cma_page, - NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) { + if (migrate_pages(&cma_page_list, alloc_migration_target, NULL, + (unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) { /* * some of the pages failed migration. Do get_user_pages * without migration. @@ -1735,7 +1668,7 @@ check_again: * again migrating any new CMA pages which we failed to isolate * earlier. 
*/ - ret = __get_user_pages_locked(tsk, mm, start, nr_pages, + ret = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, gup_flags); @@ -1749,8 +1682,7 @@ check_again: return ret; } #else -static long check_and_migrate_cma_pages(struct task_struct *tsk, - struct mm_struct *mm, +static long check_and_migrate_cma_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, @@ -1765,8 +1697,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk, * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which * allows us to process the FOLL_LONGTERM flag. */ -static long __gup_longterm_locked(struct task_struct *tsk, - struct mm_struct *mm, +static long __gup_longterm_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, @@ -1791,11 +1722,10 @@ static long __gup_longterm_locked(struct task_struct *tsk, flags = memalloc_nocma_save(); } - rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, + rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas_tmp, NULL, gup_flags); if (gup_flags & FOLL_LONGTERM) { - memalloc_nocma_restore(flags); if (rc < 0) goto out; @@ -1806,32 +1736,31 @@ static long __gup_longterm_locked(struct task_struct *tsk, goto out; } - rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages, + rc = check_and_migrate_cma_pages(mm, start, rc, pages, vmas_tmp, gup_flags); +out: + memalloc_nocma_restore(flags); } -out: if (vmas_tmp != vmas) kfree(vmas_tmp); return rc; } #else /* !CONFIG_FS_DAX && !CONFIG_CMA */ -static __always_inline long __gup_longterm_locked(struct task_struct *tsk, - struct mm_struct *mm, +static __always_inline long __gup_longterm_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, struct vm_area_struct **vmas, unsigned int flags) { - return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, + return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, 
NULL, flags); } #endif /* CONFIG_FS_DAX || CONFIG_CMA */ #ifdef CONFIG_MMU -static long __get_user_pages_remote(struct task_struct *tsk, - struct mm_struct *mm, +static long __get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) @@ -1850,20 +1779,18 @@ static long __get_user_pages_remote(struct task_struct *tsk, * This will check the vmas (even if our vmas arg is NULL) * and return -ENOTSUPP if DAX isn't allowed in this case: */ - return __gup_longterm_locked(tsk, mm, start, nr_pages, pages, + return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } - return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, + return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, locked, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } /** * get_user_pages_remote() - pin user pages in memory - * @tsk: the task_struct to use for page fault accounting, or - * NULL if faults are not to be recorded. * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin @@ -1922,7 +1849,7 @@ static long __get_user_pages_remote(struct task_struct *tsk, * should use get_user_pages_remote because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. 
*/ -long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, +long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) @@ -1934,13 +1861,13 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) return -EINVAL; - return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, + return __get_user_pages_remote(mm, start, nr_pages, gup_flags, pages, vmas, locked); } EXPORT_SYMBOL(get_user_pages_remote); #else /* CONFIG_MMU */ -long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, +long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) @@ -1948,8 +1875,7 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, return 0; } -static long __get_user_pages_remote(struct task_struct *tsk, - struct mm_struct *mm, +static long __get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) @@ -1969,11 +1895,10 @@ static long __get_user_pages_remote(struct task_struct *tsk, * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. * - * This is the same as get_user_pages_remote(), just with a - * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's and don't allow - * passing of a locked parameter. We also obviously don't pass - * FOLL_REMOTE in here. + * This is the same as get_user_pages_remote(), just with a less-flexible + * calling convention where we assume that the mm being operated on belongs to + * the current task, and doesn't allow passing of a locked parameter. 
We also + * obviously don't pass FOLL_REMOTE in here. */ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, @@ -1986,7 +1911,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) return -EINVAL; - return __gup_longterm_locked(current, current->mm, start, nr_pages, + return __gup_longterm_locked(current->mm, start, nr_pages, pages, vmas, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); @@ -1996,7 +1921,7 @@ EXPORT_SYMBOL(get_user_pages); * * mmap_read_lock(mm); * do_something() - * get_user_pages(tsk, mm, ..., pages, NULL); + * get_user_pages(mm, ..., pages, NULL); * mmap_read_unlock(mm); * * to: @@ -2004,7 +1929,7 @@ EXPORT_SYMBOL(get_user_pages); * int locked = 1; * mmap_read_lock(mm); * do_something() - * get_user_pages_locked(tsk, mm, ..., pages, &locked); + * get_user_pages_locked(mm, ..., pages, &locked); * if (locked) * mmap_read_unlock(mm); * @@ -2042,7 +1967,7 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) return -EINVAL; - return __get_user_pages_locked(current, current->mm, start, nr_pages, + return __get_user_pages_locked(current->mm, start, nr_pages, pages, NULL, locked, gup_flags | FOLL_TOUCH); } @@ -2052,12 +1977,12 @@ EXPORT_SYMBOL(get_user_pages_locked); * get_user_pages_unlocked() is suitable to replace the form: * * mmap_read_lock(mm); - * get_user_pages(tsk, mm, ..., pages, NULL); + * get_user_pages(mm, ..., pages, NULL); * mmap_read_unlock(mm); * * with: * - * get_user_pages_unlocked(tsk, mm, ..., pages); + * get_user_pages_unlocked(mm, ..., pages); * * It is functionally equivalent to get_user_pages_fast so * get_user_pages_fast should be used instead if specific gup_flags @@ -2080,7 +2005,7 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, return -EINVAL; mmap_read_lock(mm); - ret = __get_user_pages_locked(current, mm, start, 
nr_pages, pages, NULL, + ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL, &locked, gup_flags | FOLL_TOUCH); if (locked) mmap_read_unlock(mm); @@ -2725,7 +2650,7 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages, */ if (gup_flags & FOLL_LONGTERM) { mmap_read_lock(current->mm); - ret = __gup_longterm_locked(current, current->mm, + ret = __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL, gup_flags); mmap_read_unlock(current->mm); @@ -2968,10 +2893,8 @@ int pin_user_pages_fast_only(unsigned long start, int nr_pages, EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); /** - * pin_user_pages_remote() - pin pages of a remote process (task != current) + * pin_user_pages_remote() - pin pages of a remote process * - * @tsk: the task_struct to use for page fault accounting, or - * NULL if faults are not to be recorded. * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin @@ -2992,7 +2915,7 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. 
*/ -long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, +long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) @@ -3002,7 +2925,7 @@ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, return -EINVAL; gup_flags |= FOLL_PIN; - return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, + return __get_user_pages_remote(mm, start, nr_pages, gup_flags, pages, vmas, locked); } EXPORT_SYMBOL(pin_user_pages_remote); @@ -3034,7 +2957,7 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, return -EINVAL; gup_flags |= FOLL_PIN; - return __gup_longterm_locked(current, current->mm, start, nr_pages, + return __gup_longterm_locked(current->mm, start, nr_pages, pages, vmas, gup_flags); } EXPORT_SYMBOL(pin_user_pages); @@ -3079,7 +3002,7 @@ long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, return -EINVAL; gup_flags |= FOLL_PIN; - return __get_user_pages_locked(current, current->mm, start, nr_pages, + return __get_user_pages_locked(current->mm, start, nr_pages, pages, NULL, locked, gup_flags | FOLL_TOUCH); } @@ -75,7 +75,8 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end, } for (; addr < end; addr += PAGE_SIZE) - if (handle_mm_fault(vma, addr, fault_flags) & VM_FAULT_ERROR) + if (handle_mm_fault(vma, addr, fault_flags, NULL) & + VM_FAULT_ERROR) return -EFAULT; return -EBUSY; } @@ -249,7 +250,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, swp_entry_t entry = pte_to_swp_entry(pte); /* - * Never fault in device private pages pages, but just report + * Never fault in device private pages, but just report * the PFN even if not present. 
*/ if (hmm_is_device_private_entry(range, entry)) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 206f52b36ffb..2ccff8472cd4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -303,24 +303,6 @@ static ssize_t hpage_pmd_size_show(struct kobject *kobj, static struct kobj_attribute hpage_pmd_size_attr = __ATTR_RO(hpage_pmd_size); -#ifdef CONFIG_DEBUG_VM -static ssize_t debug_cow_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return single_hugepage_flag_show(kobj, attr, buf, - TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); -} -static ssize_t debug_cow_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - return single_hugepage_flag_store(kobj, attr, buf, count, - TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); -} -static struct kobj_attribute debug_cow_attr = - __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store); -#endif /* CONFIG_DEBUG_VM */ - static struct attribute *hugepage_attr[] = { &enabled_attr.attr, &defrag_attr.attr, @@ -329,9 +311,6 @@ static struct attribute *hugepage_attr[] = { #ifdef CONFIG_SHMEM &shmem_enabled_attr.attr, #endif -#ifdef CONFIG_DEBUG_VM - &debug_cow_attr.attr, -#endif NULL, }; @@ -640,7 +619,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr, true); - lru_cache_add_active_or_unevictable(page, vma); + lru_cache_add_inactive_or_unevictable(page, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e52c878940bb..a301c2d672bf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -19,6 +19,7 @@ #include <linux/memblock.h> #include <linux/sysfs.h> #include <linux/slab.h> +#include <linux/sched/mm.h> #include <linux/mmdebug.h> #include <linux/sched/signal.h> 
#include <linux/rmap.h> @@ -133,7 +134,7 @@ void hugepage_put_subpool(struct hugepage_subpool *spool) /* * Subpool accounting for allocating and reserving pages. * Return -ENOMEM if there are not enough resources to satisfy the - * the request. Otherwise, return the number of pages by which the + * request. Otherwise, return the number of pages by which the * global pools must be adjusted (upward). The returned value may * only be different than the passed value (delta) in the case where * a subpool minimum size must be maintained. @@ -1040,10 +1041,16 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) { struct page *page; + bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA); + + list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { + if (nocma && is_migrate_cma_page(page)) + continue; - list_for_each_entry(page, &h->hugepage_freelists[nid], lru) if (!PageHWPoison(page)) break; + } + /* * if 'non-isolated free hugepage' not found on the list, * the allocation fails. @@ -1093,15 +1100,6 @@ retry_cpuset: return NULL; } -/* Movability of hugepages depends on migration support. 
*/ -static inline gfp_t htlb_alloc_mask(struct hstate *h) -{ - if (hugepage_movable_supported(h)) - return GFP_HIGHUSER_MOVABLE; - else - return GFP_HIGHUSER; -} - static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve, @@ -1944,7 +1942,7 @@ out_unlock: return page; } -struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, +static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct page *page; @@ -1986,31 +1984,9 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, } /* page migration callback function */ -struct page *alloc_huge_page_node(struct hstate *h, int nid) -{ - gfp_t gfp_mask = htlb_alloc_mask(h); - struct page *page = NULL; - - if (nid != NUMA_NO_NODE) - gfp_mask |= __GFP_THISNODE; - - spin_lock(&hugetlb_lock); - if (h->free_huge_pages - h->resv_huge_pages > 0) - page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL); - spin_unlock(&hugetlb_lock); - - if (!page) - page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL); - - return page; -} - -/* page migration callback function */ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, - nodemask_t *nmask) + nodemask_t *nmask, gfp_t gfp_mask) { - gfp_t gfp_mask = htlb_alloc_mask(h); - spin_lock(&hugetlb_lock); if (h->free_huge_pages - h->resv_huge_pages > 0) { struct page *page; @@ -2038,7 +2014,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, gfp_mask = htlb_alloc_mask(h); node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - page = alloc_huge_page_nodemask(h, node, nodemask); + page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask); mpol_cond_put(mpol); return page; @@ -2167,7 +2143,7 @@ static void return_unused_surplus_pages(struct hstate *h, * evenly across all nodes with memory. Iterate across these nodes * until we can no longer free unreserved surplus pages. 
This occurs * when the nodes with surplus pages have no free pages. - * free_pool_huge_page() will balance the the freed pages across the + * free_pool_huge_page() will balance the freed pages across the * on-line nodes with memory and will handle the hstate accounting. * * Note that we decrement resv_huge_pages as we free the pages. If @@ -3458,13 +3434,21 @@ static int __init default_hugepagesz_setup(char *s) } __setup("default_hugepagesz=", default_hugepagesz_setup); -static unsigned int cpuset_mems_nr(unsigned int *array) +static unsigned int allowed_mems_nr(struct hstate *h) { int node; unsigned int nr = 0; + nodemask_t *mpol_allowed; + unsigned int *array = h->free_huge_pages_node; + gfp_t gfp_mask = htlb_alloc_mask(h); - for_each_node_mask(node, cpuset_current_mems_allowed) - nr += array[node]; + mpol_allowed = policy_nodemask_current(gfp_mask); + + for_each_node_mask(node, cpuset_current_mems_allowed) { + if (!mpol_allowed || + (mpol_allowed && node_isset(node, *mpol_allowed))) + nr += array[node]; + } return nr; } @@ -3643,12 +3627,18 @@ static int hugetlb_acct_memory(struct hstate *h, long delta) * we fall back to check against current free page availability as * a best attempt and hopefully to minimize the impact of changing * semantics that cpuset has. + * + * Apart from cpuset, we also have memory policy mechanism that + * also determines from which node the kernel will allocate memory + * in a NUMA system. So similar to cpuset, we also should consider + * the memory policy of the current task. Similar to the description + * above. 
*/ if (delta > 0) { if (gather_surplus_pages(h, delta) < 0) goto out; - if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { + if (delta > allowed_mems_nr(h)) { return_unused_surplus_pages(h, delta); goto out; } @@ -3953,7 +3943,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, continue; ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, &address, ptep)) { + if (huge_pmd_unshare(mm, vma, &address, ptep)) { spin_unlock(ptl); /* * We just unmapped a page of PMDs by clearing a PUD. @@ -4540,10 +4530,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); - } else { - ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); - if (!ptep) - return VM_FAULT_OOM; } /* @@ -5020,7 +5006,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, &address, ptep)) { + if (huge_pmd_unshare(mm, vma, &address, ptep)) { pages++; spin_unlock(ptl); shared_pmd = true; @@ -5401,12 +5387,14 @@ out: * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user */ -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) +int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long *addr, pte_t *ptep) { pgd_t *pgd = pgd_offset(mm, *addr); p4d_t *p4d = p4d_offset(pgd, *addr); pud_t *pud = pud_offset(p4d, *addr); + i_mmap_assert_write_locked(vma->vm_file->f_mapping); BUG_ON(page_count(virt_to_page(ptep)) == 0); if (page_count(virt_to_page(ptep)) == 1) return 0; @@ -5424,7 +5412,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) return NULL; } -int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) +int huge_pmd_unshare(struct mm_struct *mm, struct 
vm_area_struct *vma, + unsigned long *addr, pte_t *ptep) { return 0; } @@ -5694,12 +5683,14 @@ void __init hugetlb_cma_reserve(int order) reserved = 0; for_each_node_state(nid, N_ONLINE) { int res; + char name[20]; size = min(per_node, hugetlb_cma_size - reserved); size = round_up(size, PAGE_SIZE << order); + snprintf(name, 20, "hugetlb%d", nid); res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order, - 0, false, "hugetlb", + 0, false, name, &hugetlb_cma[nid], nid); if (res) { pr_warn("hugetlb_cma: reservation failed: err %d, node %d", diff --git a/mm/internal.h b/mm/internal.h index 9886db20d94f..d11a9a8d2135 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -239,6 +239,7 @@ struct compact_control { bool no_set_skip_hint; /* Don't mark blocks for skipping */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ + bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock or sched contention */ bool rescan; /* Rescanning the same pageblock */ @@ -612,5 +613,11 @@ static inline bool is_migrate_highatomic_page(struct page *page) } void setup_zone_pageset(struct zone *zone); -extern struct page *alloc_new_node_page(struct page *page, unsigned long node); + +struct migration_target_control { + int nid; /* preferred node id */ + nodemask_t *nmask; + gfp_t gfp_mask; +}; + #endif /* __MM_INTERNAL_H */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b52bd46ad146..15a9af791014 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1173,7 +1173,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address, true); - lru_cache_add_active_or_unevictable(new_page, vma); + lru_cache_add_inactive_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); 
update_mmu_cache_pmd(vma, address, pmd); @@ -480,7 +480,8 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) break; if (PageKsm(page)) ret = handle_mm_fault(vma, addr, - FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE); + FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, + NULL); else ret = VM_FAULT_WRITE; put_page(page); diff --git a/mm/maccess.c b/mm/maccess.c index f98ff91e32c6..3bd70405f2d8 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -205,15 +205,14 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) long copy_from_user_nofault(void *dst, const void __user *src, size_t size) { long ret = -EFAULT; - mm_segment_t old_fs = get_fs(); + mm_segment_t old_fs = force_uaccess_begin(); - set_fs(USER_DS); if (access_ok(src, size)) { pagefault_disable(); ret = __copy_from_user_inatomic(dst, src, size); pagefault_enable(); } - set_fs(old_fs); + force_uaccess_end(old_fs); if (ret) return -EFAULT; @@ -233,15 +232,14 @@ EXPORT_SYMBOL_GPL(copy_from_user_nofault); long copy_to_user_nofault(void __user *dst, const void *src, size_t size) { long ret = -EFAULT; - mm_segment_t old_fs = get_fs(); + mm_segment_t old_fs = force_uaccess_begin(); - set_fs(USER_DS); if (access_ok(dst, size)) { pagefault_disable(); ret = __copy_to_user_inatomic(dst, src, size); pagefault_enable(); } - set_fs(old_fs); + force_uaccess_end(old_fs); if (ret) return -EFAULT; @@ -270,17 +268,17 @@ EXPORT_SYMBOL_GPL(copy_to_user_nofault); long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count) { - mm_segment_t old_fs = get_fs(); + mm_segment_t old_fs; long ret; if (unlikely(count <= 0)) return 0; - set_fs(USER_DS); + old_fs = force_uaccess_begin(); pagefault_disable(); ret = strncpy_from_user(dst, unsafe_addr, count); pagefault_enable(); - set_fs(old_fs); + force_uaccess_end(old_fs); if (ret >= count) { ret = count; @@ -310,14 +308,14 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, */ long strnlen_user_nofault(const void 
__user *unsafe_addr, long count) { - mm_segment_t old_fs = get_fs(); + mm_segment_t old_fs; int ret; - set_fs(USER_DS); + old_fs = force_uaccess_begin(); pagefault_disable(); ret = strnlen_user(unsafe_addr, count); pagefault_enable(); - set_fs(old_fs); + force_uaccess_end(old_fs); return ret; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8d9ceea7fe4d..d59fd9af6e63 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -781,7 +781,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) if (mem_cgroup_disabled()) return; - if (vmstat_item_in_bytes(idx)) + if (memcg_stat_item_in_bytes(idx)) threshold <<= PAGE_SHIFT; x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]); @@ -1488,6 +1488,8 @@ static char *memory_stat_format(struct mem_cgroup *memcg) seq_buf_printf(&s, "slab %llu\n", (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) + memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B))); + seq_buf_printf(&s, "percpu %llu\n", + (u64)memcg_page_state(memcg, MEMCG_PERCPU_B)); seq_buf_printf(&s, "sock %llu\n", (u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE); @@ -1528,12 +1530,18 @@ static char *memory_stat_format(struct mem_cgroup *memcg) seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT), memcg_events(memcg, PGMAJFAULT)); - seq_buf_printf(&s, "workingset_refault %lu\n", - memcg_page_state(memcg, WORKINGSET_REFAULT)); - seq_buf_printf(&s, "workingset_activate %lu\n", - memcg_page_state(memcg, WORKINGSET_ACTIVATE)); + seq_buf_printf(&s, "workingset_refault_anon %lu\n", + memcg_page_state(memcg, WORKINGSET_REFAULT_ANON)); + seq_buf_printf(&s, "workingset_refault_file %lu\n", + memcg_page_state(memcg, WORKINGSET_REFAULT_FILE)); + seq_buf_printf(&s, "workingset_activate_anon %lu\n", + memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON)); + seq_buf_printf(&s, "workingset_activate_file %lu\n", + memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE)); + seq_buf_printf(&s, "workingset_restore %lu\n", + memcg_page_state(memcg, 
WORKINGSET_RESTORE_ANON)); seq_buf_printf(&s, "workingset_restore %lu\n", - memcg_page_state(memcg, WORKINGSET_RESTORE)); + memcg_page_state(memcg, WORKINGSET_RESTORE_FILE)); seq_buf_printf(&s, "workingset_nodereclaim %lu\n", memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); @@ -2414,7 +2422,7 @@ static void high_work_func(struct work_struct *work) * * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the * overage ratio to a delay. - * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down down the + * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the * proposed penalty in order to reduce to a reasonable number of jiffies, and * to produce a reasonable delay curve. * @@ -5129,13 +5137,18 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; - pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat); + /* We charge the parent cgroup, never the current task */ + WARN_ON_ONCE(!current->active_memcg); + + pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat, + GFP_KERNEL_ACCOUNT); if (!pn->lruvec_stat_local) { kfree(pn); return 1; } - pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat); + pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat, + GFP_KERNEL_ACCOUNT); if (!pn->lruvec_stat_cpu) { free_percpu(pn->lruvec_stat_local); kfree(pn); @@ -5209,11 +5222,16 @@ static struct mem_cgroup *mem_cgroup_alloc(void) goto fail; } - memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu); + /* We charge the parent cgroup, never the current task */ + WARN_ON_ONCE(!current->active_memcg); + + memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu, + GFP_KERNEL_ACCOUNT); if (!memcg->vmstats_local) goto fail; - memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu); + memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, + GFP_KERNEL_ACCOUNT); if (!memcg->vmstats_percpu) goto fail; @@ -5262,7 +5280,9 @@ mem_cgroup_css_alloc(struct 
cgroup_subsys_state *parent_css) struct mem_cgroup *memcg; long error = -ENOMEM; + memalloc_use_memcg(parent); memcg = mem_cgroup_alloc(); + memalloc_unuse_memcg(); if (IS_ERR(memcg)) return ERR_CAST(memcg); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 47b8ccb1fb9b..f1aa6433f404 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1648,9 +1648,12 @@ EXPORT_SYMBOL(unpoison_memory); static struct page *new_page(struct page *p, unsigned long private) { - int nid = page_to_nid(p); + struct migration_target_control mtc = { + .nid = page_to_nid(p), + .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + }; - return new_page_nodemask(p, nid, &node_states[N_MEMORY]); + return alloc_migration_target(p, (unsigned long)&mtc); } /* diff --git a/mm/memory.c b/mm/memory.c index c39a13b09602..228efaca75d3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -71,6 +71,8 @@ #include <linux/dax.h> #include <linux/oom.h> #include <linux/numa.h> +#include <linux/perf_event.h> +#include <linux/ptrace.h> #include <trace/events/kmem.h> @@ -1800,7 +1802,7 @@ out_unlock: * @pfn: source kernel pfn * @pgprot: pgprot flags for the inserted page * - * This is exactly like vmf_insert_pfn(), except that it allows drivers to + * This is exactly like vmf_insert_pfn(), except that it allows drivers * to override pgprot on a per-page basis. * * This only makes sense for IO mappings, and it makes no sense for @@ -1936,7 +1938,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, * @pfn: source kernel pfn * @pgprot: pgprot flags for the inserted page * - * This is exactly like vmf_insert_mixed(), except that it allows drivers to + * This is exactly like vmf_insert_mixed(), except that it allows drivers * to override pgprot on a per-page basis. 
* * Typically this function should be used by drivers to set caching- and @@ -2715,7 +2717,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); page_add_new_anon_rmap(new_page, vma, vmf->address, false); - lru_cache_add_active_or_unevictable(new_page, vma); + lru_cache_add_inactive_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary * mmu page tables (such as kvm shadow page tables), we want the @@ -3098,6 +3100,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) int locked; int exclusive = 0; vm_fault_t ret = 0; + void *shadow = NULL; if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) goto out; @@ -3149,13 +3152,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_page; } - /* - * XXX: Move to lru_cache_add() when it - * supports new vs putback - */ - spin_lock_irq(&page_pgdat(page)->lru_lock); - lru_note_cost_page(page); - spin_unlock_irq(&page_pgdat(page)->lru_lock); + shadow = get_shadow_from_swap_cache(entry); + if (shadow) + workingset_refault(page, shadow); lru_cache_add(page); swap_readpage(page, true); @@ -3266,10 +3265,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address, false); - lru_cache_add_active_or_unevictable(page, vma); + lru_cache_add_inactive_or_unevictable(page, vma); } else { do_page_add_anon_rmap(page, vma, vmf->address, exclusive); - activate_page(page); } swap_free(entry); @@ -3414,7 +3412,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); - lru_cache_add_active_or_unevictable(page, vma); + lru_cache_add_inactive_or_unevictable(page, vma); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); @@ -3672,7 +3670,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, 
struct page *page) if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); - lru_cache_add_active_or_unevictable(page, vma); + lru_cache_add_inactive_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false); @@ -4360,6 +4358,67 @@ retry_pud: return handle_pte_fault(&vmf); } +/** + * mm_account_fault - Do page fault accountings + * + * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting + * of perf event counters, but we'll still do the per-task accounting to + * the task who triggered this page fault. + * @address: the faulted address. + * @flags: the fault flags. + * @ret: the fault retcode. + * + * This will take care of most of the page fault accountings. Meanwhile, it + * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter + * updates. However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should + * still be in per-arch page fault handlers at the entry of page fault. + */ +static inline void mm_account_fault(struct pt_regs *regs, + unsigned long address, unsigned int flags, + vm_fault_t ret) +{ + bool major; + + /* + * We don't do accounting for some specific faults: + * + * - Unsuccessful faults (e.g. when the address wasn't valid). That + * includes arch_vma_access_permitted() failing before reaching here. + * So this is not a "this many hardware page faults" counter. We + * should use the hw profiling for that. + * + * - Incomplete faults (VM_FAULT_RETRY). They will only be counted + * once they're completed. + */ + if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY)) + return; + + /* + * We define the fault as a major fault when the final successful fault + * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't + * handle it immediately previously). 
+ */ + major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED); + + if (major) + current->maj_flt++; + else + current->min_flt++; + + /* + * If the fault is done for GUP, regs will be NULL. We only do the + * accounting for the per thread fault counters who triggered the + * fault, and we skip the perf event updates. + */ + if (!regs) + return; + + if (major) + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); + else + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); +} + /* * By the time we get here, we already hold the mm semaphore * @@ -4367,7 +4426,7 @@ retry_pud: * return value. See filemap_fault() and __lock_page_or_retry(). */ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, - unsigned int flags) + unsigned int flags, struct pt_regs *regs) { vm_fault_t ret; @@ -4408,6 +4467,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, mem_cgroup_oom_synchronize(false); } + mm_account_fault(regs, address, flags, ret); + return ret; } EXPORT_SYMBOL_GPL(handle_mm_fault); @@ -4681,7 +4742,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, void *maddr; struct page *page = NULL; - ret = get_user_pages_remote(tsk, mm, addr, 1, + ret = get_user_pages_remote(mm, addr, 1, gup_flags, &page, &vma, NULL); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ac6961abaa10..c32ead89c911 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -350,6 +350,16 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, return err; } +#ifdef CONFIG_NUMA +int __weak memory_add_physaddr_to_nid(u64 start) +{ + pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif + /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ static unsigned long find_smallest_section_pfn(int nid, struct 
zone *zone, unsigned long start_pfn, @@ -844,8 +854,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, node_states_set_node(nid, &arg); if (need_zonelists_rebuild) build_all_zonelists(NULL); - else - zone_pcp_update(zone); + zone_pcp_update(zone); init_per_zone_wmark_min(); @@ -1267,19 +1276,23 @@ found: static struct page *new_node_page(struct page *page, unsigned long private) { - int nid = page_to_nid(page); nodemask_t nmask = node_states[N_MEMORY]; + struct migration_target_control mtc = { + .nid = page_to_nid(page), + .nmask = &nmask, + .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + }; /* * try to allocate from a different node but reuse this node if there * are no other online nodes to be used (e.g. we are offlining a part * of the only existing node) */ - node_clear(nid, nmask); + node_clear(mtc.nid, nmask); if (nodes_empty(nmask)) - node_set(nid, nmask); + node_set(mtc.nid, nmask); - return new_page_nodemask(page, nid, &nmask); + return alloc_migration_target(page, (unsigned long)&mtc); } static int @@ -1747,7 +1760,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) */ rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb); if (rc) - goto done; + return rc; /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); @@ -1771,9 +1784,8 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) try_offline_node(nid); -done: mem_hotplug_done(); - return rc; + return 0; } /** diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b9e85d467352..afaa09ff9f6c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -129,7 +129,7 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES]; /** * numa_map_to_online_node - Find closest online node - * @nid: Node id to start the search + * @node: Node id to start the search * * Lookup the next closest node by distance if @nid is not online. 
*/ @@ -1065,27 +1065,6 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist, return 0; } -/* page allocation callback for NUMA node migration */ -struct page *alloc_new_node_page(struct page *page, unsigned long node) -{ - if (PageHuge(page)) - return alloc_huge_page_node(page_hstate(compound_head(page)), - node); - else if (PageTransHuge(page)) { - struct page *thp; - - thp = alloc_pages_node(node, - (GFP_TRANSHUGE | __GFP_THISNODE), - HPAGE_PMD_ORDER); - if (!thp) - return NULL; - prep_transhuge_page(thp); - return thp; - } else - return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE | - __GFP_THISNODE, 0); -} - /* * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. @@ -1096,6 +1075,10 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodemask_t nmask; LIST_HEAD(pagelist); int err = 0; + struct migration_target_control mtc = { + .nid = dest, + .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, + }; nodes_clear(nmask); node_set(source, nmask); @@ -1110,8 +1093,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest, - MIGRATE_SYNC, MR_SYSCALL); + err = migrate_pages(&pagelist, alloc_migration_target, NULL, + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); } @@ -1632,11 +1615,11 @@ static int kernel_get_mempolicy(int __user *policy, int pval; nodemask_t nodes; - addr = untagged_addr(addr); - if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; + addr = untagged_addr(addr); + err = do_get_mempolicy(&pval, &nodes, addr, flags); if (err) @@ -1890,7 +1873,7 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) * Return a nodemask representing a mempolicy for filtering nodes for * page allocation */ -static nodemask_t *policy_nodemask(gfp_t gfp, 
struct mempolicy *policy) +nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) { /* Lower zones don't get a nodemask applied for MPOL_BIND */ if (unlikely(policy->mode == MPOL_BIND) && diff --git a/mm/migrate.c b/mm/migrate.c index d179657f8685..5053439be6ab 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1418,22 +1418,35 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, enum migrate_mode mode, int reason) { int retry = 1; + int thp_retry = 1; int nr_failed = 0; int nr_succeeded = 0; + int nr_thp_succeeded = 0; + int nr_thp_failed = 0; + int nr_thp_split = 0; int pass = 0; + bool is_thp = false; struct page *page; struct page *page2; int swapwrite = current->flags & PF_SWAPWRITE; - int rc; + int rc, nr_subpages; if (!swapwrite) current->flags |= PF_SWAPWRITE; - for(pass = 0; pass < 10 && retry; pass++) { + for (pass = 0; pass < 10 && (retry || thp_retry); pass++) { retry = 0; + thp_retry = 0; list_for_each_entry_safe(page, page2, from, lru) { retry: + /* + * THP statistics is based on the source huge page. + * Capture required information that might get lost + * during migration. + */ + is_thp = PageTransHuge(page); + nr_subpages = hpage_nr_pages(page); cond_resched(); if (PageHuge(page)) @@ -1464,15 +1477,30 @@ retry: unlock_page(page); if (!rc) { list_safe_reset_next(page, page2, lru); + nr_thp_split++; goto retry; } } + if (is_thp) { + nr_thp_failed++; + nr_failed += nr_subpages; + goto out; + } nr_failed++; goto out; case -EAGAIN: + if (is_thp) { + thp_retry++; + break; + } retry++; break; case MIGRATEPAGE_SUCCESS: + if (is_thp) { + nr_thp_succeeded++; + nr_succeeded += nr_subpages; + break; + } nr_succeeded++; break; default: @@ -1482,19 +1510,27 @@ retry: * removed from migration page list and not * retried in the next outer loop. 
*/ + if (is_thp) { + nr_thp_failed++; + nr_failed += nr_subpages; + break; + } nr_failed++; break; } } } - nr_failed += retry; + nr_failed += retry + thp_retry; + nr_thp_failed += thp_retry; rc = nr_failed; out: - if (nr_succeeded) - count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); - if (nr_failed) - count_vm_events(PGMIGRATE_FAIL, nr_failed); - trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); + count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); + count_vm_events(PGMIGRATE_FAIL, nr_failed); + count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); + count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed); + count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split); + trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded, + nr_thp_failed, nr_thp_split, mode, reason); if (!swapwrite) current->flags &= ~PF_SWAPWRITE; @@ -1502,6 +1538,49 @@ out: return rc; } +struct page *alloc_migration_target(struct page *page, unsigned long private) +{ + struct migration_target_control *mtc; + gfp_t gfp_mask; + unsigned int order = 0; + struct page *new_page = NULL; + int nid; + int zidx; + + mtc = (struct migration_target_control *)private; + gfp_mask = mtc->gfp_mask; + nid = mtc->nid; + if (nid == NUMA_NO_NODE) + nid = page_to_nid(page); + + if (PageHuge(page)) { + struct hstate *h = page_hstate(compound_head(page)); + + gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); + return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask); + } + + if (PageTransHuge(page)) { + /* + * clear __GFP_RECLAIM to make the migration callback + * consistent with regular THP allocations. 
+ */ + gfp_mask &= ~__GFP_RECLAIM; + gfp_mask |= GFP_TRANSHUGE; + order = HPAGE_PMD_ORDER; + } + zidx = zone_idx(page_zone(page)); + if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE) + gfp_mask |= __GFP_HIGHMEM; + + new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask); + + if (new_page && PageTransHuge(new_page)) + prep_transhuge_page(new_page); + + return new_page; +} + #ifdef CONFIG_NUMA static int store_status(int __user *status, int start, int value, int nr) @@ -1519,9 +1598,13 @@ static int do_move_pages_to_node(struct mm_struct *mm, struct list_head *pagelist, int node) { int err; + struct migration_target_control mtc = { + .nid = node, + .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, + }; - err = migrate_pages(pagelist, alloc_new_node_page, NULL, node, - MIGRATE_SYNC, MR_SYSCALL); + err = migrate_pages(pagelist, alloc_migration_target, NULL, + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(pagelist); return err; @@ -2168,6 +2251,16 @@ static int migrate_vma_collect_hole(unsigned long start, struct migrate_vma *migrate = walk->private; unsigned long addr; + /* Only allow populating anonymous memory. 
*/ + if (!vma_is_anonymous(walk->vma)) { + for (addr = start; addr < end; addr += PAGE_SIZE) { + migrate->src[migrate->npages] = 0; + migrate->dst[migrate->npages] = 0; + migrate->npages++; + } + return 0; + } + for (addr = start; addr < end; addr += PAGE_SIZE) { migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; migrate->dst[migrate->npages] = 0; @@ -2260,8 +2353,10 @@ again: pte = *ptep; if (pte_none(pte)) { - mpfn = MIGRATE_PFN_MIGRATE; - migrate->cpages++; + if (vma_is_anonymous(vma)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + } goto next; } @@ -2619,7 +2714,7 @@ restore: /** * migrate_vma_setup() - prepare to migrate a range of memory - * @args: contains the vma, start, and and pfns arrays for the migration + * @args: contains the vma, start, and pfns arrays for the migration * * Returns: negative errno on failures, 0 when 0 or more pages were migrated * without an error. @@ -2830,7 +2925,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, addr, false); if (!is_zone_device_page(page)) - lru_cache_add_active_or_unevictable(page, vma); + lru_cache_add_inactive_or_unevictable(page, vma); get_page(page); if (flush) { diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 352bb9f3ecc0..4fc918163dd3 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -166,7 +166,7 @@ static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions) /** * mmu_interval_read_begin - Begin a read side critical section against a VA * range - * interval_sub: The interval subscription + * @interval_sub: The interval subscription * * mmu_iterval_read_begin()/mmu_iterval_read_retry() implement a * collision-retry scheme similar to seqcount for the VA range under @@ -686,7 +686,7 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_register); /** * mmu_notifier_register - Register a notifier on a mm - * @mn: The notifier to attach + * @subscription: The notifier to attach * @mm: The mm to 
attach the notifier to * * Must not hold mmap_lock nor any other VM related lock when calling @@ -856,7 +856,7 @@ static void mmu_notifier_free_rcu(struct rcu_head *rcu) /** * mmu_notifier_put - Release the reference on the notifier - * @mn: The notifier to act on + * @subscription: The notifier to act on * * This function must be paired with each mmu_notifier_get(), it releases the * reference obtained by the get. If this is the last reference then process @@ -965,7 +965,8 @@ static int __mmu_interval_notifier_insert( * @interval_sub: Interval subscription to register * @start: Starting virtual address to monitor * @length: Length of the range to monitor - * @mm : mm_struct to attach to + * @mm: mm_struct to attach to + * @ops: Interval notifier operations to be called on matching events * * This function subscribes the interval notifier for notifications from the * mm. Upon return the ops related to mmu_interval_notifier will be called diff --git a/mm/nommu.c b/mm/nommu.c index 340ae7774c13..75a327149af1 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1762,8 +1762,8 @@ EXPORT_SYMBOL_GPL(access_process_vm); * @newsize: The proposed filesize of the inode * * Check the shared mappings on an inode on behalf of a shrinking truncate to - * make sure that that any outstanding VMAs aren't broken and then shrink the - * vm_regions that extend that beyond so that do_mmap() doesn't + * make sure that any outstanding VMAs aren't broken and then shrink the + * vm_regions that extend beyond so that do_mmap() doesn't * automatically grant mappings that are too large. */ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d30ce75f23fb..e90f25d6385d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -196,17 +196,17 @@ static bool is_dump_unreclaim_slabs(void) * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. 
*/ -unsigned long oom_badness(struct task_struct *p, unsigned long totalpages) +long oom_badness(struct task_struct *p, unsigned long totalpages) { long points; long adj; if (oom_unkillable_task(p)) - return 0; + return LONG_MIN; p = find_lock_task_mm(p); if (!p) - return 0; + return LONG_MIN; /* * Do not even consider tasks which are explicitly marked oom @@ -218,7 +218,7 @@ unsigned long oom_badness(struct task_struct *p, unsigned long totalpages) test_bit(MMF_OOM_SKIP, &p->mm->flags) || in_vfork(p)) { task_unlock(p); - return 0; + return LONG_MIN; } /* @@ -233,11 +233,7 @@ unsigned long oom_badness(struct task_struct *p, unsigned long totalpages) adj *= totalpages / 1000; points += adj; - /* - * Never return 0 for an eligible task regardless of the root bonus and - * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). - */ - return points > 0 ? points : 1; + return points; } static const char * const oom_constraint_text[] = { @@ -310,7 +306,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) static int oom_evaluate_task(struct task_struct *task, void *arg) { struct oom_control *oc = arg; - unsigned long points; + long points; if (oom_unkillable_task(task)) goto next; @@ -336,12 +332,12 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) * killed first if it triggers an oom, then select it. 
*/ if (oom_task_origin(task)) { - points = ULONG_MAX; + points = LONG_MAX; goto select; } points = oom_badness(task, oc->totalpages); - if (!points || points < oc->chosen_points) + if (points == LONG_MIN || points < oc->chosen_points) goto next; select: @@ -365,6 +361,8 @@ abort: */ static void select_bad_process(struct oom_control *oc) { + oc->chosen_points = LONG_MIN; + if (is_memcg_oom(oc)) mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc); else { @@ -863,6 +861,8 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) p = find_lock_task_mm(victim); if (!p) { + pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n", + message, task_pid_nr(victim), victim->comm); put_task_struct(victim); return; } else if (victim != p) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 167732f4d124..8b7d0ecf30b1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4282,7 +4282,7 @@ retry: /* * If an allocation failed after direct reclaim, it could be because * pages are pinned on the per-cpu lists or in high alloc reserves. - * Shrink them them and try again + * Shrink them and try again */ if (!page && !drained) { unreserve_highatomic_pageblock(ac, false); @@ -6192,7 +6192,7 @@ static int zone_batchsize(struct zone *zone) * locking. * * Any new users of pcp->batch and pcp->high should ensure they can cope with - * those fields changing asynchronously (acording the the above rule). + * those fields changing asynchronously (acording to the above rule). * * mutex_is_locked(&pcp_batch_high_lock) required when calling this function * outside of boot time (or some other assurance that no concurrent updaters @@ -8203,7 +8203,7 @@ void *__init alloc_large_system_hash(const char *tablename, * race condition. So you can't expect this function should be exact. * * Returns a page without holding a reference. 
If the caller wants to - * dereference that page (e.g., dumping), it has to make sure that that it + * dereference that page (e.g., dumping), it has to make sure that it * cannot get removed (e.g., via memory unplug) concurrently. * */ @@ -8347,6 +8347,10 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, unsigned long pfn = start; unsigned int tries = 0; int ret = 0; + struct migration_target_control mtc = { + .nid = zone_to_nid(cc->zone), + .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + }; migrate_prep(); @@ -8373,8 +8377,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, &cc->migratepages); cc->nr_migratepages -= nr_reclaimed; - ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - NULL, 0, cc->mode, MR_CONTIG_RANGE); + ret = migrate_pages(&cc->migratepages, alloc_migration_target, + NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE); } if (ret < 0) { putback_movable_pages(&cc->migratepages); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f6d07c5f0d34..242c03121d73 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -306,8 +306,3 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, return pfn < end_pfn ? -EBUSY : 0; } - -struct page *alloc_migrate_target(struct page *page, unsigned long private) -{ - return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]); -} diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 0468ba500bd4..18b768ac7dca 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -6,6 +6,25 @@ #include <linux/percpu.h> /* + * There are two chunk types: root and memcg-aware. + * Chunks of each type have separate slots list. + * + * Memcg-aware chunks have an attached vector of obj_cgroup pointers, which is + * used to store memcg membership data of a percpu object. Obj_cgroups are + * ref-counted pointers to a memory cgroup with an ability to switch dynamically + * to the parent memory cgroup. 
This allows to reclaim a deleted memory cgroup + * without reclaiming of all outstanding objects, which hold a reference at it. + */ +enum pcpu_chunk_type { + PCPU_CHUNK_ROOT, +#ifdef CONFIG_MEMCG_KMEM + PCPU_CHUNK_MEMCG, +#endif + PCPU_NR_CHUNK_TYPES, + PCPU_FAIL_ALLOC = PCPU_NR_CHUNK_TYPES +}; + +/* * pcpu_block_md is the metadata block struct. * Each chunk's bitmap is split into a number of full blocks. * All units are in terms of bits. @@ -54,6 +73,9 @@ struct pcpu_chunk { int end_offset; /* additional area required to have the region end page aligned */ +#ifdef CONFIG_MEMCG_KMEM + struct obj_cgroup **obj_cgroups; /* vector of object cgroups */ +#endif int nr_pages; /* # of pages served by this chunk */ int nr_populated; /* # of populated pages */ @@ -63,7 +85,7 @@ struct pcpu_chunk { extern spinlock_t pcpu_lock; -extern struct list_head *pcpu_slot; +extern struct list_head *pcpu_chunk_lists; extern int pcpu_nr_slots; extern int pcpu_nr_empty_pop_pages; @@ -106,6 +128,37 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) return pcpu_nr_pages_to_map_bits(chunk->nr_pages); } +#ifdef CONFIG_MEMCG_KMEM +static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk) +{ + if (chunk->obj_cgroups) + return PCPU_CHUNK_MEMCG; + return PCPU_CHUNK_ROOT; +} + +static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type) +{ + return chunk_type == PCPU_CHUNK_MEMCG; +} + +#else +static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk) +{ + return PCPU_CHUNK_ROOT; +} + +static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type) +{ + return false; +} +#endif + +static inline struct list_head *pcpu_chunk_list(enum pcpu_chunk_type chunk_type) +{ + return &pcpu_chunk_lists[pcpu_nr_slots * + pcpu_is_memcg_chunk(chunk_type)]; +} + #ifdef CONFIG_PERCPU_STATS #include <linux/spinlock.h> diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 20d2b69a13b0..35c9941077ee 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c 
@@ -44,7 +44,8 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, /* nada */ } -static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) +static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type, + gfp_t gfp) { const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; struct pcpu_chunk *chunk; @@ -52,7 +53,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) unsigned long flags; int i; - chunk = pcpu_alloc_chunk(gfp); + chunk = pcpu_alloc_chunk(type, gfp); if (!chunk) return NULL; diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index 32558063c3f9..c8400a2adbc2 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -34,11 +34,15 @@ static int find_max_nr_alloc(void) { struct pcpu_chunk *chunk; int slot, max_nr_alloc; + enum pcpu_chunk_type type; max_nr_alloc = 0; - for (slot = 0; slot < pcpu_nr_slots; slot++) - list_for_each_entry(chunk, &pcpu_slot[slot], list) - max_nr_alloc = max(max_nr_alloc, chunk->nr_alloc); + for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) + for (slot = 0; slot < pcpu_nr_slots; slot++) + list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot], + list) + max_nr_alloc = max(max_nr_alloc, + chunk->nr_alloc); return max_nr_alloc; } @@ -129,6 +133,9 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, P("cur_min_alloc", cur_min_alloc); P("cur_med_alloc", cur_med_alloc); P("cur_max_alloc", cur_max_alloc); +#ifdef CONFIG_MEMCG_KMEM + P("memcg_aware", pcpu_is_memcg_chunk(pcpu_chunk_type(chunk))); +#endif seq_putc(m, '\n'); } @@ -137,6 +144,7 @@ static int percpu_stats_show(struct seq_file *m, void *v) struct pcpu_chunk *chunk; int slot, max_nr_alloc; int *buffer; + enum pcpu_chunk_type type; alloc_buffer: spin_lock_irq(&pcpu_lock); @@ -202,18 +210,18 @@ alloc_buffer: chunk_map_stats(m, pcpu_reserved_chunk, buffer); } - for (slot = 0; slot < pcpu_nr_slots; slot++) { - list_for_each_entry(chunk, &pcpu_slot[slot], list) { - if (chunk == pcpu_first_chunk) { - seq_puts(m, "Chunk: <- First 
Chunk\n"); - chunk_map_stats(m, chunk, buffer); - - - } else { - seq_puts(m, "Chunk:\n"); - chunk_map_stats(m, chunk, buffer); + for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) { + for (slot = 0; slot < pcpu_nr_slots; slot++) { + list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot], + list) { + if (chunk == pcpu_first_chunk) { + seq_puts(m, "Chunk: <- First Chunk\n"); + chunk_map_stats(m, chunk, buffer); + } else { + seq_puts(m, "Chunk:\n"); + chunk_map_stats(m, chunk, buffer); + } } - } } diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index a2b395acef89..e46f7a6917f9 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -328,12 +328,13 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, pcpu_free_pages(chunk, pages, page_start, page_end); } -static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) +static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type, + gfp_t gfp) { struct pcpu_chunk *chunk; struct vm_struct **vms; - chunk = pcpu_alloc_chunk(gfp); + chunk = pcpu_alloc_chunk(type, gfp); if (!chunk) return NULL; diff --git a/mm/percpu.c b/mm/percpu.c index b626766160ce..f4709629e6de 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -37,9 +37,14 @@ * takes care of normal allocations. * * The allocator organizes chunks into lists according to free size and - * tries to allocate from the fullest chunk first. Each chunk is managed - * by a bitmap with metadata blocks. The allocation map is updated on - * every allocation and free to reflect the current state while the boundary + * memcg-awareness. To make a percpu allocation memcg-aware the __GFP_ACCOUNT + * flag should be passed. All memcg-aware allocations are sharing one set + * of chunks and all unaccounted allocations and allocations performed + * by processes belonging to the root memory cgroup are using the second set. + * + * The allocator tries to allocate from the fullest chunk first. Each chunk + * is managed by a bitmap with metadata blocks. 
The allocation map is updated + * on every allocation and free to reflect the current state while the boundary * map is only updated on allocation. Each metadata block contains * information to help mitigate the need to iterate over large portions * of the bitmap. The reverse mapping from page to chunk is stored in @@ -81,6 +86,7 @@ #include <linux/kmemleak.h> #include <linux/sched.h> #include <linux/sched/mm.h> +#include <linux/memcontrol.h> #include <asm/cacheflush.h> #include <asm/sections.h> @@ -160,7 +166,7 @@ struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init; DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ -struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */ +struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */ /* chunks which need their map areas extended, protected by pcpu_lock */ static LIST_HEAD(pcpu_map_extend_chunks); @@ -500,6 +506,9 @@ static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot, bool move_front) { if (chunk != pcpu_reserved_chunk) { + struct list_head *pcpu_slot; + + pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk)); if (move_front) list_move(&chunk->list, &pcpu_slot[slot]); else @@ -1211,11 +1220,14 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, * * This function determines the size of an allocation to free using * the boundary bitmap and clears the allocation map. + * + * RETURNS: + * Number of freed bytes. 
*/ -static void pcpu_free_area(struct pcpu_chunk *chunk, int off) +static int pcpu_free_area(struct pcpu_chunk *chunk, int off) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; - int bit_off, bits, end, oslot; + int bit_off, bits, end, oslot, freed; lockdep_assert_held(&pcpu_lock); pcpu_stats_area_dealloc(chunk); @@ -1230,8 +1242,10 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int off) bits = end - bit_off; bitmap_clear(chunk->alloc_map, bit_off, bits); + freed = bits * PCPU_MIN_ALLOC_SIZE; + /* update metadata */ - chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE; + chunk->free_bytes += freed; /* update first free bit */ chunk_md->first_free = min(chunk_md->first_free, bit_off); @@ -1239,6 +1253,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int off) pcpu_block_update_hint_free(chunk, bit_off, bits); pcpu_chunk_relocate(chunk, oslot); + + return freed; } static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits) @@ -1334,6 +1350,10 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); +#ifdef CONFIG_MEMCG_KMEM + /* first chunk isn't memcg-aware */ + chunk->obj_cgroups = NULL; +#endif pcpu_init_md_blocks(chunk); /* manage populated page bitmap */ @@ -1373,7 +1393,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, return chunk; } -static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) +static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp) { struct pcpu_chunk *chunk; int region_bits; @@ -1401,6 +1421,16 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) if (!chunk->md_blocks) goto md_blocks_fail; +#ifdef CONFIG_MEMCG_KMEM + if (pcpu_is_memcg_chunk(type)) { + chunk->obj_cgroups = + pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) * + sizeof(struct obj_cgroup *), gfp); + if (!chunk->obj_cgroups) + goto objcg_fail; + } +#endif + pcpu_init_md_blocks(chunk); /* init metadata */ 
@@ -1408,6 +1438,10 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) return chunk; +#ifdef CONFIG_MEMCG_KMEM +objcg_fail: + pcpu_mem_free(chunk->md_blocks); +#endif md_blocks_fail: pcpu_mem_free(chunk->bound_map); bound_map_fail: @@ -1422,6 +1456,9 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; +#ifdef CONFIG_MEMCG_KMEM + pcpu_mem_free(chunk->obj_cgroups); +#endif pcpu_mem_free(chunk->md_blocks); pcpu_mem_free(chunk->bound_map); pcpu_mem_free(chunk->alloc_map); @@ -1498,7 +1535,8 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end, gfp_t gfp); static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end); -static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp); +static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type, + gfp_t gfp); static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); static struct page *pcpu_addr_to_page(void *addr); static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); @@ -1540,6 +1578,87 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) return pcpu_get_page_chunk(pcpu_addr_to_page(addr)); } +#ifdef CONFIG_MEMCG_KMEM +static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, + struct obj_cgroup **objcgp) +{ + struct obj_cgroup *objcg; + + if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT) || + memcg_kmem_bypass()) + return PCPU_CHUNK_ROOT; + + objcg = get_obj_cgroup_from_current(); + if (!objcg) + return PCPU_CHUNK_ROOT; + + if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) { + obj_cgroup_put(objcg); + return PCPU_FAIL_ALLOC; + } + + *objcgp = objcg; + return PCPU_CHUNK_MEMCG; +} + +static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, + struct pcpu_chunk *chunk, int off, + size_t size) +{ + if (!objcg) + return; + + if (chunk) { + chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg; + + rcu_read_lock(); + 
mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, + size * num_possible_cpus()); + rcu_read_unlock(); + } else { + obj_cgroup_uncharge(objcg, size * num_possible_cpus()); + obj_cgroup_put(objcg); + } +} + +static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) +{ + struct obj_cgroup *objcg; + + if (!pcpu_is_memcg_chunk(pcpu_chunk_type(chunk))) + return; + + objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT]; + chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL; + + obj_cgroup_uncharge(objcg, size * num_possible_cpus()); + + rcu_read_lock(); + mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, + -(size * num_possible_cpus())); + rcu_read_unlock(); + + obj_cgroup_put(objcg); +} + +#else /* CONFIG_MEMCG_KMEM */ +static enum pcpu_chunk_type +pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp) +{ + return PCPU_CHUNK_ROOT; +} + +static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, + struct pcpu_chunk *chunk, int off, + size_t size) +{ +} + +static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + /** * pcpu_alloc - the percpu allocator * @size: size of area to allocate in bytes @@ -1561,6 +1680,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, gfp_t pcpu_gfp; bool is_atomic; bool do_warn; + enum pcpu_chunk_type type; + struct list_head *pcpu_slot; + struct obj_cgroup *objcg = NULL; static int warn_limit = 10; struct pcpu_chunk *chunk, *next; const char *err; @@ -1595,16 +1717,23 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, return NULL; } + type = pcpu_memcg_pre_alloc_hook(size, gfp, &objcg); + if (unlikely(type == PCPU_FAIL_ALLOC)) + return NULL; + pcpu_slot = pcpu_chunk_list(type); + if (!is_atomic) { /* * pcpu_balance_workfn() allocates memory under this mutex, * and it may wait for memory reclaim. 
Allow current task * to become OOM victim, in case of memory pressure. */ - if (gfp & __GFP_NOFAIL) + if (gfp & __GFP_NOFAIL) { mutex_lock(&pcpu_alloc_mutex); - else if (mutex_lock_killable(&pcpu_alloc_mutex)) + } else if (mutex_lock_killable(&pcpu_alloc_mutex)) { + pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size); return NULL; + } } spin_lock_irqsave(&pcpu_lock, flags); @@ -1659,7 +1788,7 @@ restart: } if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { - chunk = pcpu_create_chunk(pcpu_gfp); + chunk = pcpu_create_chunk(type, pcpu_gfp); if (!chunk) { err = "failed to allocate new chunk"; goto fail; @@ -1716,6 +1845,8 @@ area_found: trace_percpu_alloc_percpu(reserved, is_atomic, size, align, chunk->base_addr, off, ptr); + pcpu_memcg_post_alloc_hook(objcg, chunk, off, size); + return ptr; fail_unlock: @@ -1737,6 +1868,9 @@ fail: } else { mutex_unlock(&pcpu_alloc_mutex); } + + pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size); + return NULL; } @@ -1796,8 +1930,8 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) } /** - * pcpu_balance_workfn - manage the amount of free chunks and populated pages - * @work: unused + * __pcpu_balance_workfn - manage the amount of free chunks and populated pages + * @type: chunk type * * Reclaim all fully free chunks except for the first one. This is also * responsible for maintaining the pool of empty populated pages. However, @@ -1806,11 +1940,12 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) * allocation causes the failure as it is possible that requests can be * serviced from already backed regions. 
*/ -static void pcpu_balance_workfn(struct work_struct *work) +static void __pcpu_balance_workfn(enum pcpu_chunk_type type) { /* gfp flags passed to underlying allocators */ const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; LIST_HEAD(to_free); + struct list_head *pcpu_slot = pcpu_chunk_list(type); struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; int slot, nr_to_pop, ret; @@ -1908,7 +2043,7 @@ retry_pop: if (nr_to_pop) { /* ran out of chunks to populate, create a new one and retry */ - chunk = pcpu_create_chunk(gfp); + chunk = pcpu_create_chunk(type, gfp); if (chunk) { spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); @@ -1921,6 +2056,20 @@ retry_pop: } /** + * pcpu_balance_workfn - manage the amount of free chunks and populated pages + * @work: unused + * + * Call __pcpu_balance_workfn() for each chunk type. + */ +static void pcpu_balance_workfn(struct work_struct *work) +{ + enum pcpu_chunk_type type; + + for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) + __pcpu_balance_workfn(type); +} + +/** * free_percpu - free percpu area * @ptr: pointer to area to free * @@ -1934,8 +2083,9 @@ void free_percpu(void __percpu *ptr) void *addr; struct pcpu_chunk *chunk; unsigned long flags; - int off; + int size, off; bool need_balance = false; + struct list_head *pcpu_slot; if (!ptr) return; @@ -1949,7 +2099,11 @@ void free_percpu(void __percpu *ptr) chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->base_addr; - pcpu_free_area(chunk, off); + size = pcpu_free_area(chunk, off); + + pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk)); + + pcpu_memcg_free_hook(chunk, off, size); /* if there are more than one fully free chunks, wake up grim reaper */ if (chunk->free_bytes == pcpu_unit_size) { @@ -2260,6 +2414,7 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, int map_size; unsigned long tmp_addr; size_t alloc_size; + enum pcpu_chunk_type type; #define PCPU_SETUP_BUG_ON(cond) do { \ if 
(unlikely(cond)) { \ @@ -2377,13 +2532,18 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, * empty chunks. */ pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; - pcpu_slot = memblock_alloc(pcpu_nr_slots * sizeof(pcpu_slot[0]), - SMP_CACHE_BYTES); - if (!pcpu_slot) + pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots * + sizeof(pcpu_chunk_lists[0]) * + PCPU_NR_CHUNK_TYPES, + SMP_CACHE_BYTES); + if (!pcpu_chunk_lists) panic("%s: Failed to allocate %zu bytes\n", __func__, - pcpu_nr_slots * sizeof(pcpu_slot[0])); - for (i = 0; i < pcpu_nr_slots; i++) - INIT_LIST_HEAD(&pcpu_slot[i]); + pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) * + PCPU_NR_CHUNK_TYPES); + + for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) + for (i = 0; i < pcpu_nr_slots; i++) + INIT_LIST_HEAD(&pcpu_chunk_list(type)[i]); /* * The end of the static region needs to be aligned with the diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index cc85ce81914a..29c052099aff 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -105,7 +105,7 @@ static int process_vm_rw_single_vec(unsigned long addr, * current/current->mm */ mmap_read_lock(mm); - pinned_pages = pin_user_pages_remote(task, mm, pa, pinned_pages, + pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages, flags, process_pages, NULL, &locked); if (locked) diff --git a/mm/rmap.c b/mm/rmap.c index 5fe2dedce1fc..6cce9ef06753 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1469,7 +1469,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * do this outside rmap routines. */ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); - if (huge_pmd_unshare(mm, &address, pvmw.pte)) { + if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { /* * huge_pmd_unshare unmapped an entire PMD * page. 
There is no way of knowing exactly diff --git a/mm/shmem.c b/mm/shmem.c index eb6b36d89722..271548ca20f3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1434,7 +1434,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) list_add(&info->swaplist, &shmem_swaplist); if (add_to_swap_cache(page, swap, - __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN) == 0) { + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, + NULL) == 0) { spin_lock_irq(&info->lock); shmem_recalc_inode(inode); info->swapped++; @@ -1685,7 +1686,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * Swap in the page pointed to by *pagep. * Caller has to make sure that *pagep contains a valid swapped page. * Returns 0 and the page in pagep if success. On failure, returns the - * the error code and NULL in *pagep. + * error code and NULL in *pagep. */ static int shmem_swapin_page(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, diff --git a/mm/slab_common.c b/mm/slab_common.c index a513f3237155..f9ccd5dc13f3 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -419,7 +419,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) /* * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the * @slab_caches_to_rcu_destroy list. The slab pages are freed - * through RCU and and the associated kmem_cache are dereferenced + * through RCU and the associated kmem_cache are dereferenced * while freeing the pages, so the kmem_caches should be freed only * after the pending RCU operations are finished. 
As rcu_barrier() * is a pretty slow operation, we batch all pending destructions diff --git a/mm/swap.c b/mm/swap.c index de257c0a89b1..9285e60c7d6e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -476,23 +476,24 @@ void lru_cache_add(struct page *page) EXPORT_SYMBOL(lru_cache_add); /** - * lru_cache_add_active_or_unevictable + * lru_cache_add_inactive_or_unevictable * @page: the page to be added to LRU * @vma: vma in which page is mapped for determining reclaimability * - * Place @page on the active or unevictable LRU list, depending on its + * Place @page on the inactive or unevictable LRU list, depending on its * evictability. Note that if the page is not evictable, it goes * directly back onto it's zone's unevictable list, it does NOT use a * per cpu pagevec. */ -void lru_cache_add_active_or_unevictable(struct page *page, +void lru_cache_add_inactive_or_unevictable(struct page *page, struct vm_area_struct *vma) { + bool unevictable; + VM_BUG_ON_PAGE(PageLRU(page), page); - if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) - SetPageActive(page); - else if (!TestSetPageMlocked(page)) { + unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED; + if (unlikely(unevictable) && !TestSetPageMlocked(page)) { /* * We use the irq-unsafe __mod_zone_page_stat because this * counter is not modified from interrupt context, and the pte diff --git a/mm/swap_state.c b/mm/swap_state.c index e82f4f8b1f63..b73aabdfd35a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -106,16 +106,32 @@ void show_swap_cache_info(void) printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); } +void *get_shadow_from_swap_cache(swp_entry_t entry) +{ + struct address_space *address_space = swap_address_space(entry); + pgoff_t idx = swp_offset(entry); + struct page *page; + + page = find_get_entry(address_space, idx); + if (xa_is_value(page)) + return page; + if (page) + put_page(page); + return NULL; +} + /* * add_to_swap_cache resembles 
add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) +int add_to_swap_cache(struct page *page, swp_entry_t entry, + gfp_t gfp, void **shadowp) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); unsigned long i, nr = hpage_nr_pages(page); + void *old; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapCache(page), page); @@ -125,16 +141,25 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) SetPageSwapCache(page); do { + unsigned long nr_shadows = 0; + xas_lock_irq(&xas); xas_create_range(&xas); if (xas_error(&xas)) goto unlock; for (i = 0; i < nr; i++) { VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); + old = xas_load(&xas); + if (xa_is_value(old)) { + nr_shadows++; + if (shadowp) + *shadowp = old; + } set_page_private(page + i, entry.val + i); xas_store(&xas, page); xas_next(&xas); } + address_space->nrexceptional -= nr_shadows; address_space->nrpages += nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); ADD_CACHE_INFO(add_total, nr); @@ -154,7 +179,8 @@ unlock: * This must be called only on pages that have * been verified to be in the swap cache. 
*/ -void __delete_from_swap_cache(struct page *page, swp_entry_t entry) +void __delete_from_swap_cache(struct page *page, + swp_entry_t entry, void *shadow) { struct address_space *address_space = swap_address_space(entry); int i, nr = hpage_nr_pages(page); @@ -166,12 +192,14 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry) VM_BUG_ON_PAGE(PageWriteback(page), page); for (i = 0; i < nr; i++) { - void *entry = xas_store(&xas, NULL); + void *entry = xas_store(&xas, shadow); VM_BUG_ON_PAGE(entry != page, entry); set_page_private(page + i, 0); xas_next(&xas); } ClearPageSwapCache(page); + if (shadow) + address_space->nrexceptional += nr; address_space->nrpages -= nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); ADD_CACHE_INFO(del_total, nr); @@ -208,7 +236,7 @@ int add_to_swap(struct page *page) * Add it to the swap cache. */ err = add_to_swap_cache(page, entry, - __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); + __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL); if (err) /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely @@ -246,13 +274,44 @@ void delete_from_swap_cache(struct page *page) struct address_space *address_space = swap_address_space(entry); xa_lock_irq(&address_space->i_pages); - __delete_from_swap_cache(page, entry); + __delete_from_swap_cache(page, entry, NULL); xa_unlock_irq(&address_space->i_pages); put_swap_page(page, entry); page_ref_sub(page, hpage_nr_pages(page)); } +void clear_shadow_from_swap_cache(int type, unsigned long begin, + unsigned long end) +{ + unsigned long curr = begin; + void *old; + + for (;;) { + unsigned long nr_shadows = 0; + swp_entry_t entry = swp_entry(type, curr); + struct address_space *address_space = swap_address_space(entry); + XA_STATE(xas, &address_space->i_pages, curr); + + xa_lock_irq(&address_space->i_pages); + xas_for_each(&xas, old, end) { + if (!xa_is_value(old)) + continue; + xas_store(&xas, NULL); + nr_shadows++; + } + address_space->nrexceptional -= nr_shadows; + 
xa_unlock_irq(&address_space->i_pages); + + /* search the next swapcache until we meet end */ + curr >>= SWAP_ADDRESS_SPACE_SHIFT; + curr++; + curr <<= SWAP_ADDRESS_SPACE_SHIFT; + if (curr > end) + break; + } +} + /* * If we are the only user, then try to free up the swap cache. * @@ -361,6 +420,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, { struct swap_info_struct *si; struct page *page; + void *shadow = NULL; *new_page_allocated = false; @@ -429,7 +489,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, __SetPageSwapBacked(page); /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK)) { + if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) { put_swap_page(page, entry); goto fail_unlock; } @@ -439,10 +499,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, goto fail_unlock; } - /* XXX: Move to lru_cache_add() when it supports new vs putback */ - spin_lock_irq(&page_pgdat(page)->lru_lock); - lru_note_cost_page(page); - spin_unlock_irq(&page_pgdat(page)->lru_lock); + if (shadow) + workingset_refault(page, shadow); /* Caller will initiate read into locked page */ SetPageWorkingset(page); diff --git a/mm/swapfile.c b/mm/swapfile.c index 6c26916e95fd..e653eea1eb88 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -696,6 +696,7 @@ static void add_to_avail_list(struct swap_info_struct *p) static void swap_range_free(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { + unsigned long begin = offset; unsigned long end = offset + nr_entries - 1; void (*swap_slot_free_notify)(struct block_device *, unsigned long); @@ -721,6 +722,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_slot_free_notify(si->bdev, offset); offset++; } + clear_shadow_from_swap_cache(si->type, begin, end); } static void set_cluster_next(struct swap_info_struct *si, 
unsigned long next) @@ -1915,7 +1917,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, page_add_anon_rmap(page, vma, addr, false); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr, false); - lru_cache_add_active_or_unevictable(page, vma); + lru_cache_add_inactive_or_unevictable(page, vma); } swap_free(entry); /* diff --git a/mm/usercopy.c b/mm/usercopy.c index 660717a1ea5c..b3de3c4eefba 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -43,7 +43,7 @@ static noinline int check_stack_object(const void *obj, unsigned long len) /* * Reject: object partially overlaps the stack (passing the - * the check above means at least one end is within the stack, + * check above means at least one end is within the stack, * so if this check fails, the other end is outside the stack). */ if (obj < stack || stackend < obj + len) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index b80419320c7d..9a3d451402d7 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -123,7 +123,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, inc_mm_counter(dst_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, dst_vma, dst_addr, false); - lru_cache_add_active_or_unevictable(page, dst_vma); + lru_cache_add_inactive_or_unevictable(page, dst_vma); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); diff --git a/mm/vmscan.c b/mm/vmscan.c index 72da290b171b..738115ed75e2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -854,6 +854,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, { unsigned long flags; int refcount; + void *shadow = NULL; BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); @@ -896,13 +897,13 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; mem_cgroup_swapout(page, swap); - __delete_from_swap_cache(page, swap); + if (reclaimed && !mapping_exiting(mapping)) + shadow = 
workingset_eviction(page, target_memcg); + __delete_from_swap_cache(page, swap, shadow); xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); - workingset_eviction(page, target_memcg); } else { void (*freepage)(struct page *); - void *shadow = NULL; freepage = mapping->a_ops->freepage; /* @@ -998,8 +999,6 @@ static enum page_references page_check_references(struct page *page, return PAGEREF_RECLAIM; if (referenced_ptes) { - if (PageSwapBacked(page)) - return PAGEREF_ACTIVATE; /* * All mapped pages start out with page table * references from the instantiating fault, so we need @@ -1022,7 +1021,7 @@ static enum page_references page_check_references(struct page *page, /* * Activate file-backed executable pages after first usage. */ - if (vm_flags & VM_EXEC) + if ((vm_flags & VM_EXEC) && !PageSwapBacked(page)) return PAGEREF_ACTIVATE; return PAGEREF_KEEP; @@ -2685,7 +2684,10 @@ again: if (!sc->force_deactivate) { unsigned long refaults; - if (inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_ANON); + if (refaults != target_lruvec->refaults[0] || + inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) sc->may_deactivate |= DEACTIVATE_ANON; else sc->may_deactivate &= ~DEACTIVATE_ANON; @@ -2696,8 +2698,8 @@ again: * rid of any stale active pages quickly. */ refaults = lruvec_page_state(target_lruvec, - WORKINGSET_ACTIVATE); - if (refaults != target_lruvec->refaults || + WORKINGSET_ACTIVATE_FILE); + if (refaults != target_lruvec->refaults[1] || inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) sc->may_deactivate |= DEACTIVATE_FILE; else @@ -2796,7 +2798,7 @@ again: set_bit(PGDAT_DIRTY, &pgdat->flags); /* - * If kswapd scans pages marked marked for immediate + * If kswapd scans pages marked for immediate * reclaim and under writeback (nr_immediate), it * implies that pages are cycling through the LRU * faster than they are written so also forcibly stall. 
@@ -2974,8 +2976,10 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) unsigned long refaults; target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); - refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE); - target_lruvec->refaults = refaults; + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); + target_lruvec->refaults[0] = refaults; + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); + target_lruvec->refaults[1] = refaults; } /* @@ -3369,7 +3373,7 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) /* * Check for watermark boosts top-down as the higher zones * are more likely to be boosted. Both watermarks and boosts - * should not be checked at the time time as reclaim would + * should not be checked at the same time as reclaim would * start prematurely when there is no boosting and a lower * zone is balanced. */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 2b866cbab11d..727a26d1ec1d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1096,6 +1096,24 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total); } +/* + * Calculates external fragmentation within a zone wrt the given order. + * It is defined as the percentage of pages found in blocks of size + * less than 1 << order. It returns values in range [0, 100]. 
+ */ +unsigned int extfrag_for_order(struct zone *zone, unsigned int order) +{ + struct contig_page_info info; + + fill_contig_page_info(zone, order, &info); + if (info.free_pages == 0) + return 0; + + return div_u64((info.free_pages - + (info.free_blocks_suitable << order)) * 100, + info.free_pages); +} + /* Same as __fragmentation index but allocs contig_page_info on stack */ int fragmentation_index(struct zone *zone, unsigned int order) { @@ -1167,9 +1185,12 @@ const char * const vmstat_text[] = { "nr_isolated_anon", "nr_isolated_file", "workingset_nodes", - "workingset_refault", - "workingset_activate", - "workingset_restore", + "workingset_refault_anon", + "workingset_refault_file", + "workingset_activate_anon", + "workingset_activate_file", + "workingset_restore_anon", + "workingset_restore_file", "workingset_nodereclaim", "nr_anon_pages", "nr_mapped", @@ -1256,6 +1277,9 @@ const char * const vmstat_text[] = { #ifdef CONFIG_MIGRATION "pgmigrate_success", "pgmigrate_fail", + "thp_migration_success", + "thp_migration_fail", + "thp_migration_split", #endif #ifdef CONFIG_COMPACTION "compact_migrate_scanned", diff --git a/mm/workingset.c b/mm/workingset.c index b199726924dd..8cbe4e3cbe5c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -6,6 +6,7 @@ */ #include <linux/memcontrol.h> +#include <linux/mm_inline.h> #include <linux/writeback.h> #include <linux/shmem_fs.h> #include <linux/pagemap.h> @@ -280,6 +281,7 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) */ void workingset_refault(struct page *page, void *shadow) { + bool file = page_is_file_lru(page); struct mem_cgroup *eviction_memcg; struct lruvec *eviction_lruvec; unsigned long refault_distance; @@ -346,27 +348,34 @@ void workingset_refault(struct page *page, void *shadow) memcg = page_memcg(page); lruvec = mem_cgroup_lruvec(memcg, pgdat); - inc_lruvec_state(lruvec, WORKINGSET_REFAULT); + inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file); /* * Compare the distance 
to the existing workingset size. We * don't activate pages that couldn't stay resident even if - * all the memory was available to the page cache. Whether - * cache can compete with anon or not depends on having swap. + * all the memory was available to the workingset. Whether + * workingset competition needs to consider anon or not depends + * on having swap. */ workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); - if (mem_cgroup_get_nr_swap_pages(memcg) > 0) { + if (!file) { workingset_size += lruvec_page_state(eviction_lruvec, - NR_INACTIVE_ANON); + NR_INACTIVE_FILE); + } + if (mem_cgroup_get_nr_swap_pages(memcg) > 0) { workingset_size += lruvec_page_state(eviction_lruvec, NR_ACTIVE_ANON); + if (file) { + workingset_size += lruvec_page_state(eviction_lruvec, + NR_INACTIVE_ANON); + } } if (refault_distance > workingset_size) goto out; SetPageActive(page); workingset_age_nonresident(lruvec, hpage_nr_pages(page)); - inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file); /* Page was active prior to eviction */ if (workingset) { @@ -375,7 +384,7 @@ void workingset_refault(struct page *page, void *shadow) spin_lock_irq(&page_pgdat(page)->lru_lock); lru_note_cost_page(page); spin_unlock_irq(&page_pgdat(page)->lru_lock); - inc_lruvec_state(lruvec, WORKINGSET_RESTORE); + inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file); } out: rcu_read_unlock(); diff --git a/mm/zpool.c b/mm/zpool.c index 863669212070..3744a2d1a624 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -239,15 +239,15 @@ const char *zpool_get_type(struct zpool *zpool) } /** - * zpool_malloc_support_movable() - Check if the zpool support - * allocate movable memory + * zpool_malloc_support_movable() - Check if the zpool supports + * allocating movable memory * @zpool: The zpool to check * - * This returns if the zpool support allocate movable memory. + * This returns if the zpool supports allocating movable memory. 
* * Implementations must guarantee this to be thread-safe. * - * Returns: true if if the zpool support allocate movable memory, false if not + * Returns: true if the zpool supports allocating movable memory, false if not */ bool zpool_malloc_support_movable(struct zpool *zpool) { diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 952a01e45c6a..c36fdff9a371 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -79,7 +79,7 @@ /* * Object location (<PFN>, <obj_idx>) is encoded as - * as single (unsigned long) handle value. + * a single (unsigned long) handle value. * * Note that object index <obj_idx> starts from 0. * |