diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/hmm.c | 5 | ||||
-rw-r--r-- | mm/kmemleak.c | 3 | ||||
-rw-r--r-- | mm/mempolicy.c | 214 | ||||
-rw-r--r-- | mm/migrate.c | 61 | ||||
-rw-r--r-- | mm/page_alloc.c | 4 | ||||
-rw-r--r-- | mm/vmscan.c | 2 | ||||
-rw-r--r-- | mm/vmstat.c | 48 |
7 files changed, 150 insertions, 187 deletions
@@ -295,10 +295,13 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, goto fault; /* + * Bypass devmap pte such as DAX page when all pfn requested + * flags(pfn_req_flags) are fulfilled. * Since each architecture defines a struct page for the zero page, just * fall through and treat it like a normal page. */ - if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) { + if (pte_special(pte) && !pte_devmap(pte) && + !is_zero_pfn(pte_pfn(pte))) { if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) { pte_unmap(ptep); return -EFAULT; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index b59f1761d817..b57383c17cf6 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -113,7 +113,8 @@ #define BYTES_PER_POINTER sizeof(void *) /* GFP bitmask for kmemleak internal allocations */ -#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ +#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \ + __GFP_NOLOCKDEP)) | \ __GFP_NORETRY | __GFP_NOMEMALLOC | \ __GFP_NOWARN) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5e90b3fb7794..1592b081c58e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1362,16 +1362,33 @@ mpol_out: /* * User space interface with variable sized bitmaps for nodelists. */ +static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, + unsigned long maxnode) +{ + unsigned long nlongs = BITS_TO_LONGS(maxnode); + int ret; + + if (in_compat_syscall()) + ret = compat_get_bitmap(mask, + (const compat_ulong_t __user *)nmask, + maxnode); + else + ret = copy_from_user(mask, nmask, + nlongs * sizeof(unsigned long)); + + if (ret) + return -EFAULT; + + if (maxnode % BITS_PER_LONG) + mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1; + + return 0; +} /* Copy a node mask from user space. */ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { - unsigned long k; - unsigned long t; - unsigned long nlongs; - unsigned long endmask; - --maxnode; nodes_clear(*nodes); if (maxnode == 0 || !nmask) @@ -1379,49 +1396,29 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, if (maxnode > PAGE_SIZE*BITS_PER_BYTE) return -EINVAL; - nlongs = BITS_TO_LONGS(maxnode); - if ((maxnode % BITS_PER_LONG) == 0) - endmask = ~0UL; - else - endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; - /* * When the user specified more nodes than supported just check - * if the non supported part is all zero. - * - * If maxnode have more longs than MAX_NUMNODES, check - * the bits in that area first. And then go through to - * check the rest bits which equal or bigger than MAX_NUMNODES. - * Otherwise, just check bits [MAX_NUMNODES, maxnode). + * if the non supported part is all zero, one word at a time, + * starting at the end. */ - if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { - for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { - if (get_user(t, nmask + k)) - return -EFAULT; - if (k == nlongs - 1) { - if (t & endmask) - return -EINVAL; - } else if (t) - return -EINVAL; - } - nlongs = BITS_TO_LONGS(MAX_NUMNODES); - endmask = ~0UL; - } + while (maxnode > MAX_NUMNODES) { + unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); + unsigned long t; - if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) { - unsigned long valid_mask = endmask; - - valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); - if (get_user(t, nmask + nlongs - 1)) + if (get_bitmap(&t, &nmask[maxnode / BITS_PER_LONG], bits)) return -EFAULT; - if (t & valid_mask) + + if (maxnode - bits >= MAX_NUMNODES) { + maxnode -= bits; + } else { + maxnode = MAX_NUMNODES; + t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); + } + if (t) return -EINVAL; } - if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) - return -EFAULT; - nodes_addr(*nodes)[nlongs-1] &= endmask; - return 0; + return get_bitmap(nodes_addr(*nodes), nmask, maxnode); } /* Copy a kernel node mask to user space */ @@ -1430,6 +1427,10 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, { unsigned long copy = ALIGN(maxnode-1, 64) / 8; unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); + bool compat = in_compat_syscall(); + + if (compat) + nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t); if (copy > nbytes) { if (copy > PAGE_SIZE) @@ -1437,7 +1438,13 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, if (clear_user((char __user *)mask + nbytes, copy - nbytes)) return -EFAULT; copy = nbytes; + maxnode = nr_node_ids; } + + if (compat) + return compat_put_bitmap((compat_ulong_t __user *)mask, + nodes_addr(*nodes), maxnode); + return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; } @@ -1642,116 +1649,6 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); } -#ifdef CONFIG_COMPAT - -COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, - compat_ulong_t __user *, nmask, - compat_ulong_t, maxnode, - compat_ulong_t, addr, compat_ulong_t, flags) -{ - long err; - unsigned long __user *nm = NULL; - unsigned long nr_bits, alloc_size; - DECLARE_BITMAP(bm, MAX_NUMNODES); - - nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids); - alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - - if (nmask) - nm = compat_alloc_user_space(alloc_size); - - err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags); - - if (!err && nmask) { - unsigned long copy_size; - copy_size = min_t(unsigned long, sizeof(bm), alloc_size); - err = copy_from_user(bm, nm, copy_size); - /* ensure entire bitmap is zeroed */ - err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); - err |= compat_put_bitmap(nmask, bm, nr_bits); - } - - return err; -} - -COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, - compat_ulong_t, maxnode) -{ - unsigned long __user *nm = NULL; - unsigned long nr_bits, alloc_size; - DECLARE_BITMAP(bm, MAX_NUMNODES); - - nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); - alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - - if (nmask) { - if (compat_get_bitmap(bm, nmask, nr_bits)) - return -EFAULT; - nm = compat_alloc_user_space(alloc_size); - if (copy_to_user(nm, bm, alloc_size)) - return -EFAULT; - } - - return kernel_set_mempolicy(mode, nm, nr_bits+1); -} - -COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, - compat_ulong_t, mode, compat_ulong_t __user *, nmask, - compat_ulong_t, maxnode, compat_ulong_t, flags) -{ - unsigned long __user *nm = NULL; - unsigned long nr_bits, alloc_size; - nodemask_t bm; - - nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); - alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - - if (nmask) { - if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits)) - return -EFAULT; - nm = compat_alloc_user_space(alloc_size); - if (copy_to_user(nm, nodes_addr(bm), alloc_size)) - return -EFAULT; - } - - return kernel_mbind(start, len, mode, nm, nr_bits+1, flags); -} - -COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, - compat_ulong_t, maxnode, - const compat_ulong_t __user *, old_nodes, - const compat_ulong_t __user *, new_nodes) -{ - unsigned long __user *old = NULL; - unsigned long __user *new = NULL; - nodemask_t tmp_mask; - unsigned long nr_bits; - unsigned long size; - - nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); - size = ALIGN(nr_bits, BITS_PER_LONG) / 8; - if (old_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) - return -EFAULT; - old = compat_alloc_user_space(new_nodes ? size * 2 : size); - if (new_nodes) - new = old + size / sizeof(unsigned long); - if (copy_to_user(old, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - if (new_nodes) { - if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) - return -EFAULT; - if (new == NULL) - new = compat_alloc_user_space(size); - if (copy_to_user(new, nodes_addr(tmp_mask), size)) - return -EFAULT; - } - return kernel_migrate_pages(pid, nr_bits + 1, old, new); -} - -#endif /* CONFIG_COMPAT */ - bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) @@ -1979,17 +1876,26 @@ unsigned int mempolicy_slab_node(void) */ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) { - unsigned nnodes = nodes_weight(pol->nodes); - unsigned target; + nodemask_t nodemask = pol->nodes; + unsigned int target, nnodes; int i; int nid; + /* + * The barrier will stabilize the nodemask in a register or on + * the stack so that it will stop changing under the code. + * + * Between first_node() and next_node(), pol->nodes could be changed + * by other threads. So we put pol->nodes in a local stack. + */ + barrier(); + nnodes = nodes_weight(nodemask); if (!nnodes) return numa_node_id(); target = (unsigned int)n % nnodes; - nid = first_node(pol->nodes); + nid = first_node(nodemask); for (i = 0; i < target; i++) - nid = next_node(nid, pol->nodes); + nid = next_node(nid, nodemask); return nid; } diff --git a/mm/migrate.c b/mm/migrate.c index a0aeb3fe46a7..a6a7743ee98f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -960,7 +960,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, int force, enum migrate_mode mode) { int rc = -EAGAIN; - int page_was_mapped = 0; + bool page_was_mapped = false; struct anon_vma *anon_vma = NULL; bool is_lru = !__PageMovable(page); @@ -1008,7 +1008,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } /* - * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, + * By try_to_migrate(), page->mapcount goes down to 0 here. In this case, * we cannot notice that anon_vma is freed while we migrates a page. * This get_anon_vma() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() @@ -1063,7 +1063,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, page); try_to_migrate(page, 0); - page_was_mapped = 1; + page_was_mapped = true; } if (!page_mapped(page)) @@ -1900,6 +1900,23 @@ set_status: mmap_read_unlock(mm); } +static int get_compat_pages_array(const void __user *chunk_pages[], + const void __user * __user *pages, + unsigned long chunk_nr) +{ + compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages; + compat_uptr_t p; + int i; + + for (i = 0; i < chunk_nr; i++) { + if (get_user(p, pages32 + i)) + return -EFAULT; + chunk_pages[i] = compat_ptr(p); + } + + return 0; +} + /* * Determine the nodes of a user array of pages and store it in * a user array of status. @@ -1919,8 +1936,15 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, if (chunk_nr > DO_PAGES_STAT_CHUNK_NR) chunk_nr = DO_PAGES_STAT_CHUNK_NR; - if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages))) - break; + if (in_compat_syscall()) { + if (get_compat_pages_array(chunk_pages, pages, + chunk_nr)) + break; + } else { + if (copy_from_user(chunk_pages, pages, + chunk_nr * sizeof(*chunk_pages))) + break; + } do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); @@ -2023,28 +2047,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); } -#ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, - compat_uptr_t __user *, pages32, - const int __user *, nodes, - int __user *, status, - int, flags) -{ - const void __user * __user *pages; - int i; - - pages = compat_alloc_user_space(nr_pages * sizeof(void *)); - for (i = 0; i < nr_pages; i++) { - compat_uptr_t p; - - if (get_user(p, pages32 + i) || - put_user(compat_ptr(p), pages + i)) - return -EFAULT; - } - return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); -} -#endif /* CONFIG_COMPAT */ - #ifdef CONFIG_NUMA_BALANCING /* * Returns true if this is a safe migration target node for misplaced NUMA @@ -2107,6 +2109,7 @@ out: static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) { int page_lru; + int nr_pages = thp_nr_pages(page); VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); @@ -2115,7 +2118,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) return 0; /* Avoid migrating to a node that is nearly full */ - if (!migrate_balanced_pgdat(pgdat, compound_nr(page))) + if (!migrate_balanced_pgdat(pgdat, nr_pages)) return 0; if (isolate_lru_page(page)) @@ -2123,7 +2126,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) page_lru = page_is_file_lru(page); mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, - thp_nr_pages(page)); + nr_pages); /* * Isolating the page has taken another reference, so the diff --git a/mm/page_alloc.c b/mm/page_alloc.c index de309a1dfe65..b37435c274cf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3428,8 +3428,10 @@ void free_unref_page_list(struct list_head *list) /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { pfn = page_to_pfn(page); - if (!free_unref_page_prepare(page, pfn, 0)) + if (!free_unref_page_prepare(page, pfn, 0)) { list_del(&page->lru); + continue; + } /* * Free isolated pages directly to the allocator, see diff --git a/mm/vmscan.c b/mm/vmscan.c index 740d03e6dae2..74296c2d1fed 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2715,7 +2715,7 @@ out: cgroup_size = max(cgroup_size, protection); scan = lruvec_size - lruvec_size * protection / - cgroup_size; + (cgroup_size + 1); /* * Minimally target SWAP_CLUSTER_MAX pages to keep diff --git a/mm/vmstat.c b/mm/vmstat.c index 0885a34197b7..8ce2620344b2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -319,6 +319,16 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long x; long t; + /* + * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels, + * atomicity is provided by IRQs being disabled -- either explicitly + * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables + * CPU migrations and preemption potentially corrupts a counter so + * disable preemption. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + x = delta + __this_cpu_read(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -328,6 +338,9 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, x = 0; } __this_cpu_write(*p, x); + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } EXPORT_SYMBOL(__mod_zone_page_state); @@ -350,6 +363,10 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, delta >>= PAGE_SHIFT; } + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + x = delta + __this_cpu_read(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -359,6 +376,9 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, x = 0; } __this_cpu_write(*p, x); + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } EXPORT_SYMBOL(__mod_node_page_state); @@ -391,6 +411,10 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v > t)) { @@ -399,6 +423,9 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(v + overstep, zone, item); __this_cpu_write(*p, -overstep); } + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) @@ -409,6 +436,10 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v > t)) { @@ -417,6 +448,9 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) node_page_state_add(v + overstep, pgdat, item); __this_cpu_write(*p, -overstep); } + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) @@ -437,6 +471,10 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) s8 __percpu *p = pcp->vm_stat_diff + item; s8 v, t; + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v < - t)) { @@ -445,6 +483,9 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) zone_page_state_add(v - overstep, zone, item); __this_cpu_write(*p, overstep); } + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) @@ -455,6 +496,10 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + /* See __mod_node_page_state */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v < - t)) { @@ -463,6 +508,9 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) node_page_state_add(v - overstep, pgdat, item); __this_cpu_write(*p, overstep); } + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) |