aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds2021-03-14 12:23:34 -0700
committerLinus Torvalds2021-03-14 12:23:34 -0700
commit50eb842fe517b2765b7748c3016082b484a6dbb8 (patch)
tree043787bcebc48b58698469d5699f9e034b92a391 /mm
parent88fe49249c99de14e543c632a46248d85411ab9e (diff)
parent2766f1821600cc7562bae2128ad0b163f744c5d9 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton: "28 patches. Subsystems affected by this series: mm (memblock, pagealloc, hugetlb, highmem, kfence, oom-kill, madvise, kasan, userfaultfd, memcg, and zram), core-kernel, kconfig, fork, binfmt, MAINTAINERS, kbuild, and ia64" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (28 commits) zram: fix broken page writeback zram: fix return value on writeback_store mm/memcg: set memcg when splitting page mm/memcg: rename mem_cgroup_split_huge_fixup to split_page_memcg and add nr_pages argument ia64: fix ptrace(PTRACE_SYSCALL_INFO_EXIT) sign ia64: fix ia64_syscall_get_set_arguments() for break-based syscalls mm/userfaultfd: fix memory corruption due to writeprotect kasan: fix KASAN_STACK dependency for HW_TAGS kasan, mm: fix crash with HW_TAGS and DEBUG_PAGEALLOC mm/madvise: replace ptrace attach requirement for process_madvise include/linux/sched/mm.h: use rcu_dereference in in_vfork() kfence: fix reports if constant function prefixes exist kfence, slab: fix cache_alloc_debugcheck_after() for bulk allocations kfence: fix printk format for ptrdiff_t linux/compiler-clang.h: define HAVE_BUILTIN_BSWAP* MAINTAINERS: exclude uapi directories in API/ABI section binfmt_misc: fix possible deadlock in bm_register_write mm/highmem.c: fix zero_user_segments() with start > end hugetlb: do early cow when page pinned on src mm mm: use is_cow_mapping() across tree where proper ...
Diffstat (limited to 'mm')
-rw-r--r--mm/highmem.c17
-rw-r--r--mm/huge_memory.c10
-rw-r--r--mm/hugetlb.c123
-rw-r--r--mm/internal.h5
-rw-r--r--mm/kfence/report.c30
-rw-r--r--mm/madvise.c13
-rw-r--r--mm/memcontrol.c15
-rw-r--r--mm/memory.c16
-rw-r--r--mm/page_alloc.c167
-rw-r--r--mm/slab.c2
10 files changed, 234 insertions, 164 deletions
diff --git a/mm/highmem.c b/mm/highmem.c
index 874b732b120c..86f2b9495f9c 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -368,20 +368,24 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
BUG_ON(end1 > page_size(page) || end2 > page_size(page));
+ if (start1 >= end1)
+ start1 = end1 = 0;
+ if (start2 >= end2)
+ start2 = end2 = 0;
+
for (i = 0; i < compound_nr(page); i++) {
void *kaddr = NULL;
- if (start1 < PAGE_SIZE || start2 < PAGE_SIZE)
- kaddr = kmap_atomic(page + i);
-
if (start1 >= PAGE_SIZE) {
start1 -= PAGE_SIZE;
end1 -= PAGE_SIZE;
} else {
unsigned this_end = min_t(unsigned, end1, PAGE_SIZE);
- if (end1 > start1)
+ if (end1 > start1) {
+ kaddr = kmap_atomic(page + i);
memset(kaddr + start1, 0, this_end - start1);
+ }
end1 -= this_end;
start1 = 0;
}
@@ -392,8 +396,11 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
} else {
unsigned this_end = min_t(unsigned, end2, PAGE_SIZE);
- if (end2 > start2)
+ if (end2 > start2) {
+ if (!kaddr)
+ kaddr = kmap_atomic(page + i);
memset(kaddr + start2, 0, this_end - start2);
+ }
end2 -= this_end;
start2 = 0;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 395c75111d33..ae907a9c2050 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1100,9 +1100,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* best effort that the pinned pages won't be replaced by another
* random page during the coming copy-on-write.
*/
- if (unlikely(is_cow_mapping(vma->vm_flags) &&
- atomic_read(&src_mm->has_pinned) &&
- page_maybe_dma_pinned(src_page))) {
+ if (unlikely(page_needs_cow_for_dma(vma, src_page))) {
pte_free(dst_mm, pgtable);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
@@ -1214,9 +1212,7 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
/* Please refer to comments in copy_huge_pmd() */
- if (unlikely(is_cow_mapping(vma->vm_flags) &&
- atomic_read(&src_mm->has_pinned) &&
- page_maybe_dma_pinned(pud_page(pud)))) {
+ if (unlikely(page_needs_cow_for_dma(vma, pud_page(pud)))) {
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
__split_huge_pud(vma, src_pud, addr);
@@ -2471,7 +2467,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
int i;
/* complete memcg works before add pages to LRU */
- mem_cgroup_split_huge_fixup(head);
+ split_page_memcg(head, nr);
if (PageAnon(head) && PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8fb42c6dd74b..5b1ab1f427c5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -331,6 +331,24 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
}
}
+static inline long
+hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
+ long to, struct hstate *h, struct hugetlb_cgroup *cg,
+ long *regions_needed)
+{
+ struct file_region *nrg;
+
+ if (!regions_needed) {
+ nrg = get_file_region_entry_from_cache(map, from, to);
+ record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
+ list_add(&nrg->link, rg->link.prev);
+ coalesce_file_region(map, nrg);
+ } else
+ *regions_needed += 1;
+
+ return to - from;
+}
+
/*
* Must be called with resv->lock held.
*
@@ -346,7 +364,7 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
long add = 0;
struct list_head *head = &resv->regions;
long last_accounted_offset = f;
- struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
+ struct file_region *rg = NULL, *trg = NULL;
if (regions_needed)
*regions_needed = 0;
@@ -369,24 +387,17 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
/* When we find a region that starts beyond our range, we've
* finished.
*/
- if (rg->from > t)
+ if (rg->from >= t)
break;
/* Add an entry for last_accounted_offset -> rg->from, and
* update last_accounted_offset.
*/
- if (rg->from > last_accounted_offset) {
- add += rg->from - last_accounted_offset;
- if (!regions_needed) {
- nrg = get_file_region_entry_from_cache(
- resv, last_accounted_offset, rg->from);
- record_hugetlb_cgroup_uncharge_info(h_cg, h,
- resv, nrg);
- list_add(&nrg->link, rg->link.prev);
- coalesce_file_region(resv, nrg);
- } else
- *regions_needed += 1;
- }
+ if (rg->from > last_accounted_offset)
+ add += hugetlb_resv_map_add(resv, rg,
+ last_accounted_offset,
+ rg->from, h, h_cg,
+ regions_needed);
last_accounted_offset = rg->to;
}
@@ -394,17 +405,9 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
/* Handle the case where our range extends beyond
* last_accounted_offset.
*/
- if (last_accounted_offset < t) {
- add += t - last_accounted_offset;
- if (!regions_needed) {
- nrg = get_file_region_entry_from_cache(
- resv, last_accounted_offset, t);
- record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
- list_add(&nrg->link, rg->link.prev);
- coalesce_file_region(resv, nrg);
- } else
- *regions_needed += 1;
- }
+ if (last_accounted_offset < t)
+ add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
+ t, h, h_cg, regions_needed);
VM_BUG_ON(add < 0);
return add;
@@ -3725,21 +3728,32 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
return false;
}
+static void
+hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
+ struct page *new_page)
+{
+ __SetPageUptodate(new_page);
+ set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
+ hugepage_add_new_anon_rmap(new_page, vma, addr);
+ hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
+ ClearHPageRestoreReserve(new_page);
+ SetHPageMigratable(new_page);
+}
+
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
{
pte_t *src_pte, *dst_pte, entry, dst_entry;
struct page *ptepage;
unsigned long addr;
- int cow;
+ bool cow = is_cow_mapping(vma->vm_flags);
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
+ unsigned long npages = pages_per_huge_page(h);
struct address_space *mapping = vma->vm_file->f_mapping;
struct mmu_notifier_range range;
int ret = 0;
- cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
-
if (cow) {
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
vma->vm_start,
@@ -3784,6 +3798,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
dst_entry = huge_ptep_get(dst_pte);
+again:
if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
/*
* Skip if src entry none. Also, skip in the
@@ -3807,6 +3822,52 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
}
set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
} else {
+ entry = huge_ptep_get(src_pte);
+ ptepage = pte_page(entry);
+ get_page(ptepage);
+
+ /*
+ * This is a rare case where we see pinned hugetlb
+ * pages while they're prone to COW. We need to do the
+ * COW earlier during fork.
+ *
+ * When pre-allocating the page or copying data, we
+ * need to be without the pgtable locks since we could
+ * sleep during the process.
+ */
+ if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
+ pte_t src_pte_old = entry;
+ struct page *new;
+
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ /* Do not use reserve as it's private owned */
+ new = alloc_huge_page(vma, addr, 1);
+ if (IS_ERR(new)) {
+ put_page(ptepage);
+ ret = PTR_ERR(new);
+ break;
+ }
+ copy_user_huge_page(new, ptepage, addr, vma,
+ npages);
+ put_page(ptepage);
+
+ /* Install the new huge page if src pte stable */
+ dst_ptl = huge_pte_lock(h, dst, dst_pte);
+ src_ptl = huge_pte_lockptr(h, src, src_pte);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+ entry = huge_ptep_get(src_pte);
+ if (!pte_same(src_pte_old, entry)) {
+ put_page(new);
+ /* dst_entry won't change as in child */
+ goto again;
+ }
+ hugetlb_install_page(vma, dst_pte, addr, new);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ continue;
+ }
+
if (cow) {
/*
* No need to notify as we are downgrading page
@@ -3817,12 +3878,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
*/
huge_ptep_set_wrprotect(src, addr, src_pte);
}
- entry = huge_ptep_get(src_pte);
- ptepage = pte_page(entry);
- get_page(ptepage);
+
page_dup_rmap(ptepage, true);
set_huge_pte_at(dst, addr, dst_pte, entry);
- hugetlb_count_add(pages_per_huge_page(h), dst);
+ hugetlb_count_add(npages, dst);
}
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
diff --git a/mm/internal.h b/mm/internal.h
index 9902648f2206..1432feec62df 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -296,11 +296,6 @@ static inline unsigned int buddy_order(struct page *page)
*/
#define buddy_order_unsafe(page) READ_ONCE(page_private(page))
-static inline bool is_cow_mapping(vm_flags_t flags)
-{
- return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
-}
-
/*
* These three helpers classifies VMAs for virtual memory accounting.
*/
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index ab83d5a59bb1..e3f71451ad9e 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -20,6 +20,11 @@
#include "kfence.h"
+/* May be overridden by <asm/kfence.h>. */
+#ifndef ARCH_FUNC_PREFIX
+#define ARCH_FUNC_PREFIX ""
+#endif
+
extern bool no_hash_pointers;
/* Helper function to either print to a seq_file or to console. */
@@ -67,8 +72,9 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
for (skipnr = 0; skipnr < num_entries; skipnr++) {
int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]);
- if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") ||
- !strncmp(buf, "__slab_free", len)) {
+ if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfence_") ||
+ str_has_prefix(buf, ARCH_FUNC_PREFIX "__kfence_") ||
+ !strncmp(buf, ARCH_FUNC_PREFIX "__slab_free", len)) {
/*
* In case of tail calls from any of the below
* to any of the above.
@@ -77,10 +83,10 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
}
/* Also the *_bulk() variants by only checking prefixes. */
- if (str_has_prefix(buf, "kfree") ||
- str_has_prefix(buf, "kmem_cache_free") ||
- str_has_prefix(buf, "__kmalloc") ||
- str_has_prefix(buf, "kmem_cache_alloc"))
+ if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") ||
+ str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") ||
+ str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") ||
+ str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc"))
goto found;
}
if (fallback < num_entries)
@@ -116,12 +122,12 @@ void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *met
lockdep_assert_held(&meta->lock);
if (meta->state == KFENCE_OBJECT_UNUSED) {
- seq_con_printf(seq, "kfence-#%zd unused\n", meta - kfence_metadata);
+ seq_con_printf(seq, "kfence-#%td unused\n", meta - kfence_metadata);
return;
}
seq_con_printf(seq,
- "kfence-#%zd [0x%p-0x%p"
+ "kfence-#%td [0x%p-0x%p"
", size=%d, cache=%s] allocated by task %d:\n",
meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size,
(cache && cache->name) ? cache->name : "<destroyed>", meta->alloc_track.pid);
@@ -204,7 +210,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r
pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write),
(void *)stack_entries[skipnr]);
- pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%zd):\n",
+ pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%td):\n",
get_access_type(is_write), (void *)address,
left_of_object ? meta->addr - address : address - meta->addr,
left_of_object ? "left" : "right", object_index);
@@ -213,14 +219,14 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r
case KFENCE_ERROR_UAF:
pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write),
(void *)stack_entries[skipnr]);
- pr_err("Use-after-free %s at 0x%p (in kfence-#%zd):\n",
+ pr_err("Use-after-free %s at 0x%p (in kfence-#%td):\n",
get_access_type(is_write), (void *)address, object_index);
break;
case KFENCE_ERROR_CORRUPTION:
pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]);
pr_err("Corrupted memory at 0x%p ", (void *)address);
print_diff_canary(address, 16, meta);
- pr_cont(" (in kfence-#%zd):\n", object_index);
+ pr_cont(" (in kfence-#%td):\n", object_index);
break;
case KFENCE_ERROR_INVALID:
pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write),
@@ -230,7 +236,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r
break;
case KFENCE_ERROR_INVALID_FREE:
pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]);
- pr_err("Invalid free of 0x%p (in kfence-#%zd):\n", (void *)address,
+ pr_err("Invalid free of 0x%p (in kfence-#%td):\n", (void *)address,
object_index);
break;
}
diff --git a/mm/madvise.c b/mm/madvise.c
index df692d2e35d4..01fef79ac761 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1198,12 +1198,22 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
goto release_task;
}
- mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
+ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
+ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
if (IS_ERR_OR_NULL(mm)) {
ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
goto release_task;
}
+ /*
+ * Require CAP_SYS_NICE for influencing process performance. Note that
+ * only non-destructive hints are currently supported.
+ */
+ if (!capable(CAP_SYS_NICE)) {
+ ret = -EPERM;
+ goto release_mm;
+ }
+
total_len = iov_iter_count(&iter);
while (iov_iter_count(&iter)) {
@@ -1218,6 +1228,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
if (ret == 0)
ret = total_len - iov_iter_count(&iter);
+release_mm:
mmput(mm);
release_task:
put_task_struct(task);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 845eec01ef9d..e064ac0d850a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3287,24 +3287,21 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
#endif /* CONFIG_MEMCG_KMEM */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
- * Because page_memcg(head) is not set on compound tails, set it now.
+ * Because page_memcg(head) is not set on tails, set it now.
*/
-void mem_cgroup_split_huge_fixup(struct page *head)
+void split_page_memcg(struct page *head, unsigned int nr)
{
struct mem_cgroup *memcg = page_memcg(head);
int i;
- if (mem_cgroup_disabled())
+ if (mem_cgroup_disabled() || !memcg)
return;
- for (i = 1; i < HPAGE_PMD_NR; i++) {
- css_get(&memcg->css);
- head[i].memcg_data = (unsigned long)memcg;
- }
+ for (i = 1; i < nr; i++)
+ head[i].memcg_data = head->memcg_data;
+ css_get_many(&memcg->css, nr - 1);
}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifdef CONFIG_MEMCG_SWAP
/**
diff --git a/mm/memory.c b/mm/memory.c
index c8e357627318..5efa07fb6cdc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -809,12 +809,8 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
struct page **prealloc, pte_t pte, struct page *page)
{
- struct mm_struct *src_mm = src_vma->vm_mm;
struct page *new_page;
- if (!is_cow_mapping(src_vma->vm_flags))
- return 1;
-
/*
* What we want to do is to check whether this page may
* have been pinned by the parent process. If so,
@@ -828,9 +824,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
* the page count. That might give false positives for
* for pinning, but it will work correctly.
*/
- if (likely(!atomic_read(&src_mm->has_pinned)))
- return 1;
- if (likely(!page_maybe_dma_pinned(page)))
+ if (likely(!page_needs_cow_for_dma(src_vma, page)))
return 1;
new_page = *prealloc;
@@ -3103,6 +3097,14 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
return handle_userfault(vmf, VM_UFFD_WP);
}
+ /*
+ * Userfaultfd write-protect can defer flushes. Ensure the TLB
+ * is flushed in this case before copying.
+ */
+ if (unlikely(userfaultfd_wp(vmf->vma) &&
+ mm_tlb_flush_pending(vmf->vma->vm_mm)))
+ flush_tlb_page(vmf->vma, vmf->address);
+
vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
if (!vmf->page) {
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3e4b29ee2b1e..cfc72873961d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1282,6 +1282,12 @@ static __always_inline bool free_pages_prepare(struct page *page,
kernel_poison_pages(page, 1 << order);
/*
+ * With hardware tag-based KASAN, memory tags must be set before the
+ * page becomes unavailable via debug_pagealloc or arch_free_page.
+ */
+ kasan_free_nondeferred_pages(page, order);
+
+ /*
* arch_free_page() can make the page's contents inaccessible. s390
* does this. So nothing which can access the page's contents should
* happen after this.
@@ -1290,8 +1296,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
debug_pagealloc_unmap_pages(page, 1 << order);
- kasan_free_nondeferred_pages(page, order);
-
return true;
}
@@ -3310,6 +3314,7 @@ void split_page(struct page *page, unsigned int order)
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
split_page_owner(page, 1 << order);
+ split_page_memcg(page, 1 << order);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -6259,12 +6264,65 @@ static void __meminit zone_init_free_lists(struct zone *zone)
}
}
+#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
+/*
+ * Only struct pages that correspond to ranges defined by memblock.memory
+ * are zeroed and initialized by going through __init_single_page() during
+ * memmap_init_zone().
+ *
+ * But, there could be struct pages that correspond to holes in
+ * memblock.memory. This can happen because of the following reasons:
+ * - physical memory bank size is not necessarily the exact multiple of the
+ * arbitrary section size
+ * - early reserved memory may not be listed in memblock.memory
+ * - memory layouts defined with memmap= kernel parameter may not align
+ * nicely with memmap sections
+ *
+ * Explicitly initialize those struct pages so that:
+ * - PG_Reserved is set
+ * - zone and node links point to zone and node that span the page if the
+ * hole is in the middle of a zone
+ * - zone and node links point to adjacent zone/node if the hole falls on
+ * the zone boundary; the pages in such holes will be prepended to the
+ * zone/node above the hole except for the trailing pages in the last
+ * section that will be appended to the zone/node below.
+ */
+static u64 __meminit init_unavailable_range(unsigned long spfn,
+ unsigned long epfn,
+ int zone, int node)
+{
+ unsigned long pfn;
+ u64 pgcnt = 0;
+
+ for (pfn = spfn; pfn < epfn; pfn++) {
+ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
+ pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
+ + pageblock_nr_pages - 1;
+ continue;
+ }
+ __init_single_page(pfn_to_page(pfn), pfn, zone, node);
+ __SetPageReserved(pfn_to_page(pfn));
+ pgcnt++;
+ }
+
+ return pgcnt;
+}
+#else
+static inline u64 init_unavailable_range(unsigned long spfn, unsigned long epfn,
+ int zone, int node)
+{
+ return 0;
+}
+#endif
+
void __meminit __weak memmap_init_zone(struct zone *zone)
{
unsigned long zone_start_pfn = zone->zone_start_pfn;
unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone);
+ static unsigned long hole_pfn;
unsigned long start_pfn, end_pfn;
+ u64 pgcnt = 0;
for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
@@ -6274,7 +6332,29 @@ void __meminit __weak memmap_init_zone(struct zone *zone)
memmap_init_range(end_pfn - start_pfn, nid,
zone_id, start_pfn, zone_end_pfn,
MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+
+ if (hole_pfn < start_pfn)
+ pgcnt += init_unavailable_range(hole_pfn, start_pfn,
+ zone_id, nid);
+ hole_pfn = end_pfn;
}
+
+#ifdef CONFIG_SPARSEMEM
+ /*
+ * Initialize the hole in the range [zone_end_pfn, section_end].
+ * If zone boundary falls in the middle of a section, this hole
+ * will be re-initialized during the call to this function for the
+ * higher zone.
+ */
+ end_pfn = round_up(zone_end_pfn, PAGES_PER_SECTION);
+ if (hole_pfn < end_pfn)
+ pgcnt += init_unavailable_range(hole_pfn, end_pfn,
+ zone_id, nid);
+#endif
+
+ if (pgcnt)
+ pr_info(" %s zone: %llu pages in unavailable ranges\n",
+ zone->name, pgcnt);
}
static int zone_batchsize(struct zone *zone)
@@ -7071,88 +7151,6 @@ void __init free_area_init_memoryless_node(int nid)
free_area_init_node(nid);
}
-#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
-/*
- * Initialize all valid struct pages in the range [spfn, epfn) and mark them
- * PageReserved(). Return the number of struct pages that were initialized.
- */
-static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn)
-{
- unsigned long pfn;
- u64 pgcnt = 0;
-
- for (pfn = spfn; pfn < epfn; pfn++) {
- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
- pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
- + pageblock_nr_pages - 1;
- continue;
- }
- /*
- * Use a fake node/zone (0) for now. Some of these pages
- * (in memblock.reserved but not in memblock.memory) will
- * get re-initialized via reserve_bootmem_region() later.
- */
- __init_single_page(pfn_to_page(pfn), pfn, 0, 0);
- __SetPageReserved(pfn_to_page(pfn));
- pgcnt++;
- }
-
- return pgcnt;
-}
-
-/*
- * Only struct pages that are backed by physical memory are zeroed and
- * initialized by going through __init_single_page(). But, there are some
- * struct pages which are reserved in memblock allocator and their fields
- * may be accessed (for example page_to_pfn() on some configuration accesses
- * flags). We must explicitly initialize those struct pages.
- *
- * This function also addresses a similar issue where struct pages are left
- * uninitialized because the physical address range is not covered by
- * memblock.memory or memblock.reserved. That could happen when memblock
- * layout is manually configured via memmap=, or when the highest physical
- * address (max_pfn) does not end on a section boundary.
- */
-static void __init init_unavailable_mem(void)
-{
- phys_addr_t start, end;
- u64 i, pgcnt;
- phys_addr_t next = 0;
-
- /*
- * Loop through unavailable ranges not covered by memblock.memory.
- */
- pgcnt = 0;
- for_each_mem_range(i, &start, &end) {
- if (next < start)
- pgcnt += init_unavailable_range(PFN_DOWN(next),
- PFN_UP(start));
- next = end;
- }
-
- /*
- * Early sections always have a fully populated memmap for the whole
- * section - see pfn_valid(). If the last section has holes at the
- * end and that section is marked "online", the memmap will be
- * considered initialized. Make sure that memmap has a well defined
- * state.
- */
- pgcnt += init_unavailable_range(PFN_DOWN(next),
- round_up(max_pfn, PAGES_PER_SECTION));
-
- /*
- * Struct pages that do not have backing memory. This could be because
- * firmware is using some of this memory, or for some other reasons.
- */
- if (pgcnt)
- pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
-}
-#else
-static inline void __init init_unavailable_mem(void)
-{
-}
-#endif /* !CONFIG_FLAT_NODE_MEM_MAP */
-
#if MAX_NUMNODES > 1
/*
* Figure out the number of possible node ids.
@@ -7576,7 +7574,6 @@ void __init free_area_init(unsigned long *max_zone_pfn)
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
- init_unavailable_mem();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
free_area_init_node(nid);
diff --git a/mm/slab.c b/mm/slab.c
index 51fd424e0d6d..ae651bf540b7 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2992,7 +2992,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
gfp_t flags, void *objp, unsigned long caller)
{
WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
- if (!objp)
+ if (!objp || is_kfence_address(objp))
return objp;
if (cachep->flags & SLAB_POISON) {
check_poison_obj(cachep, objp);