aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds2022-10-14 12:28:43 -0700
committerLinus Torvalds2022-10-14 12:28:43 -0700
commit5e714bf1713b4b096d20ec75c13880b7086964bd (patch)
tree7742ba7cc03302f59fefe54bc105dc347a57803e /mm
parentf2e44139f3e0edb8be8821fe4dc93afd7b034182 (diff)
parentef6e06b2ef87077104d1145a0fd452ff8dbbc4b7 (diff)
Merge tag 'mm-stable-2022-10-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull more MM updates from Andrew Morton: - fix a race which causes page refcounting errors in ZONE_DEVICE pages (Alistair Popple) - fix userfaultfd test harness instability (Peter Xu) - various other patches in MM, mainly fixes * tag 'mm-stable-2022-10-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (29 commits) highmem: fix kmap_to_page() for kmap_local_page() addresses mm/page_alloc: fix incorrect PGFREE and PGALLOC for high-order page mm/selftest: uffd: explain the write missing fault check mm/hugetlb: use hugetlb_pte_stable in migration race check mm/hugetlb: fix race condition of uffd missing/minor handling zram: always expose rw_page LoongArch: update local TLB if PTE entry exists mm: use update_mmu_tlb() on the second thread kasan: fix array-bounds warnings in tests hmm-tests: add test for migrate_device_range() nouveau/dmem: evict device private memory during release nouveau/dmem: refactor nouveau_dmem_fault_copy_one() mm/migrate_device.c: add migrate_device_range() mm/migrate_device.c: refactor migrate_vma and migrate_deivce_coherent_page() mm/memremap.c: take a pgmap reference on page allocation mm: free device private pages have zero refcount mm/memory.c: fix race when faulting a device private page mm/damon: use damon_sz_region() in appropriate place mm/damon: move sz_damon_region to damon_sz_region lib/test_meminit: add checks for the allocation functions ...
Diffstat (limited to 'mm')
-rw-r--r--mm/compaction.c1
-rw-r--r--mm/damon/core.c26
-rw-r--r--mm/damon/vaddr.c4
-rw-r--r--mm/highmem.c43
-rw-r--r--mm/hugetlb.c72
-rw-r--r--mm/kasan/kasan_test.c9
-rw-r--r--mm/memory.c20
-rw-r--r--mm/memremap.c30
-rw-r--r--mm/migrate.c34
-rw-r--r--mm/migrate_device.c239
-rw-r--r--mm/mmap.c28
-rw-r--r--mm/mmu_gather.c10
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/page_alloc.c12
14 files changed, 383 insertions, 147 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 2dd02c4683c4..c51f7f545afe 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1847,7 +1847,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
pfn = cc->zone->zone_start_pfn;
cc->fast_search_fail = 0;
found_block = true;
- set_pageblock_skip(freepage);
break;
}
}
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 8e1ab38d0f1f..36d098d06c55 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -491,7 +491,7 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t)
- sz += r->ar.end - r->ar.start;
+ sz += damon_sz_region(r);
}
if (ctx->attrs.min_nr_regions)
@@ -674,7 +674,7 @@ static bool __damos_valid_target(struct damon_region *r, struct damos *s)
{
unsigned long sz;
- sz = r->ar.end - r->ar.start;
+ sz = damon_sz_region(r);
return s->pattern.min_sz_region <= sz &&
sz <= s->pattern.max_sz_region &&
s->pattern.min_nr_accesses <= r->nr_accesses &&
@@ -702,7 +702,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
damon_for_each_scheme(s, c) {
struct damos_quota *quota = &s->quota;
- unsigned long sz = r->ar.end - r->ar.start;
+ unsigned long sz = damon_sz_region(r);
struct timespec64 begin, end;
unsigned long sz_applied = 0;
@@ -731,14 +731,14 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
sz = ALIGN_DOWN(quota->charge_addr_from -
r->ar.start, DAMON_MIN_REGION);
if (!sz) {
- if (r->ar.end - r->ar.start <=
- DAMON_MIN_REGION)
+ if (damon_sz_region(r) <=
+ DAMON_MIN_REGION)
continue;
sz = DAMON_MIN_REGION;
}
damon_split_region_at(t, r, sz);
r = damon_next_region(r);
- sz = r->ar.end - r->ar.start;
+ sz = damon_sz_region(r);
}
quota->charge_target_from = NULL;
quota->charge_addr_from = 0;
@@ -843,8 +843,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
continue;
score = c->ops.get_scheme_score(
c, t, r, s);
- quota->histogram[score] +=
- r->ar.end - r->ar.start;
+ quota->histogram[score] += damon_sz_region(r);
if (score > max_score)
max_score = score;
}
@@ -865,18 +864,13 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
}
}
-static inline unsigned long sz_damon_region(struct damon_region *r)
-{
- return r->ar.end - r->ar.start;
-}
-
/*
* Merge two adjacent regions into one region
*/
static void damon_merge_two_regions(struct damon_target *t,
struct damon_region *l, struct damon_region *r)
{
- unsigned long sz_l = sz_damon_region(l), sz_r = sz_damon_region(r);
+ unsigned long sz_l = damon_sz_region(l), sz_r = damon_sz_region(r);
l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
(sz_l + sz_r);
@@ -905,7 +899,7 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
if (prev && prev->ar.end == r->ar.start &&
abs(prev->nr_accesses - r->nr_accesses) <= thres &&
- sz_damon_region(prev) + sz_damon_region(r) <= sz_limit)
+ damon_sz_region(prev) + damon_sz_region(r) <= sz_limit)
damon_merge_two_regions(t, prev, r);
else
prev = r;
@@ -963,7 +957,7 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs)
int i;
damon_for_each_region_safe(r, next, t) {
- sz_region = r->ar.end - r->ar.start;
+ sz_region = damon_sz_region(r);
for (i = 0; i < nr_subs - 1 &&
sz_region > 2 * DAMON_MIN_REGION; i++) {
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index ea94e0b2c311..15f03df66db6 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -72,7 +72,7 @@ static int damon_va_evenly_split_region(struct damon_target *t,
return -EINVAL;
orig_end = r->ar.end;
- sz_orig = r->ar.end - r->ar.start;
+ sz_orig = damon_sz_region(r);
sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION);
if (!sz_piece)
@@ -618,7 +618,7 @@ static unsigned long damos_madvise(struct damon_target *target,
{
struct mm_struct *mm;
unsigned long start = PAGE_ALIGN(r->ar.start);
- unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start);
+ unsigned long len = PAGE_ALIGN(damon_sz_region(r));
unsigned long applied;
mm = damon_get_mm(target);
diff --git a/mm/highmem.c b/mm/highmem.c
index c707d7202d5f..db251e77f98f 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -30,6 +30,17 @@
#include <asm/tlbflush.h>
#include <linux/vmalloc.h>
+#ifdef CONFIG_KMAP_LOCAL
+static inline int kmap_local_calc_idx(int idx)
+{
+ return idx + KM_MAX_IDX * smp_processor_id();
+}
+
+#ifndef arch_kmap_local_map_idx
+#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx)
+#endif
+#endif /* CONFIG_KMAP_LOCAL */
+
/*
* Virtual_count is not a pure "count".
* 0 means that it is not mapped, and has not been mapped
@@ -142,12 +153,29 @@ pte_t *pkmap_page_table;
struct page *__kmap_to_page(void *vaddr)
{
+ unsigned long base = (unsigned long) vaddr & PAGE_MASK;
+ struct kmap_ctrl *kctrl = &current->kmap_ctrl;
unsigned long addr = (unsigned long)vaddr;
+ int i;
+
+ /* kmap() mappings */
+ if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) &&
+ addr < PKMAP_ADDR(LAST_PKMAP)))
+ return pte_page(pkmap_page_table[PKMAP_NR(addr)]);
- if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
- int i = PKMAP_NR(addr);
+ /* kmap_local_page() mappings */
+ if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) &&
+ base < __fix_to_virt(FIX_KMAP_BEGIN))) {
+ for (i = 0; i < kctrl->idx; i++) {
+ unsigned long base_addr;
+ int idx;
- return pte_page(pkmap_page_table[i]);
+ idx = arch_kmap_local_map_idx(i, pte_pfn(pteval));
+ base_addr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+
+ if (base_addr == base)
+ return pte_page(kctrl->pteval[i]);
+ }
}
return virt_to_page(vaddr);
@@ -462,10 +490,6 @@ static inline void kmap_local_idx_pop(void)
# define arch_kmap_local_post_unmap(vaddr) do { } while (0)
#endif
-#ifndef arch_kmap_local_map_idx
-#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx)
-#endif
-
#ifndef arch_kmap_local_unmap_idx
#define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx)
#endif
@@ -494,11 +518,6 @@ static inline bool kmap_high_unmap_local(unsigned long vaddr)
return false;
}
-static inline int kmap_local_calc_idx(int idx)
-{
- return idx + KM_MAX_IDX * smp_processor_id();
-}
-
static pte_t *__kmap_pte;
static pte_t *kmap_get_pte(unsigned long vaddr, int idx)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 57b7b0b5d9eb..b586cdd75930 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5096,6 +5096,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
* unmapped and its refcount is dropped, so just clear pte here.
*/
if (unlikely(!pte_present(pte))) {
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
/*
* If the pte was wr-protected by uffd-wp in any of the
* swap forms, meanwhile the caller does not want to
@@ -5107,6 +5108,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
set_huge_pte_at(mm, address, ptep,
make_pte_marker(PTE_MARKER_UFFD_WP));
else
+#endif
huge_pte_clear(mm, address, ptep, sz);
spin_unlock(ptl);
continue;
@@ -5135,11 +5137,13 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
/* Leave a uffd-wp pte marker if needed */
if (huge_pte_uffd_wp(pte) &&
!(zap_flags & ZAP_FLAG_DROP_MARKER))
set_huge_pte_at(mm, address, ptep,
make_pte_marker(PTE_MARKER_UFFD_WP));
+#endif
hugetlb_count_sub(pages_per_huge_page(h), mm);
page_remove_rmap(page, vma, true);
@@ -5531,6 +5535,23 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
return handle_userfault(&vmf, reason);
}
+/*
+ * Recheck pte with pgtable lock. Returns true if pte didn't change, or
+ * false if pte changed or is changing.
+ */
+static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm,
+ pte_t *ptep, pte_t old_pte)
+{
+ spinlock_t *ptl;
+ bool same;
+
+ ptl = huge_pte_lock(h, mm, ptep);
+ same = pte_same(huge_ptep_get(ptep), old_pte);
+ spin_unlock(ptl);
+
+ return same;
+}
+
static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping, pgoff_t idx,
@@ -5571,10 +5592,33 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
if (idx >= size)
goto out;
/* Check for page in userfault range */
- if (userfaultfd_missing(vma))
- return hugetlb_handle_userfault(vma, mapping, idx,
- flags, haddr, address,
- VM_UFFD_MISSING);
+ if (userfaultfd_missing(vma)) {
+ /*
+ * Since hugetlb_no_page() was examining pte
+ * without pgtable lock, we need to re-test under
+ * lock because the pte may not be stable and could
+ * have changed from under us. Try to detect
+ * either changed or during-changing ptes and retry
+ * properly when needed.
+ *
+ * Note that userfaultfd is actually fine with
+ * false positives (e.g. caused by pte changed),
+ * but not wrong logical events (e.g. caused by
+ * reading a pte during changing). The latter can
+ * confuse the userspace, so the strictness is very
+ * much preferred. E.g., MISSING event should
+ * never happen on the page after UFFDIO_COPY has
+ * correctly installed the page and returned.
+ */
+ if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+ ret = 0;
+ goto out;
+ }
+
+ return hugetlb_handle_userfault(vma, mapping, idx, flags,
+ haddr, address,
+ VM_UFFD_MISSING);
+ }
page = alloc_huge_page(vma, haddr, 0);
if (IS_ERR(page)) {
@@ -5590,11 +5634,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* here. Before returning error, get ptl and make
* sure there really is no pte entry.
*/
- ptl = huge_pte_lock(h, mm, ptep);
- ret = 0;
- if (huge_pte_none(huge_ptep_get(ptep)))
+ if (hugetlb_pte_stable(h, mm, ptep, old_pte))
ret = vmf_error(PTR_ERR(page));
- spin_unlock(ptl);
+ else
+ ret = 0;
goto out;
}
clear_huge_page(page, address, pages_per_huge_page(h));
@@ -5640,9 +5683,14 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
if (userfaultfd_minor(vma)) {
unlock_page(page);
put_page(page);
- return hugetlb_handle_userfault(vma, mapping, idx,
- flags, haddr, address,
- VM_UFFD_MINOR);
+ /* See comment in userfaultfd_missing() block above */
+ if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+ ret = 0;
+ goto out;
+ }
+ return hugetlb_handle_userfault(vma, mapping, idx, flags,
+ haddr, address,
+ VM_UFFD_MINOR);
}
}
@@ -6804,7 +6852,7 @@ void hugetlb_vma_lock_release(struct kref *kref)
kfree(vma_lock);
}
-void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
+static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
{
struct vm_area_struct *vma = vma_lock->vma;
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
index f25692def781..57e4c72aa8bd 100644
--- a/mm/kasan/kasan_test.c
+++ b/mm/kasan/kasan_test.c
@@ -295,6 +295,9 @@ static void krealloc_more_oob_helper(struct kunit *test,
ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+ /* Suppress -Warray-bounds warnings. */
+ OPTIMIZER_HIDE_VAR(ptr2);
+
/* All offsets up to size2 must be accessible. */
ptr2[size1 - 1] = 'x';
ptr2[size1] = 'x';
@@ -327,6 +330,9 @@ static void krealloc_less_oob_helper(struct kunit *test,
ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+ /* Suppress -Warray-bounds warnings. */
+ OPTIMIZER_HIDE_VAR(ptr2);
+
/* Must be accessible for all modes. */
ptr2[size2 - 1] = 'x';
@@ -540,13 +546,14 @@ static void kmalloc_memmove_invalid_size(struct kunit *test)
{
char *ptr;
size_t size = 64;
- volatile size_t invalid_size = size;
+ size_t invalid_size = size;
ptr = kmalloc(size, GFP_KERNEL);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
memset((char *)ptr, 0, 64);
OPTIMIZER_HIDE_VAR(ptr);
+ OPTIMIZER_HIDE_VAR(invalid_size);
KUNIT_EXPECT_KASAN_FAIL(test,
memmove((char *)ptr, (char *)ptr + 4, invalid_size));
kfree(ptr);
diff --git a/mm/memory.c b/mm/memory.c
index df678fa30cdb..f88c351aecd4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1393,10 +1393,12 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte,
struct zap_details *details, pte_t pteval)
{
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
if (zap_drop_file_uffd_wp(details))
return;
pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+#endif
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -3748,7 +3750,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
ret = remove_device_exclusive_entry(vmf);
} else if (is_device_private_entry(entry)) {
vmf->page = pfn_swap_entry_to_page(entry);
- ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+ spin_unlock(vmf->ptl);
+ goto out;
+ }
+
+ /*
+ * Get a page reference while we know the page can't be
+ * freed.
+ */
+ get_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ vmf->page->pgmap->ops->migrate_to_ram(vmf);
+ put_page(vmf->page);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else if (is_swapin_error_entry(entry)) {
@@ -4118,7 +4134,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
if (!pte_none(*vmf->pte)) {
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
goto release;
}
diff --git a/mm/memremap.c b/mm/memremap.c
index 25029a474d30..421bec3a29ee 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -138,8 +138,11 @@ void memunmap_pages(struct dev_pagemap *pgmap)
int i;
percpu_ref_kill(&pgmap->ref);
- for (i = 0; i < pgmap->nr_range; i++)
- percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
+ if (pgmap->type != MEMORY_DEVICE_PRIVATE &&
+ pgmap->type != MEMORY_DEVICE_COHERENT)
+ for (i = 0; i < pgmap->nr_range; i++)
+ percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
+
wait_for_completion(&pgmap->done);
for (i = 0; i < pgmap->nr_range; i++)
@@ -264,7 +267,9 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
PHYS_PFN(range->start),
PHYS_PFN(range_len(range)), pgmap);
- percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id));
+ if (pgmap->type != MEMORY_DEVICE_PRIVATE &&
+ pgmap->type != MEMORY_DEVICE_COHERENT)
+ percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id));
return 0;
err_add_memory:
@@ -502,11 +507,28 @@ void free_zone_device_page(struct page *page)
page->mapping = NULL;
page->pgmap->ops->page_free(page);
+ if (page->pgmap->type != MEMORY_DEVICE_PRIVATE &&
+ page->pgmap->type != MEMORY_DEVICE_COHERENT)
+ /*
+ * Reset the page count to 1 to prepare for handing out the page
+ * again.
+ */
+ set_page_count(page, 1);
+ else
+ put_dev_pagemap(page->pgmap);
+}
+
+void zone_device_page_init(struct page *page)
+{
/*
- * Reset the page count to 1 to prepare for handing out the page again.
+ * Drivers shouldn't be allocating pages after calling
+ * memunmap_pages().
*/
+ WARN_ON_ONCE(!percpu_ref_tryget_live(&page->pgmap->ref));
set_page_count(page, 1);
+ lock_page(page);
}
+EXPORT_SYMBOL_GPL(zone_device_page_init);
#ifdef CONFIG_FS_DAX
bool __put_devmap_managed_page_refs(struct page *page, int refs)
diff --git a/mm/migrate.c b/mm/migrate.c
index c228afba0963..1379e1912772 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -625,6 +625,25 @@ EXPORT_SYMBOL(folio_migrate_copy);
* Migration functions
***********************************************************/
+int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
+ struct folio *src, enum migrate_mode mode, int extra_count)
+{
+ int rc;
+
+ BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */
+
+ rc = folio_migrate_mapping(mapping, dst, src, extra_count);
+
+ if (rc != MIGRATEPAGE_SUCCESS)
+ return rc;
+
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ folio_migrate_copy(dst, src);
+ else
+ folio_migrate_flags(dst, src);
+ return MIGRATEPAGE_SUCCESS;
+}
+
/**
* migrate_folio() - Simple folio migration.
* @mapping: The address_space containing the folio.
@@ -640,20 +659,7 @@ EXPORT_SYMBOL(folio_migrate_copy);
int migrate_folio(struct address_space *mapping, struct folio *dst,
struct folio *src, enum migrate_mode mode)
{
- int rc;
-
- BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */
-
- rc = folio_migrate_mapping(mapping, dst, src, 0);
-
- if (rc != MIGRATEPAGE_SUCCESS)
- return rc;
-
- if (mode != MIGRATE_SYNC_NO_COPY)
- folio_migrate_copy(dst, src);
- else
- folio_migrate_flags(dst, src);
- return MIGRATEPAGE_SUCCESS;
+ return migrate_folio_extra(mapping, dst, src, mode, 0);
}
EXPORT_SYMBOL(migrate_folio);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 5ab6ab9d2ed8..6fa682eef7a0 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -325,14 +325,14 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
* folio_migrate_mapping(), except that here we allow migration of a
* ZONE_DEVICE page.
*/
-static bool migrate_vma_check_page(struct page *page)
+static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
{
/*
* One extra ref because caller holds an extra reference, either from
* isolate_lru_page() for a regular page, or migrate_vma_collect() for
* a device page.
*/
- int extra = 1;
+ int extra = 1 + (page == fault_page);
/*
* FIXME support THP (transparent huge page), it is bit more complex to
@@ -357,26 +357,20 @@ static bool migrate_vma_check_page(struct page *page)
}
/*
- * migrate_vma_unmap() - replace page mapping with special migration pte entry
- * @migrate: migrate struct containing all migration information
- *
- * Isolate pages from the LRU and replace mappings (CPU page table pte) with a
- * special migration pte entry and check if it has been pinned. Pinned pages are
- * restored because we cannot migrate them.
- *
- * This is the last step before we call the device driver callback to allocate
- * destination memory and copy contents of original page over to new page.
+ * Unmaps pages for migration. Returns number of unmapped pages.
*/
-static void migrate_vma_unmap(struct migrate_vma *migrate)
+static unsigned long migrate_device_unmap(unsigned long *src_pfns,
+ unsigned long npages,
+ struct page *fault_page)
{
- const unsigned long npages = migrate->npages;
unsigned long i, restore = 0;
bool allow_drain = true;
+ unsigned long unmapped = 0;
lru_add_drain();
for (i = 0; i < npages; i++) {
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct folio *folio;
if (!page)
@@ -391,8 +385,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
}
if (isolate_lru_page(page)) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- migrate->cpages--;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
continue;
}
@@ -405,34 +398,55 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
if (folio_mapped(folio))
try_to_migrate(folio, 0);
- if (page_mapped(page) || !migrate_vma_check_page(page)) {
+ if (page_mapped(page) ||
+ !migrate_vma_check_page(page, fault_page)) {
if (!is_zone_device_page(page)) {
get_page(page);
putback_lru_page(page);
}
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- migrate->cpages--;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
continue;
}
+
+ unmapped++;
}
for (i = 0; i < npages && restore; i++) {
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct folio *folio;
- if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ if (!page || (src_pfns[i] & MIGRATE_PFN_MIGRATE))
continue;
folio = page_folio(page);
remove_migration_ptes(folio, folio, false);
- migrate->src[i] = 0;
+ src_pfns[i] = 0;
folio_unlock(folio);
folio_put(folio);
restore--;
}
+
+ return unmapped;
+}
+
+/*
+ * migrate_vma_unmap() - replace page mapping with special migration pte entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * Isolate pages from the LRU and replace mappings (CPU page table pte) with a
+ * special migration pte entry and check if it has been pinned. Pinned pages are
+ * restored because we cannot migrate them.
+ *
+ * This is the last step before we call the device driver callback to allocate
+ * destination memory and copy contents of original page over to new page.
+ */
+static void migrate_vma_unmap(struct migrate_vma *migrate)
+{
+ migrate->cpages = migrate_device_unmap(migrate->src, migrate->npages,
+ migrate->fault_page);
}
/**
@@ -517,6 +531,8 @@ int migrate_vma_setup(struct migrate_vma *args)
return -EINVAL;
if (!args->src || !args->dst)
return -EINVAL;
+ if (args->fault_page && !is_device_private_page(args->fault_page))
+ return -EINVAL;
memset(args->src, 0, sizeof(*args->src) * nr_pages);
args->cpages = 0;
@@ -677,42 +693,38 @@ abort:
*src &= ~MIGRATE_PFN_MIGRATE;
}
-/**
- * migrate_vma_pages() - migrate meta-data from src page to dst page
- * @migrate: migrate struct containing all migration information
- *
- * This migrates struct page meta-data from source struct page to destination
- * struct page. This effectively finishes the migration from source page to the
- * destination page.
- */
-void migrate_vma_pages(struct migrate_vma *migrate)
+static void __migrate_device_pages(unsigned long *src_pfns,
+ unsigned long *dst_pfns, unsigned long npages,
+ struct migrate_vma *migrate)
{
- const unsigned long npages = migrate->npages;
- const unsigned long start = migrate->start;
struct mmu_notifier_range range;
- unsigned long addr, i;
+ unsigned long i;
bool notified = false;
- for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
- struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ for (i = 0; i < npages; i++) {
+ struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct address_space *mapping;
int r;
if (!newpage) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
if (!page) {
+ unsigned long addr;
+
+ if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
/*
* The only time there is no vma is when called from
* migrate_device_coherent_page(). However this isn't
* called if the page could not be unmapped.
*/
- VM_BUG_ON(!migrate->vma);
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
- continue;
+ VM_BUG_ON(!migrate);
+ addr = migrate->start + i*PAGE_SIZE;
if (!notified) {
notified = true;
@@ -723,7 +735,7 @@ void migrate_vma_pages(struct migrate_vma *migrate)
mmu_notifier_invalidate_range_start(&range);
}
migrate_vma_insert_page(migrate, addr, newpage,
- &migrate->src[i]);
+ &src_pfns[i]);
continue;
}
@@ -736,21 +748,26 @@ void migrate_vma_pages(struct migrate_vma *migrate)
* device private or coherent memory.
*/
if (mapping) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
} else if (is_zone_device_page(newpage)) {
/*
* Other types of ZONE_DEVICE page are not supported.
*/
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
- r = migrate_folio(mapping, page_folio(newpage),
- page_folio(page), MIGRATE_SYNC_NO_COPY);
+ if (migrate && migrate->fault_page == page)
+ r = migrate_folio_extra(mapping, page_folio(newpage),
+ page_folio(page),
+ MIGRATE_SYNC_NO_COPY, 1);
+ else
+ r = migrate_folio(mapping, page_folio(newpage),
+ page_folio(page), MIGRATE_SYNC_NO_COPY);
if (r != MIGRATEPAGE_SUCCESS)
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
}
/*
@@ -761,28 +778,56 @@ void migrate_vma_pages(struct migrate_vma *migrate)
if (notified)
mmu_notifier_invalidate_range_only_end(&range);
}
-EXPORT_SYMBOL(migrate_vma_pages);
/**
- * migrate_vma_finalize() - restore CPU page table entry
+ * migrate_device_pages() - migrate meta-data from src page to dst page
+ * @src_pfns: src_pfns returned from migrate_device_range()
+ * @dst_pfns: array of pfns allocated by the driver to migrate memory to
+ * @npages: number of pages in the range
+ *
+ * Equivalent to migrate_vma_pages(). This is called to migrate struct page
+ * meta-data from source struct page to destination.
+ */
+void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
+ unsigned long npages)
+{
+ __migrate_device_pages(src_pfns, dst_pfns, npages, NULL);
+}
+EXPORT_SYMBOL(migrate_device_pages);
+
+/**
+ * migrate_vma_pages() - migrate meta-data from src page to dst page
* @migrate: migrate struct containing all migration information
*
- * This replaces the special migration pte entry with either a mapping to the
- * new page if migration was successful for that page, or to the original page
- * otherwise.
+ * This migrates struct page meta-data from source struct page to destination
+ * struct page. This effectively finishes the migration from source page to the
+ * destination page.
+ */
+void migrate_vma_pages(struct migrate_vma *migrate)
+{
+ __migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate);
+}
+EXPORT_SYMBOL(migrate_vma_pages);
+
+/*
+ * migrate_device_finalize() - complete page migration
+ * @src_pfns: src_pfns returned from migrate_device_range()
+ * @dst_pfns: array of pfns allocated by the driver to migrate memory to
+ * @npages: number of pages in the range
*
- * This also unlocks the pages and puts them back on the lru, or drops the extra
- * refcount, for device pages.
+ * Completes migration of the page by removing special migration entries.
+ * Drivers must ensure copying of page data is complete and visible to the CPU
+ * before calling this.
*/
-void migrate_vma_finalize(struct migrate_vma *migrate)
+void migrate_device_finalize(unsigned long *src_pfns,
+ unsigned long *dst_pfns, unsigned long npages)
{
- const unsigned long npages = migrate->npages;
unsigned long i;
for (i = 0; i < npages; i++) {
struct folio *dst, *src;
- struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
if (!page) {
if (newpage) {
@@ -792,7 +837,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
continue;
}
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
+ if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
if (newpage) {
unlock_page(newpage);
put_page(newpage);
@@ -819,8 +864,72 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
}
}
}
+EXPORT_SYMBOL(migrate_device_finalize);
+
+/**
+ * migrate_vma_finalize() - restore CPU page table entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * This replaces the special migration pte entry with either a mapping to the
+ * new page if migration was successful for that page, or to the original page
+ * otherwise.
+ *
+ * This also unlocks the pages and puts them back on the lru, or drops the extra
+ * refcount, for device pages.
+ */
+void migrate_vma_finalize(struct migrate_vma *migrate)
+{
+ migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
+}
EXPORT_SYMBOL(migrate_vma_finalize);
+/**
+ * migrate_device_range() - migrate device private pfns to normal memory.
+ * @src_pfns: array large enough to hold migrating source device private pfns.
+ * @start: starting pfn in the range to migrate.
+ * @npages: number of pages to migrate.
+ *
+ * migrate_vma_setup() is similar in concept to migrate_vma_setup() except that
+ * instead of looking up pages based on virtual address mappings a range of
+ * device pfns that should be migrated to system memory is used instead.
+ *
+ * This is useful when a driver needs to free device memory but doesn't know the
+ * virtual mappings of every page that may be in device memory. For example this
+ * is often the case when a driver is being unloaded or unbound from a device.
+ *
+ * Like migrate_vma_setup() this function will take a reference and lock any
+ * migrating pages that aren't free before unmapping them. Drivers may then
+ * allocate destination pages and start copying data from the device to CPU
+ * memory before calling migrate_device_pages().
+ */
+int migrate_device_range(unsigned long *src_pfns, unsigned long start,
+ unsigned long npages)
+{
+ unsigned long i, pfn;
+
+ for (pfn = start, i = 0; i < npages; pfn++, i++) {
+ struct page *page = pfn_to_page(pfn);
+
+ if (!get_page_unless_zero(page)) {
+ src_pfns[i] = 0;
+ continue;
+ }
+
+ if (!trylock_page(page)) {
+ src_pfns[i] = 0;
+ put_page(page);
+ continue;
+ }
+
+ src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+ }
+
+ migrate_device_unmap(src_pfns, npages, NULL);
+
+ return 0;
+}
+EXPORT_SYMBOL(migrate_device_range);
+
/*
* Migrate a device coherent page back to normal memory. The caller should have
* a reference on page which will be copied to the new page if migration is
@@ -829,25 +938,19 @@ EXPORT_SYMBOL(migrate_vma_finalize);
int migrate_device_coherent_page(struct page *page)
{
unsigned long src_pfn, dst_pfn = 0;
- struct migrate_vma args;
struct page *dpage;
WARN_ON_ONCE(PageCompound(page));
lock_page(page);
src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
- args.src = &src_pfn;
- args.dst = &dst_pfn;
- args.cpages = 1;
- args.npages = 1;
- args.vma = NULL;
/*
* We don't have a VMA and don't need to walk the page tables to find
* the source page. So call migrate_vma_unmap() directly to unmap the
* page as migrate_vma_setup() will fail if args.vma == NULL.
*/
- migrate_vma_unmap(&args);
+ migrate_device_unmap(&src_pfn, 1, NULL);
if (!(src_pfn & MIGRATE_PFN_MIGRATE))
return -EBUSY;
@@ -857,10 +960,10 @@ int migrate_device_coherent_page(struct page *page)
dst_pfn = migrate_pfn(page_to_pfn(dpage));
}
- migrate_vma_pages(&args);
+ migrate_device_pages(&src_pfn, &dst_pfn, 1);
if (src_pfn & MIGRATE_PFN_MIGRATE)
copy_highpage(dpage, page);
- migrate_vma_finalize(&args);
+ migrate_device_finalize(&src_pfn, &dst_pfn, 1);
if (src_pfn & MIGRATE_PFN_MIGRATE)
return 0;
diff --git a/mm/mmap.c b/mm/mmap.c
index 6e447544f07d..bf2122af94e7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2673,7 +2673,7 @@ cannot_expand:
if (!arch_validate_flags(vma->vm_flags)) {
error = -EINVAL;
if (file)
- goto unmap_and_free_vma;
+ goto close_and_free_vma;
else
goto free_vma;
}
@@ -2742,6 +2742,9 @@ expanded:
validate_mm(mm);
return addr;
+close_and_free_vma:
+ if (vma->vm_ops && vma->vm_ops->close)
+ vma->vm_ops->close(vma);
unmap_and_free_vma:
fput(vma->vm_file);
vma->vm_file = NULL;
@@ -2942,17 +2945,18 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
if (vma &&
(!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) &&
((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) {
- mas->index = vma->vm_start;
- mas->last = addr + len - 1;
- vma_adjust_trans_huge(vma, addr, addr + len, 0);
+ mas_set_range(mas, vma->vm_start, addr + len - 1);
+ if (mas_preallocate(mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+
+ vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
if (vma->anon_vma) {
anon_vma_lock_write(vma->anon_vma);
anon_vma_interval_tree_pre_update_vma(vma);
}
vma->vm_end = addr + len;
vma->vm_flags |= VM_SOFTDIRTY;
- if (mas_store_gfp(mas, vma, GFP_KERNEL))
- goto mas_expand_failed;
+ mas_store_prealloc(mas, vma);
if (vma->anon_vma) {
anon_vma_interval_tree_post_update_vma(vma);
@@ -2993,13 +2997,6 @@ mas_store_fail:
vma_alloc_fail:
vm_unacct_memory(len >> PAGE_SHIFT);
return -ENOMEM;
-
-mas_expand_failed:
- if (vma->anon_vma) {
- anon_vma_interval_tree_post_update_vma(vma);
- anon_vma_unlock_write(vma->anon_vma);
- }
- return -ENOMEM;
}
int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
@@ -3240,6 +3237,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
out_vma_link:
if (new_vma->vm_ops && new_vma->vm_ops->close)
new_vma->vm_ops->close(new_vma);
+
+ if (new_vma->vm_file)
+ fput(new_vma->vm_file);
+
+ unlink_anon_vmas(new_vma);
out_free_mempol:
mpol_put(vma_policy(new_vma));
out_free_vma:
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index a71924bd38c0..add4244e5790 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -1,6 +1,7 @@
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
+#include <linux/kmsan-checks.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
@@ -265,6 +266,15 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
bool fullmm)
{
+ /*
+ * struct mmu_gather contains 7 1-bit fields packed into a 32-bit
+ * unsigned int value. The remaining 25 bits remain uninitialized
+ * and are never used, but KMSAN updates the origin for them in
+ * zap_pXX_range() in mm/memory.c, thus creating very long origin
+ * chains. This is technically correct, but consumes too much memory.
+ * Unpoisoning the whole structure will prevent creating such chains.
+ */
+ kmsan_unpoison_memory(tlb, sizeof(*tlb));
tlb->mm = mm;
tlb->fullmm = fullmm;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 461dcbd4f21a..668bfaa6ed2a 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -267,6 +267,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
} else {
/* It must be an none page, or what else?.. */
WARN_ON_ONCE(!pte_none(oldpte));
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
if (unlikely(uffd_wp && !vma_is_anonymous(vma))) {
/*
* For file-backed mem, we need to be able to
@@ -278,6 +279,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
make_pte_marker(PTE_MARKER_UFFD_WP));
pages++;
}
+#endif
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ac2c9f12a7b2..e20ade858e71 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3446,7 +3446,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
int pindex;
bool free_high;
- __count_vm_event(PGFREE);
+ __count_vm_events(PGFREE, 1 << order);
pindex = order_to_pindex(migratetype, order);
list_add(&page->pcp_list, &pcp->lists[pindex]);
pcp->count += 1 << order;
@@ -3803,7 +3803,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
pcp_spin_unlock_irqrestore(pcp, flags);
pcp_trylock_finish(UP_flags);
if (page) {
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
}
return page;
@@ -6823,6 +6823,14 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
+
+ /*
+ * ZONE_DEVICE pages are released directly to the driver page allocator
+ * which will set the page count to 1 when allocating the page.
+ */
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
+ pgmap->type == MEMORY_DEVICE_COHERENT)
+ set_page_count(page, 0);
}
/*