aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds2019-05-14 10:10:55 -0700
committerLinus Torvalds2019-05-14 10:10:55 -0700
commit318222a35bfb0ae9b5ff3e359a583463e6cfcd94 (patch)
tree6a8d921f7ac9915f3f12dd3fcd7efaaf6f16bb09
parent7e9890a3500d95c01511a4c45b7e7192dfa47ae2 (diff)
parent640be2d1ffbc1946f1547eb89b5005ed7542de99 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton: - a few misc things and hotfixes - ocfs2 - almost all of MM * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (139 commits) kernel/memremap.c: remove the unused device_private_entry_fault() export mm: delete find_get_entries_tag mm/huge_memory.c: make __thp_get_unmapped_area static mm/mprotect.c: fix compilation warning because of unused 'mm' variable mm/page-writeback: introduce tracepoint for wait_on_page_writeback() mm/vmscan: simplify trace_reclaim_flags and trace_shrink_flags mm/Kconfig: update "Memory Model" help text mm/vmscan.c: don't disable irq again when count pgrefill for memcg mm: memblock: make keeping memblock memory opt-in rather than opt-out hugetlbfs: always use address space in inode for resv_map pointer mm/z3fold.c: support page migration mm/z3fold.c: add structure for buddy handles mm/z3fold.c: improve compression by extending search mm/z3fold.c: introduce helper functions mm/page_alloc.c: remove unnecessary parameter in rmqueue_pcplist mm/hmm: add ARCH_HAS_HMM_MIRROR ARCH_HAS_HMM_DEVICE Kconfig mm/vmscan.c: simplify shrink_inactive_list() fs/sync.c: sync_file_range(2) may use WB_SYNC_ALL writeback xen/privcmd-buf.c: convert to use vm_map_pages_zero() xen/gntdev.c: convert to use vm_map_pages() ...
-rw-r--r--Documentation/sysctl/vm.txt12
-rw-r--r--Documentation/trace/postprocess/trace-vmscan-postprocess.pl7
-rw-r--r--Documentation/vm/hmm.rst94
-rw-r--r--MAINTAINERS1
-rw-r--r--arch/Kconfig7
-rw-r--r--arch/alpha/mm/init.c14
-rw-r--r--arch/arc/mm/init.c15
-rw-r--r--arch/arm/Kconfig3
-rw-r--r--arch/arm/mm/dma-mapping.c22
-rw-r--r--arch/arm/mm/init.c25
-rw-r--r--arch/arm64/Kconfig4
-rw-r--r--arch/arm64/include/asm/hugetlb.h4
-rw-r--r--arch/arm64/mm/init.c17
-rw-r--r--arch/arm64/mm/mmu.c6
-rw-r--r--arch/c6x/mm/init.c12
-rw-r--r--arch/h8300/mm/init.c14
-rw-r--r--arch/hexagon/Kconfig1
-rw-r--r--arch/hexagon/mm/init.c10
-rw-r--r--arch/ia64/Kconfig1
-rw-r--r--arch/ia64/mm/init.c17
-rw-r--r--arch/m68k/Kconfig1
-rw-r--r--arch/m68k/mm/init.c7
-rw-r--r--arch/microblaze/mm/init.c12
-rw-r--r--arch/mips/Kconfig1
-rw-r--r--arch/mips/mm/gup.c11
-rw-r--r--arch/mips/mm/init.c8
-rw-r--r--arch/nds32/mm/init.c12
-rw-r--r--arch/nios2/Kconfig1
-rw-r--r--arch/nios2/mm/init.c12
-rw-r--r--arch/openrisc/mm/init.c12
-rw-r--r--arch/parisc/mm/init.c7
-rw-r--r--arch/powerpc/Kconfig1
-rw-r--r--arch/powerpc/include/asm/book3s/64/hugetlb.h5
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c4
-rw-r--r--arch/powerpc/kvm/e500_mmu.c2
-rw-r--r--arch/powerpc/mm/book3s64/iommu_api.c5
-rw-r--r--arch/powerpc/mm/mem.c22
-rw-r--r--arch/powerpc/platforms/Kconfig.cputype2
-rw-r--r--arch/riscv/mm/init.c5
-rw-r--r--arch/s390/Kconfig3
-rw-r--r--arch/s390/include/asm/hugetlb.h8
-rw-r--r--arch/s390/kvm/interrupt.c2
-rw-r--r--arch/s390/mm/init.c19
-rw-r--r--arch/sh/Kconfig2
-rw-r--r--arch/sh/boards/mach-dreamcast/irq.c1
-rw-r--r--arch/sh/mm/gup.c11
-rw-r--r--arch/sh/mm/init.c29
-rw-r--r--arch/sparc/Kconfig1
-rw-r--r--arch/sparc/include/asm/pgtable_64.h30
-rw-r--r--arch/sparc/mm/gup.c9
-rw-r--r--arch/sparc/mm/init_32.c13
-rw-r--r--arch/sparc/mm/init_64.c8
-rw-r--r--arch/um/kernel/mem.c7
-rw-r--r--arch/unicore32/Kconfig1
-rw-r--r--arch/unicore32/mm/init.c24
-rw-r--r--arch/x86/Kconfig3
-rw-r--r--arch/x86/include/asm/hugetlb.h4
-rw-r--r--arch/x86/kvm/paging_tmpl.h2
-rw-r--r--arch/x86/kvm/svm.c2
-rw-r--r--arch/x86/mm/hugetlbpage.c2
-rw-r--r--arch/x86/mm/init_32.c11
-rw-r--r--arch/x86/mm/init_64.c20
-rw-r--r--arch/xtensa/mm/init.c5
-rw-r--r--drivers/base/memory.c24
-rw-r--r--drivers/dax/device.c6
-rw-r--r--drivers/firewire/core-iso.c15
-rw-r--r--drivers/fpga/dfl-afu-dma-region.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c8
-rw-r--r--drivers/gpu/drm/i915/i915_gem_userptr.c2
-rw-r--r--drivers/gpu/drm/radeon/radeon_mn.c4
-rw-r--r--drivers/gpu/drm/rockchip/rockchip_drm_gem.c17
-rw-r--r--drivers/gpu/drm/via/via_dmablit.c3
-rw-r--r--drivers/gpu/drm/xen/xen_drm_front_gem.c18
-rw-r--r--drivers/infiniband/core/umem.c5
-rw-r--r--drivers/infiniband/core/umem_odp.c5
-rw-r--r--drivers/infiniband/hw/hfi1/user_pages.c3
-rw-r--r--drivers/infiniband/hw/mthca/mthca_memfree.c3
-rw-r--r--drivers/infiniband/hw/qib/qib_user_pages.c8
-rw-r--r--drivers/infiniband/hw/qib/qib_user_sdma.c2
-rw-r--r--drivers/infiniband/hw/usnic/usnic_uiom.c9
-rw-r--r--drivers/iommu/dma-iommu.c12
-rw-r--r--drivers/media/common/videobuf2/videobuf2-core.c7
-rw-r--r--drivers/media/common/videobuf2/videobuf2-dma-contig.c6
-rw-r--r--drivers/media/common/videobuf2/videobuf2-dma-sg.c22
-rw-r--r--drivers/media/v4l2-core/videobuf-dma-sg.c6
-rw-r--r--drivers/misc/genwqe/card_utils.c2
-rw-r--r--drivers/misc/vmw_vmci/vmci_host.c2
-rw-r--r--drivers/misc/vmw_vmci/vmci_queue_pair.c6
-rw-r--r--drivers/platform/goldfish/goldfish_pipe.c3
-rw-r--r--drivers/rapidio/devices/rio_mport_cdev.c4
-rw-r--r--drivers/sbus/char/oradax.c2
-rw-r--r--drivers/scsi/st.c3
-rw-r--r--drivers/staging/gasket/gasket_page_table.c4
-rw-r--r--drivers/tee/tee_shm.c2
-rw-r--r--drivers/vfio/vfio_iommu_spapr_tce.c3
-rw-r--r--drivers/vfio/vfio_iommu_type1.c3
-rw-r--r--drivers/vhost/vhost.c2
-rw-r--r--drivers/video/fbdev/pvr2fb.c2
-rw-r--r--drivers/virt/fsl_hypervisor.c2
-rw-r--r--drivers/xen/gntdev.c19
-rw-r--r--drivers/xen/privcmd-buf.c8
-rw-r--r--fs/dax.c8
-rw-r--r--fs/hugetlbfs/inode.c18
-rw-r--r--fs/io_uring.c5
-rw-r--r--fs/ocfs2/dir.c20
-rw-r--r--fs/ocfs2/export.c30
-rw-r--r--fs/ocfs2/ocfs2_fs.h28
-rw-r--r--fs/orangefs/orangefs-bufmap.c2
-rw-r--r--fs/proc/task_mmu.c3
-rw-r--r--fs/sync.c21
-rw-r--r--fs/userfaultfd.c5
-rw-r--r--include/asm-generic/hugetlb.h7
-rw-r--r--include/linux/balloon_compaction.h15
-rw-r--r--include/linux/gfp.h4
-rw-r--r--include/linux/hmm.h310
-rw-r--r--include/linux/huge_mm.h6
-rw-r--r--include/linux/hugetlb.h4
-rw-r--r--include/linux/list.h18
-rw-r--r--include/linux/memblock.h44
-rw-r--r--include/linux/memcontrol.h34
-rw-r--r--include/linux/memory.h2
-rw-r--r--include/linux/memory_hotplug.h42
-rw-r--r--include/linux/mm.h114
-rw-r--r--include/linux/mm_inline.h2
-rw-r--r--include/linux/mm_types.h2
-rw-r--r--include/linux/mmu_notifier.h63
-rw-r--r--include/linux/mmzone.h5
-rw-r--r--include/linux/pagemap.h26
-rw-r--r--include/linux/userfaultfd_k.h2
-rw-r--r--include/linux/vmstat.h2
-rw-r--r--include/trace/events/compaction.h10
-rw-r--r--include/trace/events/vmscan.h98
-rw-r--r--include/trace/events/writeback.h16
-rw-r--r--include/uapi/linux/fs.h3
-rw-r--r--init/initramfs.c147
-rw-r--r--init/main.c5
-rw-r--r--kernel/events/uprobes.c3
-rw-r--r--kernel/futex.c2
-rw-r--r--kernel/kexec_file.c16
-rw-r--r--kernel/memremap.c13
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/sysctl.c12
-rw-r--r--lib/iov_iter.c7
-rw-r--r--mm/Kconfig82
-rw-r--r--mm/Kconfig.debug1
-rw-r--r--mm/cma.c23
-rw-r--r--mm/cma_debug.c2
-rw-r--r--mm/compaction.c4
-rw-r--r--mm/filemap.c216
-rw-r--r--mm/gup.c393
-rw-r--r--mm/gup_benchmark.c5
-rw-r--r--mm/hmm.c1086
-rw-r--r--mm/huge_memory.c35
-rw-r--r--mm/hugetlb.c180
-rw-r--r--mm/khugepaged.c7
-rw-r--r--mm/ksm.c6
-rw-r--r--mm/madvise.c3
-rw-r--r--mm/memblock.c70
-rw-r--r--mm/memcontrol.c85
-rw-r--r--mm/memfd.c2
-rw-r--r--mm/memory.c106
-rw-r--r--mm/memory_hotplug.c129
-rw-r--r--mm/migrate.c7
-rw-r--r--mm/mmu_notifier.c12
-rw-r--r--mm/mprotect.c9
-rw-r--r--mm/mremap.c3
-rw-r--r--mm/nommu.c14
-rw-r--r--mm/oom_kill.c3
-rw-r--r--mm/page-writeback.c12
-rw-r--r--mm/page_alloc.c270
-rw-r--r--mm/page_isolation.c2
-rw-r--r--mm/rmap.c10
-rw-r--r--mm/shmem.c2
-rw-r--r--mm/slab.c61
-rw-r--r--mm/slob.c59
-rw-r--r--mm/slub.c72
-rw-r--r--mm/sparse.c16
-rw-r--r--mm/swap.c2
-rw-r--r--mm/swap_state.c4
-rw-r--r--mm/userfaultfd.c3
-rw-r--r--mm/util.c59
-rw-r--r--mm/vmscan.c203
-rw-r--r--mm/workingset.c5
-rw-r--r--mm/z3fold.c638
-rw-r--r--net/ceph/pagevec.c2
-rw-r--r--net/rds/info.c2
-rw-r--r--net/rds/rdma.c3
-rw-r--r--net/xdp/xdp_umem.c4
-rw-r--r--virt/kvm/kvm_main.c3
189 files changed, 3646 insertions, 2332 deletions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 3f13d8599337..749322060f10 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm:
- stat_refresh
- numa_stat
- swappiness
+- unprivileged_userfaultfd
- user_reserve_kbytes
- vfs_cache_pressure
- watermark_boost_factor
@@ -818,6 +819,17 @@ The default value is 60.
==============================================================
+unprivileged_userfaultfd
+
+This flag controls whether unprivileged users can use the userfaultfd
+system calls. Set this to 1 to allow unprivileged users to use the
+userfaultfd system calls, or set this to 0 to restrict userfaultfd to only
+privileged users (with SYS_CAP_PTRACE capability).
+
+The default value is 1.
+
+==============================================================
+
- user_reserve_kbytes
When overcommit_memory is set to 2, "never overcommit" mode, reserve
diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
index 66bfd8396877..995da15b16ca 100644
--- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
+++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
@@ -113,7 +113,7 @@ my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)';
my $regex_kswapd_sleep_default = 'nid=([0-9]*)';
my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)';
my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)';
-my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
+my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate_anon=([0-9]*) nr_activate_file=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)';
my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)';
@@ -212,7 +212,8 @@ $regex_lru_shrink_inactive = generate_traceevent_regex(
"vmscan/mm_vmscan_lru_shrink_inactive",
$regex_lru_shrink_inactive_default,
"nid", "nr_scanned", "nr_reclaimed", "nr_dirty", "nr_writeback",
- "nr_congested", "nr_immediate", "nr_activate", "nr_ref_keep",
+ "nr_congested", "nr_immediate", "nr_activate_anon",
+ "nr_activate_file", "nr_ref_keep",
"nr_unmap_fail", "priority", "flags");
$regex_lru_shrink_active = generate_traceevent_regex(
"vmscan/mm_vmscan_lru_shrink_active",
@@ -407,7 +408,7 @@ EVENT_PROCESS:
}
my $nr_reclaimed = $3;
- my $flags = $12;
+ my $flags = $13;
my $file = 0;
if ($flags =~ /RECLAIM_WB_FILE/) {
$file = 1;
diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 44205f0b671f..ec1efa32af3c 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -189,20 +189,10 @@ the driver callback returns.
When the device driver wants to populate a range of virtual addresses, it can
use either::
- int hmm_vma_get_pfns(struct vm_area_struct *vma,
- struct hmm_range *range,
- unsigned long start,
- unsigned long end,
- hmm_pfn_t *pfns);
- int hmm_vma_fault(struct vm_area_struct *vma,
- struct hmm_range *range,
- unsigned long start,
- unsigned long end,
- hmm_pfn_t *pfns,
- bool write,
- bool block);
-
-The first one (hmm_vma_get_pfns()) will only fetch present CPU page table
+ long hmm_range_snapshot(struct hmm_range *range);
+ long hmm_range_fault(struct hmm_range *range, bool block);
+
+The first one (hmm_range_snapshot()) will only fetch present CPU page table
entries and will not trigger a page fault on missing or non-present entries.
The second one does trigger a page fault on missing or read-only entry if the
write parameter is true. Page faults use the generic mm page fault code path
@@ -220,25 +210,56 @@ respect in order to keep things properly synchronized. The usage pattern is::
{
struct hmm_range range;
...
+
+ range.start = ...;
+ range.end = ...;
+ range.pfns = ...;
+ range.flags = ...;
+ range.values = ...;
+ range.pfn_shift = ...;
+ hmm_range_register(&range);
+
+ /*
+ * Just wait for range to be valid, safe to ignore return value as we
+ * will use the return value of hmm_range_snapshot() below under the
+ * mmap_sem to ascertain the validity of the range.
+ */
+ hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
+
again:
- ret = hmm_vma_get_pfns(vma, &range, start, end, pfns);
- if (ret)
+ down_read(&mm->mmap_sem);
+ ret = hmm_range_snapshot(&range);
+ if (ret) {
+ up_read(&mm->mmap_sem);
+ if (ret == -EAGAIN) {
+ /*
+ * No need to check hmm_range_wait_until_valid() return value
+ * on retry we will get proper error with hmm_range_snapshot()
+ */
+ hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
+ goto again;
+ }
+ hmm_mirror_unregister(&range);
return ret;
+ }
take_lock(driver->update);
- if (!hmm_vma_range_done(vma, &range)) {
+ if (!range.valid) {
release_lock(driver->update);
+ up_read(&mm->mmap_sem);
goto again;
}
// Use pfns array content to update device page table
+ hmm_mirror_unregister(&range);
release_lock(driver->update);
+ up_read(&mm->mmap_sem);
return 0;
}
The driver->update lock is the same lock that the driver takes inside its
-update() callback. That lock must be held before hmm_vma_range_done() to avoid
-any race with a concurrent CPU page table update.
+update() callback. That lock must be held before checking the range.valid
+field to avoid any race with a concurrent CPU page table update.
HMM implements all this on top of the mmu_notifier API because we wanted a
simpler API and also to be able to perform optimizations latter on like doing
@@ -255,6 +276,41 @@ report commands as executed is serialized (there is no point in doing this
concurrently).
+Leverage default_flags and pfn_flags_mask
+=========================================
+
+The hmm_range struct has 2 fields default_flags and pfn_flags_mask that allows
+to set fault or snapshot policy for a whole range instead of having to set them
+for each entries in the range.
+
+For instance if the device flags for device entries are:
+ VALID (1 << 63)
+ WRITE (1 << 62)
+
+Now let say that device driver wants to fault with at least read a range then
+it does set:
+ range->default_flags = (1 << 63)
+ range->pfn_flags_mask = 0;
+
+and calls hmm_range_fault() as described above. This will fill fault all page
+in the range with at least read permission.
+
+Now let say driver wants to do the same except for one page in the range for
+which its want to have write. Now driver set:
+ range->default_flags = (1 << 63);
+ range->pfn_flags_mask = (1 << 62);
+ range->pfns[index_of_write] = (1 << 62);
+
+With this HMM will fault in all page with at least read (ie valid) and for the
+address == range->start + (index_of_write << PAGE_SHIFT) it will fault with
+write permission ie if the CPU pte does not have write permission set then HMM
+will call handle_mm_fault().
+
+Note that HMM will populate the pfns array with write permission for any entry
+that have write permission within the CPU pte no matter what are the values set
+in default_flags or pfn_flags_mask.
+
+
Represent and manage device memory from core kernel point of view
=================================================================
diff --git a/MAINTAINERS b/MAINTAINERS
index fb9f9d71f7a2..372e60e416f6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11746,6 +11746,7 @@ F: include/linux/oprofile.h
ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
M: Mark Fasheh <mark@fasheh.com>
M: Joel Becker <jlbec@evilplan.org>
+M: Joseph Qi <joseph.qi@linux.alibaba.com>
L: ocfs2-devel@oss.oracle.com (moderated for non-subscribers)
W: http://ocfs2.wiki.kernel.org
S: Supported
diff --git a/arch/Kconfig b/arch/Kconfig
index 5e43fcbad4ca..f11f0698b148 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -245,6 +245,13 @@ config ARCH_HAS_FORTIFY_SOURCE
An architecture should select this when it can successfully
build and run with CONFIG_FORTIFY_SOURCE.
+#
+# Select if the arch provides a historic keepinit alias for the retain_initrd
+# command line option
+#
+config ARCH_HAS_KEEPINITRD
+ bool
+
# Select if arch has all set_memory_ro/rw/x/nx() functions in asm/cacheflush.h
config ARCH_HAS_SET_MEMORY
bool
diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index a42fc5c4db89..e2cbec3789e8 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -285,17 +285,3 @@ mem_init(void)
memblock_free_all();
mem_init_print_info(NULL);
}
-
-void
-free_initmem(void)
-{
- free_initmem_default(-1);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void
-free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index e1ab2d7f1d64..02b7a3b20d7c 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -206,18 +206,3 @@ void __init mem_init(void)
memblock_free_all();
mem_init_print_info(NULL);
}
-
-/*
- * free_initmem: Free all the __init memory.
- */
-void __ref free_initmem(void)
-{
- free_initmem_default(-1);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index dc9855c4a3b4..5fd344bd06b9 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -4,11 +4,11 @@ config ARM
default y
select ARCH_32BIT_OFF_T
select ARCH_CLOCKSOURCE_DATA
- select ARCH_DISCARD_MEMBLOCK if !HAVE_ARCH_PFN_VALID && !KEXEC
select ARCH_HAS_DEBUG_VIRTUAL if MMU
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FORTIFY_SOURCE
+ select ARCH_HAS_KEEPINITRD
select ARCH_HAS_KCOV
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_PTE_SPECIAL if ARM_LPAE
@@ -21,6 +21,7 @@ config ARM
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAVE_CUSTOM_GPIO_H
select ARCH_HAS_GCOV_PROFILE_ALL
+ select ARCH_KEEP_MEMBLOCK if HAVE_ARCH_PFN_VALID || KEXEC
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_NO_SG_CHAIN if !ARM_HAS_SG_CHAIN
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 43f46aa7ef33..0a75058c11f3 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -1577,31 +1577,21 @@ static int __arm_iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
{
- unsigned long uaddr = vma->vm_start;
- unsigned long usize = vma->vm_end - vma->vm_start;
struct page **pages = __iommu_get_pages(cpu_addr, attrs);
unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
- unsigned long off = vma->vm_pgoff;
+ int err;
if (!pages)
return -ENXIO;
- if (off >= nr_pages || (usize >> PAGE_SHIFT) > nr_pages - off)
+ if (vma->vm_pgoff >= nr_pages)
return -ENXIO;
- pages += off;
-
- do {
- int ret = vm_insert_page(vma, uaddr, *pages++);
- if (ret) {
- pr_err("Remapping memory failed: %d\n", ret);
- return ret;
- }
- uaddr += PAGE_SIZE;
- usize -= PAGE_SIZE;
- } while (usize > 0);
+ err = vm_map_pages(vma, pages, nr_pages);
+ if (err)
+ pr_err("Remapping memory failed: %d\n", err);
- return 0;
+ return err;
}
static int arm_iommu_mmap_attrs(struct device *dev,
struct vm_area_struct *vma, void *cpu_addr,
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index c2daabbe0af0..68dcd5f8d7c6 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -695,27 +695,14 @@ void free_initmem(void)
}
#ifdef CONFIG_BLK_DEV_INITRD
-
-static int keep_initrd;
-
void free_initrd_mem(unsigned long start, unsigned long end)
{
- if (!keep_initrd) {
- if (start == initrd_start)
- start = round_down(start, PAGE_SIZE);
- if (end == initrd_end)
- end = round_up(end, PAGE_SIZE);
+ if (start == initrd_start)
+ start = round_down(start, PAGE_SIZE);
+ if (end == initrd_end)
+ end = round_up(end, PAGE_SIZE);
- poison_init_mem((void *)start, PAGE_ALIGN(end) - start);
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
- }
+ poison_init_mem((void *)start, PAGE_ALIGN(end) - start);
+ free_reserved_area((void *)start, (void *)end, -1, "initrd");
}
-
-static int __init keepinitrd_setup(char *__unused)
-{
- keep_initrd = 1;
- return 1;
-}
-
-__setup("keepinitrd", keepinitrd_setup);
#endif
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3f957443f286..69a59a5d1143 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -19,8 +19,9 @@ config ARM64
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
- select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
+ select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_KCOV
+ select ARCH_HAS_KEEPINITRD
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SETUP_DMA_OPS
@@ -59,6 +60,7 @@ config ARM64
select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPT
select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPT
select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPT
+ select ARCH_KEEP_MEMBLOCK
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index c6a07a3b433e..4aad6382f631 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -70,8 +70,4 @@ extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
#include <asm-generic/hugetlb.h>
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void) { return true; }
-#endif
-
#endif /* __ASM_HUGETLB_H */
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 40e2d7e5efcb..007c05a4cce0 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -578,24 +578,11 @@ void free_initmem(void)
}
#ifdef CONFIG_BLK_DEV_INITRD
-
-static int keep_initrd __initdata;
-
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
- if (!keep_initrd) {
- free_reserved_area((void *)start, (void *)end, 0, "initrd");
- memblock_free(__virt_to_phys(start), end - start);
- }
-}
-
-static int __init keepinitrd_setup(char *__unused)
-{
- keep_initrd = 1;
- return 1;
+ free_reserved_area((void *)start, (void *)end, 0, "initrd");
+ memblock_free(__virt_to_phys(start), end - start);
}
-
-__setup("keepinitrd", keepinitrd_setup);
#endif
/*
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index ef82312860ac..ef32d4839c3f 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1065,8 +1065,8 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_restrictions *restrictions)
{
int flags = 0;
@@ -1077,6 +1077,6 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
- altmap, want_memblock);
+ restrictions);
}
#endif
diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c
index fe582c3a1794..573242b160e1 100644
--- a/arch/c6x/mm/init.c
+++ b/arch/c6x/mm/init.c
@@ -68,15 +68,3 @@ void __init mem_init(void)
mem_init_print_info(NULL);
}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
-void __init free_initmem(void)
-{
- free_initmem_default(-1);
-}
diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c
index 0f04a5e9aa4f..1eab16b1a0bc 100644
--- a/arch/h8300/mm/init.c
+++ b/arch/h8300/mm/init.c
@@ -102,17 +102,3 @@ void __init mem_init(void)
mem_init_print_info(NULL);
}
-
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
-void
-free_initmem(void)
-{
- free_initmem_default(-1);
-}
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 3e54a53208d5..b7d404bbaa0f 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -22,7 +22,6 @@ config HEXAGON
select GENERIC_IRQ_SHOW
select HAVE_ARCH_KGDB
select HAVE_ARCH_TRACEHOOK
- select ARCH_DISCARD_MEMBLOCK
select NEED_SG_DMA_LENGTH
select NO_IOPORT_MAP
select GENERIC_IOMAP
diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c
index 1719ede9e9bd..41cf34243ea1 100644
--- a/arch/hexagon/mm/init.c
+++ b/arch/hexagon/mm/init.c
@@ -85,16 +85,6 @@ void __init mem_init(void)
}
/*
- * free_initmem - frees memory used by stuff declared with __init
- *
- * Todo: free pages between __init_begin and __init_end; possibly
- * some devtree related stuff as well.
- */
-void __ref free_initmem(void)
-{
-}
-
-/*
* free_initrd_mem - frees... initrd memory.
* @start - start of init memory
* @end - end of init memory
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 73a26f04644e..7468d8e50467 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -33,7 +33,6 @@ config IA64
select ARCH_HAS_DMA_COHERENT_TO_PFN if SWIOTLB
select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB
select VIRT_TO_BUS
- select ARCH_DISCARD_MEMBLOCK
select GENERIC_IRQ_PROBE
select GENERIC_PENDING_IRQ if SMP
select GENERIC_IRQ_SHOW
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index e49200e31750..d28e29103bdb 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -666,14 +666,14 @@ mem_init (void)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_restrictions *restrictions)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
- ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
if (ret)
printk("%s: Problem encountered in __add_pages() as ret=%d\n",
__func__, ret);
@@ -682,20 +682,15 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
+void arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
struct zone *zone;
- int ret;
zone = page_zone(pfn_to_page(start_pfn));
- ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
- if (ret)
- pr_warn("%s: Problem encountered in __remove_pages() as"
- " ret=%d\n", __func__, ret);
-
- return ret;
+ __remove_pages(zone, start_pfn, nr_pages, altmap);
}
#endif
#endif
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index fe5cc2da6d10..218e037ef901 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -26,7 +26,6 @@ config M68K
select MODULES_USE_ELF_RELA
select OLD_SIGSUSPEND3
select OLD_SIGACTION
- select ARCH_DISCARD_MEMBLOCK
select MMU_GATHER_NO_RANGE if MMU
config CPU_BIG_ENDIAN
diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c
index 8868a4c9adae..778cacb7d57b 100644
--- a/arch/m68k/mm/init.c
+++ b/arch/m68k/mm/init.c
@@ -147,10 +147,3 @@ void __init mem_init(void)
init_pointer_tables();
mem_init_print_info(NULL);
}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 7e97d44f6538..a015a951c8b7 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -186,18 +186,6 @@ void __init setup_memory(void)
paging_init();
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
-void free_initmem(void)
-{
- free_initmem_default(-1);
-}
-
void __init mem_init(void)
{
high_memory = (void *)__va(memory_start + lowmem_size - 1);
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index ff8cff9fcf54..677e5bfeff47 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -5,7 +5,6 @@ config MIPS
select ARCH_32BIT_OFF_T if !64BIT
select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT
select ARCH_CLOCKSOURCE_DATA
- select ARCH_DISCARD_MEMBLOCK
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UBSAN_SANITIZE_ALL
diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
index 0d14e0d8eacf..4c2b4483683c 100644
--- a/arch/mips/mm/gup.c
+++ b/arch/mips/mm/gup.c
@@ -235,7 +235,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
@@ -247,8 +247,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*/
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
{
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
@@ -273,7 +273,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE,
+ pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
local_irq_enable();
@@ -289,7 +290,7 @@ slow_irqon:
pages += nr;
ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT,
- pages, write ? FOLL_WRITE : 0);
+ pages, gup_flags);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index bbb196ad5f26..8a038b30d3c4 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -504,14 +504,6 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end)
printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
- "initrd");
-}
-#endif
-
void (*free_init_pages_eva)(void *begin, void *end) = NULL;
void __ref free_initmem(void)
diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c
index 1d03633f89a9..1a4ab1b7525f 100644
--- a/arch/nds32/mm/init.c
+++ b/arch/nds32/mm/init.c
@@ -252,18 +252,6 @@ void __init mem_init(void)
return;
}
-void free_initmem(void)
-{
- free_initmem_default(-1);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
void __set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags)
{
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index ea37394ff3ea..26a9c760a98b 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -23,7 +23,6 @@ config NIOS2
select SPARSE_IRQ
select USB_ARCH_HAS_HCD if USB_SUPPORT
select CPU_NO_EFFICIENT_FFS
- select ARCH_DISCARD_MEMBLOCK
select MMU_GATHER_NO_RANGE if MMU
config GENERIC_CSUM
diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c
index 16cea5776b87..2c609c2516b2 100644
--- a/arch/nios2/mm/init.c
+++ b/arch/nios2/mm/init.c
@@ -82,18 +82,6 @@ void __init mmu_init(void)
flush_tlb_all();
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
-void __ref free_initmem(void)
-{
- free_initmem_default(-1);
-}
-
#define __page_aligned(order) __aligned(PAGE_SIZE << (order))
pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned(PGD_ORDER);
pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned(PTE_ORDER);
diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c
index caeb4184e8a6..abe87e54e231 100644
--- a/arch/openrisc/mm/init.c
+++ b/arch/openrisc/mm/init.c
@@ -223,15 +223,3 @@ void __init mem_init(void)
mem_init_done = 1;
return;
}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
-void free_initmem(void)
-{
- free_initmem_default(-1);
-}
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 3b0f9eab7f2c..11ec1f1221a6 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -917,10 +917,3 @@ void flush_tlb_all(void)
spin_unlock(&sid_lock);
}
#endif
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d7996cfaceca..8c1c636308c8 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -137,6 +137,7 @@ config PPC
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64
select ARCH_HAVE_NMI_SAFE_CMPXCHG
+ select ARCH_KEEP_MEMBLOCK
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h
index 56140d19c85f..12e150e615b7 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -36,8 +36,8 @@ static inline int hstate_get_psize(struct hstate *hstate)
}
}
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void)
+#define __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED
+static inline bool gigantic_page_runtime_supported(void)
{
/*
* We used gigantic page reservation with hypervisor assist in some case.
@@ -49,7 +49,6 @@ static inline bool gigantic_page_supported(void)
return true;
}
-#endif
/* hugepd entry valid bit */
#define HUGEPD_VAL_BITS (0x8000000000000000UL)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index be7bc070eae5..ab3d484c5e2e 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -600,7 +600,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
/* If writing != 0, then the HPTE must allow writing, if we get here */
write_ok = writing;
hva = gfn_to_hva_memslot(memslot, gfn);
- npages = get_user_pages_fast(hva, 1, writing, pages);
+ npages = get_user_pages_fast(hva, 1, writing ? FOLL_WRITE : 0, pages);
if (npages < 1) {
/* Check if it's an I/O mapping */
down_read(&current->mm->mmap_sem);
@@ -1193,7 +1193,7 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
goto err;
hva = gfn_to_hva_memslot(memslot, gfn);
- npages = get_user_pages_fast(hva, 1, 1, pages);
+ npages = get_user_pages_fast(hva, 1, FOLL_WRITE, pages);
if (npages < 1)
goto err;
page = pages[0];
diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
index 24296f4cadc6..e0af53fd78c5 100644
--- a/arch/powerpc/kvm/e500_mmu.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -783,7 +783,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
if (!pages)
return -ENOMEM;
- ret = get_user_pages_fast(cfg->array, num_pages, 1, pages);
+ ret = get_user_pages_fast(cfg->array, num_pages, FOLL_WRITE, pages);
if (ret < 0)
goto free_pages;
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index 8330f135294f..5c521f3924a5 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -141,8 +141,9 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
for (entry = 0; entry < entries; entry += chunk) {
unsigned long n = min(entries - entry, chunk);
- ret = get_user_pages_longterm(ua + (entry << PAGE_SHIFT), n,
- FOLL_WRITE, mem->hpages + entry, NULL);
+ ret = get_user_pages(ua + (entry << PAGE_SHIFT), n,
+ FOLL_WRITE | FOLL_LONGTERM,
+ mem->hpages + entry, NULL);
if (ret == n) {
pinned += n;
continue;
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index cd525d709072..e885fe2aafcc 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -109,8 +109,8 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end)
return -ENODEV;
}
-int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+int __ref arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_restrictions *restrictions)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -127,11 +127,11 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altm
}
flush_inval_dcache_range(start, start + size);
- return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ return __add_pages(nid, start_pfn, nr_pages, restrictions);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-int __ref arch_remove_memory(int nid, u64 start, u64 size,
+void __ref arch_remove_memory(int nid, u64 start, u64 size,
struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
@@ -147,14 +147,13 @@ int __ref arch_remove_memory(int nid, u64 start, u64 size,
if (altmap)
page += vmem_altmap_offset(altmap);
- ret = __remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
- if (ret)
- return ret;
+ __remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
/* Remove htab bolted mappings for this section of memory */
start = (unsigned long)__va(start);
flush_inval_dcache_range(start, start + size);
ret = remove_section_mapping(start, start + size);
+ WARN_ON_ONCE(ret);
/* Ensure all vmalloc mappings are flushed in case they also
* hit that section of memory
@@ -163,8 +162,6 @@ int __ref arch_remove_memory(int nid, u64 start, u64 size,
if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC)
pr_warn("Hash collision while resizing HPT\n");
-
- return ret;
}
#endif
#endif /* CONFIG_MEMORY_HOTPLUG */
@@ -338,13 +335,6 @@ void free_initmem(void)
free_initmem_default(POISON_FREE_INITMEM);
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
/*
* This is called when a page has been modified by the kernel.
* It just marks the page as not i-cache clean. We do the i-cache
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index d0e172d47574..2794235e9d3e 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -331,7 +331,7 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK
config PPC_RADIX_MMU
bool "Radix MMU Support"
depends on PPC_BOOK3S_64 && HUGETLB_PAGE
- select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
+ select ARCH_HAS_GIGANTIC_PAGE
select PPC_HAVE_KUEP
select PPC_HAVE_KUAP
default y
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index bc7b77e34d09..8bf6f9c2d48c 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -66,11 +66,6 @@ void __init mem_init(void)
mem_init_print_info(NULL);
}
-void free_initmem(void)
-{
- free_initmem_default(0);
-}
-
#ifdef CONFIG_BLK_DEV_INITRD
static void __init setup_initrd(void)
{
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 07485582d027..109243fdb6ec 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -63,7 +63,7 @@ config S390
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
- select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
+ select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_KCOV
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SET_MEMORY
@@ -100,6 +100,7 @@ config S390
select ARCH_INLINE_WRITE_UNLOCK_BH
select ARCH_INLINE_WRITE_UNLOCK_IRQ
select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+ select ARCH_KEEP_MEMBLOCK
select ARCH_SAVE_PAGE_KEYS if HIBERNATION
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_NUMA_BALANCING
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index 2d1afa58a4b6..bb59dd964590 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -116,7 +116,9 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
return pte_modify(pte, newprot);
}
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void) { return true; }
-#endif
+static inline bool gigantic_page_runtime_supported(void)
+{
+ return true;
+}
+
#endif /* _ASM_S390_HUGETLB_H */
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 37503ae62486..1fd706f6206c 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -2376,7 +2376,7 @@ static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr)
ret = -EFAULT;
goto out;
}
- ret = get_user_pages_fast(map->addr, 1, 1, &map->page);
+ ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page);
if (ret < 0)
goto out;
BUG_ON(ret != 1);
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 7cf48eefec8f..14d1eae9fe43 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -157,14 +157,6 @@ void free_initmem(void)
free_initmem_default(POISON_FREE_INITMEM);
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
- "initrd");
-}
-#endif
-
unsigned long memory_block_size_bytes(void)
{
/*
@@ -227,8 +219,8 @@ device_initcall(s390_cma_mem_init);
#endif /* CONFIG_CMA */
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_restrictions *restrictions)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long size_pages = PFN_DOWN(size);
@@ -238,21 +230,22 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
if (rc)
return rc;
- rc = __add_pages(nid, start_pfn, size_pages, altmap, want_memblock);
+ rc = __add_pages(nid, start_pfn, size_pages, restrictions);
if (rc)
vmem_remove_mapping(start, size);
return rc;
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
+void arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap)
{
/*
* There is no hardware or firmware interface which could trigger a
* hot memory remove on s390. So there is nothing that needs to be
* implemented.
*/
- return -EBUSY;
+ BUG();
}
#endif
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 0be08d586d40..b77f512bb176 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -10,7 +10,6 @@ config SUPERH
select DMA_DECLARE_COHERENT
select HAVE_IDE if HAS_IOPORT_MAP
select HAVE_MEMBLOCK_NODE_MAP
- select ARCH_DISCARD_MEMBLOCK
select HAVE_OPROFILE
select HAVE_ARCH_TRACEHOOK
select HAVE_PERF_EVENTS
@@ -53,6 +52,7 @@ config SUPERH
select HAVE_FUTEX_CMPXCHG if FUTEX
select HAVE_NMI
select NEED_SG_DMA_LENGTH
+ select ARCH_HAS_GIGANTIC_PAGE
help
The SuperH is a RISC processor targeted for use in embedded systems
diff --git a/arch/sh/boards/mach-dreamcast/irq.c b/arch/sh/boards/mach-dreamcast/irq.c
index a929f764ae04..cc06e4cdb4cd 100644
--- a/arch/sh/boards/mach-dreamcast/irq.c
+++ b/arch/sh/boards/mach-dreamcast/irq.c
@@ -10,7 +10,6 @@
*/
#include <linux/irq.h>
#include <linux/io.h>
-#include <linux/irq.h>
#include <linux/export.h>
#include <linux/err.h>
#include <mach/sysasic.h>
diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c
index 3e27f6d1f1ec..277c882f7489 100644
--- a/arch/sh/mm/gup.c
+++ b/arch/sh/mm/gup.c
@@ -204,7 +204,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
@@ -216,8 +216,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*/
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
{
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
@@ -241,7 +241,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE,
+ pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
local_irq_enable();
@@ -261,7 +262,7 @@ slow_irqon:
ret = get_user_pages_unlocked(start,
(end - start) >> PAGE_SHIFT, pages,
- write ? FOLL_WRITE : 0);
+ gup_flags);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 70621324db41..b95e343e3c9d 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -403,28 +403,16 @@ void __init mem_init(void)
mem_init_done = 1;
}
-void free_initmem(void)
-{
- free_initmem_default(-1);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_restrictions *restrictions)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
/* We only have ZONE_NORMAL, so this is easy.. */
- ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
if (unlikely(ret))
printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
@@ -441,20 +429,15 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif
#ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
+void arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long nr_pages = size >> PAGE_SHIFT;
struct zone *zone;
- int ret;
zone = page_zone(pfn_to_page(start_pfn));
- ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
- if (unlikely(ret))
- pr_warn("%s: Failed, __remove_pages() == %d\n", __func__,
- ret);
-
- return ret;
+ __remove_pages(zone, start_pfn, nr_pages, altmap);
}
#endif
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index f6421c9ce5d3..7c93f3121ee6 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -92,6 +92,7 @@ config SPARC64
select ARCH_CLOCKSOURCE_DATA
select ARCH_HAS_PTE_SPECIAL
select PCI_DOMAINS if PCI
+ select ARCH_HAS_GIGANTIC_PAGE
config ARCH_DEFCONFIG
string
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 1393a8ac596b..22500c3be7a9 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -231,36 +231,6 @@ extern unsigned long _PAGE_ALL_SZ_BITS;
extern struct page *mem_map_zero;
#define ZERO_PAGE(vaddr) (mem_map_zero)
-/* This macro must be updated when the size of struct page grows above 80
- * or reduces below 64.
- * The idea that compiler optimizes out switch() statement, and only
- * leaves clrx instructions
- */
-#define mm_zero_struct_page(pp) do { \
- unsigned long *_pp = (void *)(pp); \
- \
- /* Check that struct page is either 64, 72, or 80 bytes */ \
- BUILD_BUG_ON(sizeof(struct page) & 7); \
- BUILD_BUG_ON(sizeof(struct page) < 64); \
- BUILD_BUG_ON(sizeof(struct page) > 80); \
- \
- switch (sizeof(struct page)) { \
- case 80: \
- _pp[9] = 0; /* fallthrough */ \
- case 72: \
- _pp[8] = 0; /* fallthrough */ \
- default: \
- _pp[7] = 0; \
- _pp[6] = 0; \
- _pp[5] = 0; \
- _pp[4] = 0; \
- _pp[3] = 0; \
- _pp[2] = 0; \
- _pp[1] = 0; \
- _pp[0] = 0; \
- } \
-} while (0)
-
/* PFNs are real physical page numbers. However, mem_map only begins to record
* per-page information starting at pfn_base. This is to handle systems where
* the first physical page in the machine is at some huge physical address,
diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c
index aee6dba83d0e..1e770a517d4a 100644
--- a/arch/sparc/mm/gup.c
+++ b/arch/sparc/mm/gup.c
@@ -245,8 +245,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
return nr;
}
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
{
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
@@ -303,7 +303,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
- if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE,
+ pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
@@ -324,7 +325,7 @@ slow:
ret = get_user_pages_unlocked(start,
(end - start) >> PAGE_SHIFT, pages,
- write ? FOLL_WRITE : 0);
+ gup_flags);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c
index a8ff29821bdb..046ab116cc8c 100644
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -294,19 +294,6 @@ void __init mem_init(void)
mem_init_print_info(NULL);
}
-void free_initmem (void)
-{
- free_initmem_default(POISON_FREE_INITMEM);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
- "initrd");
-}
-#endif
-
void sparc_flush_page_to_ram(struct page *page)
{
unsigned long vaddr = (unsigned long)page_address(page);
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index bc2aaa47bc8a..4b099dd7a767 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2572,14 +2572,6 @@ void free_initmem(void)
}
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
- "initrd");
-}
-#endif
-
pgprot_t PAGE_KERNEL __read_mostly;
EXPORT_SYMBOL(PAGE_KERNEL);
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index 99aa11bf53d1..a9c9a94c096f 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -188,13 +188,6 @@ void free_initmem(void)
{
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-#endif
-
/* Allocate and free page tables. */
pgd_t *pgd_alloc(struct mm_struct *mm)
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index 2445dfcf6444..afe4949cfc2d 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -3,6 +3,7 @@ config UNICORE32
def_bool y
select ARCH_32BIT_OFF_T
select ARCH_HAS_DEVMEM_IS_ALLOWED
+ select ARCH_HAS_KEEPINITRD
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select HAVE_KERNEL_GZIP
diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c
index 74b6a2e29809..b4442f3060ce 100644
--- a/arch/unicore32/mm/init.c
+++ b/arch/unicore32/mm/init.c
@@ -287,27 +287,3 @@ void __init mem_init(void)
sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
}
}
-
-void free_initmem(void)
-{
- free_initmem_default(-1);
-}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-
-static int keep_initrd;
-
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
- if (!keep_initrd)
- free_reserved_area((void *)start, (void *)end, -1, "initrd");
-}
-
-static int __init keepinitrd_setup(char *__unused)
-{
- keep_initrd = 1;
- return 1;
-}
-
-__setup("keepinitrd", keepinitrd_setup);
-#endif
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e7212731cffb..818b361094ed 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -22,7 +22,7 @@ config X86_64
def_bool y
depends on 64BIT
# Options that are inherently 64-bit kernel only:
- select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
+ select ARCH_HAS_GIGANTIC_PAGE
select ARCH_SUPPORTS_INT128
select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_SOFT_DIRTY
@@ -47,7 +47,6 @@ config X86
select ARCH_32BIT_OFF_T if X86_32
select ARCH_CLOCKSOURCE_DATA
select ARCH_CLOCKSOURCE_INIT
- select ARCH_DISCARD_MEMBLOCK
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEVMEM_IS_ALLOWED
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
index 7469d321f072..f65cfb48cfdd 100644
--- a/arch/x86/include/asm/hugetlb.h
+++ b/arch/x86/include/asm/hugetlb.h
@@ -17,8 +17,4 @@ static inline void arch_clear_hugepage_flags(struct page *page)
{
}
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void) { return true; }
-#endif
-
#endif /* _ASM_X86_HUGETLB_H */
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bdca39829bc..08715034e315 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -140,7 +140,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
pt_element_t *table;
struct page *page;
- npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
+ npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page);
/* Check if the user is doing something meaningless. */
if (unlikely(npages != 1))
return -EFAULT;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 406b558abfef..6b92eaf4a3b1 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1805,7 +1805,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
return NULL;
/* Pin the user virtual address. */
- npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
+ npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
if (npinned != npages) {
pr_err("SEV: Failure locking %lu pages.\n", npages);
goto err;
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 92e4c4b85bba..fab095362c50 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -203,7 +203,7 @@ static __init int setup_hugepagesz(char *opt)
}
__setup("hugepagesz=", setup_hugepagesz);
-#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
+#ifdef CONFIG_CONTIG_ALLOC
static __init int gigantic_pages_init(void)
{
/* With compaction or CMA we can allocate gigantic pages at runtime */
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 85c94f9a87f8..075e568098f2 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -850,24 +850,25 @@ void __init mem_init(void)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_restrictions *restrictions)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ return __add_pages(nid, start_pfn, nr_pages, restrictions);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
+void arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
struct zone *zone;
zone = page_zone(pfn_to_page(start_pfn));
- return __remove_pages(zone, start_pfn, nr_pages, altmap);
+ __remove_pages(zone, start_pfn, nr_pages, altmap);
}
#endif
#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bccff68e3267..20d14254b686 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -777,11 +777,11 @@ static void update_end_of_memory_vars(u64 start, u64 size)
}
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap, bool want_memblock)
+ struct mhp_restrictions *restrictions)
{
int ret;
- ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
WARN_ON_ONCE(ret);
/* update max_pfn, max_low_pfn and high_memory */
@@ -791,15 +791,15 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
return ret;
}
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
- bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_restrictions *restrictions)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
init_memory_mapping(start, start + size);
- return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ return add_pages(nid, start_pfn, nr_pages, restrictions);
}
#define PAGE_INUSE 0xFD
@@ -1141,24 +1141,20 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end)
remove_pagetable(start, end, true, NULL);
}
-int __ref arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void __ref arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
struct page *page = pfn_to_page(start_pfn);
struct zone *zone;
- int ret;
/* With altmap the first mapped page is offset from @start */
if (altmap)
page += vmem_altmap_offset(altmap);
zone = page_zone(page);
- ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
- WARN_ON_ONCE(ret);
+ __remove_pages(zone, start_pfn, nr_pages, altmap);
kernel_physical_mapping_remove(start, start + size);
-
- return ret;
}
#endif
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c
index d49861099684..b51746f2b80b 100644
--- a/arch/xtensa/mm/init.c
+++ b/arch/xtensa/mm/init.c
@@ -216,11 +216,6 @@ void free_initrd_mem(unsigned long start, unsigned long end)
}
#endif
-void free_initmem(void)
-{
- free_initmem_default(-1);
-}
-
static void __init parse_memmap_one(char *p)
{
char *oldp;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index e49028a60429..f180427e48f4 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -231,13 +231,14 @@ static bool pages_correctly_probed(unsigned long start_pfn)
* OK to have direct references to sparsemem variables in here.
*/
static int
-memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
+memory_block_action(unsigned long start_section_nr, unsigned long action,
+ int online_type)
{
unsigned long start_pfn;
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
int ret;
- start_pfn = section_nr_to_pfn(phys_index);
+ start_pfn = section_nr_to_pfn(start_section_nr);
switch (action) {
case MEM_ONLINE:
@@ -251,7 +252,7 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t
break;
default:
WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
- "%ld\n", __func__, phys_index, action, action);
+ "%ld\n", __func__, start_section_nr, action, action);
ret = -EINVAL;
}
@@ -733,16 +734,18 @@ unregister_memory(struct memory_block *memory)
{
BUG_ON(memory->dev.bus != &memory_subsys);
- /* drop the ref. we got in remove_memory_section() */
+ /* drop the ref. we got via find_memory_block() */
put_device(&memory->dev);
device_unregister(&memory->dev);
}
-static int remove_memory_section(unsigned long node_id,
- struct mem_section *section, int phys_device)
+void unregister_memory_section(struct mem_section *section)
{
struct memory_block *mem;
+ if (WARN_ON_ONCE(!present_section(section)))
+ return;
+
mutex_lock(&mem_sysfs_mutex);
/*
@@ -763,15 +766,6 @@ static int remove_memory_section(unsigned long node_id,
out_unlock:
mutex_unlock(&mem_sysfs_mutex);
- return 0;
-}
-
-int unregister_memory_section(struct mem_section *section)
-{
- if (!present_section(section))
- return -EINVAL;
-
- return remove_memory_section(0, section, 0);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index e428468ab661..996d68ff992a 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -184,8 +184,7 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax,
*pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
- return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, *pfn,
- vmf->flags & FAULT_FLAG_WRITE);
+ return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE);
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
@@ -235,8 +234,7 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
*pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
- return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, *pfn,
- vmf->flags & FAULT_FLAG_WRITE);
+ return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE);
}
#else
static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax,
diff --git a/drivers/firewire/core-iso.c b/drivers/firewire/core-iso.c
index 35e784cffc23..5414eb1306aa 100644
--- a/drivers/firewire/core-iso.c
+++ b/drivers/firewire/core-iso.c
@@ -107,19 +107,8 @@ EXPORT_SYMBOL(fw_iso_buffer_init);
int fw_iso_buffer_map_vma(struct fw_iso_buffer *buffer,
struct vm_area_struct *vma)
{
- unsigned long uaddr;
- int i, err;
-
- uaddr = vma->vm_start;
- for (i = 0; i < buffer->page_count; i++) {
- err = vm_insert_page(vma, uaddr, buffer->pages[i]);
- if (err)
- return err;
-
- uaddr += PAGE_SIZE;
- }
-
- return 0;
+ return vm_map_pages_zero(vma, buffer->pages,
+ buffer->page_count);
}
void fw_iso_buffer_destroy(struct fw_iso_buffer *buffer,
diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c
index e18a786fc943..c438722bf4e1 100644
--- a/drivers/fpga/dfl-afu-dma-region.c
+++ b/drivers/fpga/dfl-afu-dma-region.c
@@ -102,7 +102,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata,
goto unlock_vm;
}
- pinned = get_user_pages_fast(region->user_addr, npages, 1,
+ pinned = get_user_pages_fast(region->user_addr, npages, FOLL_WRITE,
region->pages);
if (pinned < 0) {
ret = pinned;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index 3e6823fdd939..58ed401c5996 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -256,14 +256,14 @@ static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn,
/* TODO we should be able to split locking for interval tree and
* amdgpu_mn_invalidate_node
*/
- if (amdgpu_mn_read_lock(amn, range->blockable))
+ if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range)))
return -EAGAIN;
it = interval_tree_iter_first(&amn->objects, range->start, end);
while (it) {
struct amdgpu_mn_node *node;
- if (!range->blockable) {
+ if (!mmu_notifier_range_blockable(range)) {
amdgpu_mn_read_unlock(amn);
return -EAGAIN;
}
@@ -299,7 +299,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
/* notification is exclusive, but interval is inclusive */
end = range->end - 1;
- if (amdgpu_mn_read_lock(amn, range->blockable))
+ if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range)))
return -EAGAIN;
it = interval_tree_iter_first(&amn->objects, range->start, end);
@@ -307,7 +307,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn,
struct amdgpu_mn_node *node;
struct amdgpu_bo *bo;
- if (!range->blockable) {
+ if (!mmu_notifier_range_blockable(range)) {
amdgpu_mn_read_unlock(amn);
return -EAGAIN;
}
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 215bf3fef10c..8079ea3af103 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -122,7 +122,7 @@ userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
while (it) {
struct drm_i915_gem_object *obj;
- if (!range->blockable) {
+ if (!mmu_notifier_range_blockable(range)) {
ret = -EAGAIN;
break;
}
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c
index b3019505065a..c9bd1278f573 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -133,7 +133,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
/* TODO we should be able to split locking for interval tree and
* the tear down.
*/
- if (range->blockable)
+ if (mmu_notifier_range_blockable(range))
mutex_lock(&rmn->lock);
else if (!mutex_trylock(&rmn->lock))
return -EAGAIN;
@@ -144,7 +144,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn,
struct radeon_bo *bo;
long r;
- if (!range->blockable) {
+ if (!mmu_notifier_range_blockable(range)) {
ret = -EAGAIN;
goto out_unlock;
}
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
index a8db758d523e..a2ebb08990e9 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c
@@ -221,26 +221,13 @@ static int rockchip_drm_gem_object_mmap_iommu(struct drm_gem_object *obj,
struct vm_area_struct *vma)
{
struct rockchip_gem_object *rk_obj = to_rockchip_obj(obj);
- unsigned int i, count = obj->size >> PAGE_SHIFT;
+ unsigned int count = obj->size >> PAGE_SHIFT;
unsigned long user_count = vma_pages(vma);
- unsigned long uaddr = vma->vm_start;
- unsigned long offset = vma->vm_pgoff;
- unsigned long end = user_count + offset;
- int ret;
if (user_count == 0)
return -ENXIO;
- if (end > count)
- return -ENXIO;
- for (i = offset; i < end; i++) {
- ret = vm_insert_page(vma, uaddr, rk_obj->pages[i]);
- if (ret)
- return ret;
- uaddr += PAGE_SIZE;
- }
-
- return 0;
+ return vm_map_pages(vma, rk_obj->pages, count);
}
static int rockchip_drm_gem_object_mmap_dma(struct drm_gem_object *obj,
diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c
index 8bf3a7c23ed3..062067438f1d 100644
--- a/drivers/gpu/drm/via/via_dmablit.c
+++ b/drivers/gpu/drm/via/via_dmablit.c
@@ -243,7 +243,8 @@ via_lock_all_dma_pages(drm_via_sg_info_t *vsg, drm_via_dmablit_t *xfer)
if (NULL == vsg->pages)
return -ENOMEM;
ret = get_user_pages_fast((unsigned long)xfer->mem_addr,
- vsg->num_pages, vsg->direction == DMA_FROM_DEVICE,
+ vsg->num_pages,
+ vsg->direction == DMA_FROM_DEVICE ? FOLL_WRITE : 0,
vsg->pages);
if (ret != vsg->num_pages) {
if (ret < 0)
diff --git a/drivers/gpu/drm/xen/xen_drm_front_gem.c b/drivers/gpu/drm/xen/xen_drm_front_gem.c
index 53c376d55fcf..a24548489dde 100644
--- a/drivers/gpu/drm/xen/xen_drm_front_gem.c
+++ b/drivers/gpu/drm/xen/xen_drm_front_gem.c
@@ -224,8 +224,7 @@ xen_drm_front_gem_import_sg_table(struct drm_device *dev,
static int gem_mmap_obj(struct xen_gem_object *xen_obj,
struct vm_area_struct *vma)
{
- unsigned long addr = vma->vm_start;
- int i;
+ int ret;
/*
* clear the VM_PFNMAP flag that was set by drm_gem_mmap(), and set the
@@ -252,18 +251,11 @@ static int gem_mmap_obj(struct xen_gem_object *xen_obj,
* FIXME: as we insert all the pages now then no .fault handler must
* be called, so don't provide one
*/
- for (i = 0; i < xen_obj->num_pages; i++) {
- int ret;
-
- ret = vm_insert_page(vma, addr, xen_obj->pages[i]);
- if (ret < 0) {
- DRM_ERROR("Failed to insert pages into vma: %d\n", ret);
- return ret;
- }
+ ret = vm_map_pages(vma, xen_obj->pages, xen_obj->num_pages);
+ if (ret < 0)
+ DRM_ERROR("Failed to map pages into vma: %d\n", ret);
- addr += PAGE_SIZE;
- }
- return 0;
+ return ret;
}
int xen_drm_front_gem_mmap(struct file *filp, struct vm_area_struct *vma)
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 0a23048db523..e7ea819fcb11 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -295,10 +295,11 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
while (npages) {
down_read(&mm->mmap_sem);
- ret = get_user_pages_longterm(cur_base,
+ ret = get_user_pages(cur_base,
min_t(unsigned long, npages,
PAGE_SIZE / sizeof (struct page *)),
- gup_flags, page_list, NULL);
+ gup_flags | FOLL_LONGTERM,
+ page_list, NULL);
if (ret < 0) {
up_read(&mm->mmap_sem);
goto umem_release;
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index c7226cf52acc..f962b5bbfa40 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -152,7 +152,7 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
struct ib_ucontext_per_mm *per_mm =
container_of(mn, struct ib_ucontext_per_mm, mn);
- if (range->blockable)
+ if (mmu_notifier_range_blockable(range))
down_read(&per_mm->umem_rwsem);
else if (!down_read_trylock(&per_mm->umem_rwsem))
return -EAGAIN;
@@ -170,7 +170,8 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
range->end,
invalidate_range_start_trampoline,
- range->blockable, NULL);
+ mmu_notifier_range_blockable(range),
+ NULL);
}
static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
index 24b592c6522e..02eee8eff1db 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -104,8 +104,9 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np
bool writable, struct page **pages)
{
int ret;
+ unsigned int gup_flags = FOLL_LONGTERM | (writable ? FOLL_WRITE : 0);
- ret = get_user_pages_fast(vaddr, npages, writable, pages);
+ ret = get_user_pages_fast(vaddr, npages, gup_flags, pages);
if (ret < 0)
return ret;
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
index 112d2f38e0de..8ff0e90d7564 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -472,7 +472,8 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
goto out;
}
- ret = get_user_pages_fast(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages);
+ ret = get_user_pages_fast(uaddr & PAGE_MASK, 1,
+ FOLL_WRITE | FOLL_LONGTERM, pages);
if (ret < 0)
goto out;
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
index 123ca8f64f75..f712fb7fa82f 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -114,10 +114,10 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages,
down_read(&current->mm->mmap_sem);
for (got = 0; got < num_pages; got += ret) {
- ret = get_user_pages_longterm(start_page + got * PAGE_SIZE,
- num_pages - got,
- FOLL_WRITE | FOLL_FORCE,
- p + got, NULL);
+ ret = get_user_pages(start_page + got * PAGE_SIZE,
+ num_pages - got,
+ FOLL_LONGTERM | FOLL_WRITE | FOLL_FORCE,
+ p + got, NULL);
if (ret < 0) {
up_read(&current->mm->mmap_sem);
goto bail_release;
diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c
index ef19d39a44b1..0c204776263f 100644
--- a/drivers/infiniband/hw/qib/qib_user_sdma.c
+++ b/drivers/infiniband/hw/qib/qib_user_sdma.c
@@ -670,7 +670,7 @@ static int qib_user_sdma_pin_pages(const struct qib_devdata *dd,
else
j = npages;
- ret = get_user_pages_fast(addr, j, 0, pages);
+ ret = get_user_pages_fast(addr, j, FOLL_LONGTERM, pages);
if (ret != j) {
i = 0;
j = ret;
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index da35d6fdfc5e..e312f522a66d 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -143,10 +143,11 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
ret = 0;
while (npages) {
- ret = get_user_pages_longterm(cur_base,
- min_t(unsigned long, npages,
- PAGE_SIZE / sizeof(struct page *)),
- gup_flags, page_list, NULL);
+ ret = get_user_pages(cur_base,
+ min_t(unsigned long, npages,
+ PAGE_SIZE / sizeof(struct page *)),
+ gup_flags | FOLL_LONGTERM,
+ page_list, NULL);
if (ret < 0)
goto out;
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 77aabe637a60..20abd19bbfbe 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -619,17 +619,7 @@ out_free_pages:
int iommu_dma_mmap(struct page **pages, size_t size, struct vm_area_struct *vma)
{
- unsigned long uaddr = vma->vm_start;
- unsigned int i, count = PAGE_ALIGN(size) >> PAGE_SHIFT;
- int ret = -ENXIO;
-
- for (i = vma->vm_pgoff; i < count && uaddr < vma->vm_end; i++) {
- ret = vm_insert_page(vma, uaddr, pages[i]);
- if (ret)
- break;
- uaddr += PAGE_SIZE;
- }
- return ret;
+ return vm_map_pages(vma, pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
}
static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,
diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c
index 7ebd58a1c431..3cf25abf5807 100644
--- a/drivers/media/common/videobuf2/videobuf2-core.c
+++ b/drivers/media/common/videobuf2/videobuf2-core.c
@@ -2201,6 +2201,13 @@ int vb2_mmap(struct vb2_queue *q, struct vm_area_struct *vma)
goto unlock;
}
+ /*
+ * vm_pgoff is treated in V4L2 API as a 'cookie' to select a buffer,
+ * not as a in-buffer offset. We always want to mmap a whole buffer
+ * from its beginning.
+ */
+ vma->vm_pgoff = 0;
+
ret = call_memop(vb, mmap, vb->planes[plane].mem_priv, vma);
unlock:
diff --git a/drivers/media/common/videobuf2/videobuf2-dma-contig.c b/drivers/media/common/videobuf2/videobuf2-dma-contig.c
index 82389aead6ed..ecbef266130b 100644
--- a/drivers/media/common/videobuf2/videobuf2-dma-contig.c
+++ b/drivers/media/common/videobuf2/videobuf2-dma-contig.c
@@ -186,12 +186,6 @@ static int vb2_dc_mmap(void *buf_priv, struct vm_area_struct *vma)
return -EINVAL;
}
- /*
- * dma_mmap_* uses vm_pgoff as in-buffer offset, but we want to
- * map whole buffer
- */
- vma->vm_pgoff = 0;
-
ret = dma_mmap_attrs(buf->dev, vma, buf->cookie,
buf->dma_addr, buf->size, buf->attrs);
diff --git a/drivers/media/common/videobuf2/videobuf2-dma-sg.c b/drivers/media/common/videobuf2/videobuf2-dma-sg.c
index 270c3162fdcb..4a4c49d6085c 100644
--- a/drivers/media/common/videobuf2/videobuf2-dma-sg.c
+++ b/drivers/media/common/videobuf2/videobuf2-dma-sg.c
@@ -328,28 +328,18 @@ static unsigned int vb2_dma_sg_num_users(void *buf_priv)
static int vb2_dma_sg_mmap(void *buf_priv, struct vm_area_struct *vma)
{
struct vb2_dma_sg_buf *buf = buf_priv;
- unsigned long uaddr = vma->vm_start;
- unsigned long usize = vma->vm_end - vma->vm_start;
- int i = 0;
+ int err;
if (!buf) {
printk(KERN_ERR "No memory to map\n");
return -EINVAL;
}
- do {
- int ret;
-
- ret = vm_insert_page(vma, uaddr, buf->pages[i++]);
- if (ret) {
- printk(KERN_ERR "Remapping memory, error: %d\n", ret);
- return ret;
- }
-
- uaddr += PAGE_SIZE;
- usize -= PAGE_SIZE;
- } while (usize > 0);
-
+ err = vm_map_pages(vma, buf->pages, buf->num_pages);
+ if (err) {
+ printk(KERN_ERR "Remapping memory, error: %d\n", err);
+ return err;
+ }
/*
* Use common vm_area operations to track buffer refcount.
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c
index 08929c087e27..870a2a526e0b 100644
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -186,12 +186,12 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma,
dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n",
data, size, dma->nr_pages);
- err = get_user_pages_longterm(data & PAGE_MASK, dma->nr_pages,
- flags, dma->pages, NULL);
+ err = get_user_pages(data & PAGE_MASK, dma->nr_pages,
+ flags | FOLL_LONGTERM, dma->pages, NULL);
if (err != dma->nr_pages) {
dma->nr_pages = (err >= 0) ? err : 0;
- dprintk(1, "get_user_pages_longterm: err=%d [%d]\n", err,
+ dprintk(1, "get_user_pages: err=%d [%d]\n", err,
dma->nr_pages);
return err < 0 ? err : -EINVAL;
}
diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c
index 25265fd0fd6e..89cff9d1012b 100644
--- a/drivers/misc/genwqe/card_utils.c
+++ b/drivers/misc/genwqe/card_utils.c
@@ -603,7 +603,7 @@ int genwqe_user_vmap(struct genwqe_dev *cd, struct dma_mapping *m, void *uaddr,
/* pin user pages in memory */
rc = get_user_pages_fast(data & PAGE_MASK, /* page aligned addr */
m->nr_pages,
- m->write, /* readable/writable */
+ m->write ? FOLL_WRITE : 0, /* readable/writable */
m->page_list); /* ptrs to pages */
if (rc < 0)
goto fail_get_user_pages;
diff --git a/drivers/misc/vmw_vmci/vmci_host.c b/drivers/misc/vmw_vmci/vmci_host.c
index 997f92543dd4..422d08da3244 100644
--- a/drivers/misc/vmw_vmci/vmci_host.c
+++ b/drivers/misc/vmw_vmci/vmci_host.c
@@ -242,7 +242,7 @@ static int vmci_host_setup_notify(struct vmci_ctx *context,
/*
* Lock physical page backing a given user VA.
*/
- retval = get_user_pages_fast(uva, 1, 1, &context->notify_page);
+ retval = get_user_pages_fast(uva, 1, FOLL_WRITE, &context->notify_page);
if (retval != 1) {
context->notify_page = NULL;
return VMCI_ERROR_GENERIC;
diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c
index f5f1aac9d163..1174735f003d 100644
--- a/drivers/misc/vmw_vmci/vmci_queue_pair.c
+++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c
@@ -659,7 +659,8 @@ static int qp_host_get_user_memory(u64 produce_uva,
int err = VMCI_SUCCESS;
retval = get_user_pages_fast((uintptr_t) produce_uva,
- produce_q->kernel_if->num_pages, 1,
+ produce_q->kernel_if->num_pages,
+ FOLL_WRITE,
produce_q->kernel_if->u.h.header_page);
if (retval < (int)produce_q->kernel_if->num_pages) {
pr_debug("get_user_pages_fast(produce) failed (retval=%d)",
@@ -671,7 +672,8 @@ static int qp_host_get_user_memory(u64 produce_uva,
}
retval = get_user_pages_fast((uintptr_t) consume_uva,
- consume_q->kernel_if->num_pages, 1,
+ consume_q->kernel_if->num_pages,
+ FOLL_WRITE,
consume_q->kernel_if->u.h.header_page);
if (retval < (int)consume_q->kernel_if->num_pages) {
pr_debug("get_user_pages_fast(consume) failed (retval=%d)",
diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c
index 321bc673c417..cef0133aa47a 100644
--- a/drivers/platform/goldfish/goldfish_pipe.c
+++ b/drivers/platform/goldfish/goldfish_pipe.c
@@ -274,7 +274,8 @@ static int pin_user_pages(unsigned long first_page,
*iter_last_page_size = last_page_size;
}
- ret = get_user_pages_fast(first_page, requested_pages, !is_write,
+ ret = get_user_pages_fast(first_page, requested_pages,
+ !is_write ? FOLL_WRITE : 0,
pages);
if (ret <= 0)
return -EFAULT;
diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c
index 1e1f42e210a0..4a4a75fa26d5 100644
--- a/drivers/rapidio/devices/rio_mport_cdev.c
+++ b/drivers/rapidio/devices/rio_mport_cdev.c
@@ -868,7 +868,9 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode,
pinned = get_user_pages_fast(
(unsigned long)xfer->loc_addr & PAGE_MASK,
- nr_pages, dir == DMA_FROM_DEVICE, page_list);
+ nr_pages,
+ dir == DMA_FROM_DEVICE ? FOLL_WRITE : 0,
+ page_list);
if (pinned != nr_pages) {
if (pinned < 0) {
diff --git a/drivers/sbus/char/oradax.c b/drivers/sbus/char/oradax.c
index acd9ba40eabe..8090dc9a1514 100644
--- a/drivers/sbus/char/oradax.c
+++ b/drivers/sbus/char/oradax.c
@@ -437,7 +437,7 @@ static int dax_lock_page(void *va, struct page **p)
dax_dbg("uva %p", va);
- ret = get_user_pages_fast((unsigned long)va, 1, 1, p);
+ ret = get_user_pages_fast((unsigned long)va, 1, FOLL_WRITE, p);
if (ret == 1) {
dax_dbg("locked page %p, for VA %p", *p, va);
return 0;
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 19c022e66d63..3c6a18ad9a87 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -4922,7 +4922,8 @@ static int sgl_map_user_pages(struct st_buffer *STbp,
/* Try to fault in all of the necessary pages */
/* rw==READ means read from drive, write into memory area */
- res = get_user_pages_fast(uaddr, nr_pages, rw == READ, pages);
+ res = get_user_pages_fast(uaddr, nr_pages, rw == READ ? FOLL_WRITE : 0,
+ pages);
/* Errors and no page mapped should return here */
if (res < nr_pages)
diff --git a/drivers/staging/gasket/gasket_page_table.c b/drivers/staging/gasket/gasket_page_table.c
index 600928f63577..d35c4fb19e28 100644
--- a/drivers/staging/gasket/gasket_page_table.c
+++ b/drivers/staging/gasket/gasket_page_table.c
@@ -486,8 +486,8 @@ static int gasket_perform_mapping(struct gasket_page_table *pg_tbl,
ptes[i].dma_addr = pg_tbl->coherent_pages[0].paddr +
off + i * PAGE_SIZE;
} else {
- ret = get_user_pages_fast(page_addr - offset, 1, 1,
- &page);
+ ret = get_user_pages_fast(page_addr - offset, 1,
+ FOLL_WRITE, &page);
if (ret <= 0) {
dev_err(pg_tbl->device,
diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
index 0b9ab1d0dd45..49fd7312e2aa 100644
--- a/drivers/tee/tee_shm.c
+++ b/drivers/tee/tee_shm.c
@@ -273,7 +273,7 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr,
goto err;
}
- rc = get_user_pages_fast(start, num_pages, 1, shm->pages);
+ rc = get_user_pages_fast(start, num_pages, FOLL_WRITE, shm->pages);
if (rc > 0)
shm->num_pages = rc;
if (rc != num_pages) {
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 6b64e45a5269..40ddc0c5f677 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -532,7 +532,8 @@ static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
enum dma_data_direction direction = iommu_tce_direction(tce);
if (get_user_pages_fast(tce & PAGE_MASK, 1,
- direction != DMA_TO_DEVICE, &page) != 1)
+ direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
+ &page) != 1)
return -EFAULT;
*hpa = __pa((unsigned long) page_address(page));
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 3be1db3501cc..3ddc375e7063 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -358,7 +358,8 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
down_read(&mm->mmap_sem);
if (mm == current->mm) {
- ret = get_user_pages_longterm(vaddr, 1, flags, page, vmas);
+ ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page,
+ vmas);
} else {
ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page,
vmas, NULL);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 351af88231ad..1e3ed41ae1f3 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1704,7 +1704,7 @@ static int set_bit_to_user(int nr, void __user *addr)
int bit = nr + (log % PAGE_SIZE) * 8;
int r;
- r = get_user_pages_fast(log, 1, 1, &page);
+ r = get_user_pages_fast(log, 1, FOLL_WRITE, &page);
if (r < 0)
return r;
BUG_ON(r != 1);
diff --git a/drivers/video/fbdev/pvr2fb.c b/drivers/video/fbdev/pvr2fb.c
index dfed532ed606..4e4d6a0df978 100644
--- a/drivers/video/fbdev/pvr2fb.c
+++ b/drivers/video/fbdev/pvr2fb.c
@@ -686,7 +686,7 @@ static ssize_t pvr2fb_write(struct fb_info *info, const char *buf,
if (!pages)
return -ENOMEM;
- ret = get_user_pages_fast((unsigned long)buf, nr_pages, true, pages);
+ ret = get_user_pages_fast((unsigned long)buf, nr_pages, FOLL_WRITE, pages);
if (ret < nr_pages) {
nr_pages = ret;
ret = -EINVAL;
diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c
index 8ba726e600e9..6446bcab4185 100644
--- a/drivers/virt/fsl_hypervisor.c
+++ b/drivers/virt/fsl_hypervisor.c
@@ -244,7 +244,7 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p)
/* Get the physical addresses of the source buffer */
num_pinned = get_user_pages_fast(param.local_vaddr - lb_offset,
- num_pages, param.source != -1, pages);
+ num_pages, param.source != -1 ? FOLL_WRITE : 0, pages);
if (num_pinned != num_pages) {
/* get_user_pages() failed */
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 7cf9c51318aa..469dfbd6cf90 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -526,20 +526,20 @@ static int mn_invl_range_start(struct mmu_notifier *mn,
struct gntdev_grant_map *map;
int ret = 0;
- if (range->blockable)
+ if (mmu_notifier_range_blockable(range))
mutex_lock(&priv->lock);
else if (!mutex_trylock(&priv->lock))
return -EAGAIN;
list_for_each_entry(map, &priv->maps, next) {
ret = unmap_if_in_range(map, range->start, range->end,
- range->blockable);
+ mmu_notifier_range_blockable(range));
if (ret)
goto out_unlock;
}
list_for_each_entry(map, &priv->freeable_maps, next) {
ret = unmap_if_in_range(map, range->start, range->end,
- range->blockable);
+ mmu_notifier_range_blockable(range));
if (ret)
goto out_unlock;
}
@@ -852,7 +852,7 @@ static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt,
unsigned long xen_pfn;
int ret;
- ret = get_user_pages_fast(addr, 1, writeable, &page);
+ ret = get_user_pages_fast(addr, 1, writeable ? FOLL_WRITE : 0, &page);
if (ret < 0)
return ret;
@@ -1084,7 +1084,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
int index = vma->vm_pgoff;
int count = vma_pages(vma);
struct gntdev_grant_map *map;
- int i, err = -EINVAL;
+ int err = -EINVAL;
if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
return -EINVAL;
@@ -1145,12 +1145,9 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma)
goto out_put_map;
if (!use_ptemod) {
- for (i = 0; i < count; i++) {
- err = vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE,
- map->pages[i]);
- if (err)
- goto out_put_map;
- }
+ err = vm_map_pages(vma, map->pages, map->count);
+ if (err)
+ goto out_put_map;
} else {
#ifdef CONFIG_X86
/*
diff --git a/drivers/xen/privcmd-buf.c b/drivers/xen/privcmd-buf.c
index a1c61e351d3f..dd5bbb6e1b6b 100644
--- a/drivers/xen/privcmd-buf.c
+++ b/drivers/xen/privcmd-buf.c
@@ -165,12 +165,8 @@ static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma)
if (vma_priv->n_pages != count)
ret = -ENOMEM;
else
- for (i = 0; i < vma_priv->n_pages; i++) {
- ret = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE,
- vma_priv->pages[i]);
- if (ret)
- break;
- }
+ ret = vm_map_pages_zero(vma, vma_priv->pages,
+ vma_priv->n_pages);
if (ret)
privcmd_buf_vmapriv_free(vma_priv);
diff --git a/fs/dax.c b/fs/dax.c
index e5e54da1715f..f74386293632 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -814,7 +814,7 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
goto unlock_pmd;
flush_cache_page(vma, address, pfn);
- pmd = pmdp_huge_clear_flush(vma, address, pmdp);
+ pmd = pmdp_invalidate(vma, address, pmdp);
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
set_pmd_at(vma->vm_mm, address, pmdp, pmd);
@@ -1575,8 +1575,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
}
trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
- result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
- write);
+ result = vmf_insert_pfn_pmd(vmf, pfn, write);
break;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
@@ -1686,8 +1685,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
#ifdef CONFIG_FS_DAX_PMD
else if (order == PMD_ORDER)
- ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
- pfn, true);
+ ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
#endif
else
ret = VM_FAULT_FALLBACK;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c74ef4426282..1dcc57189382 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -440,9 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
u32 hash;
index = page->index;
- hash = hugetlb_fault_mutex_hash(h, current->mm,
- &pseudo_vma,
- mapping, index, 0);
+ hash = hugetlb_fault_mutex_hash(h, mapping, index, 0);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/*
@@ -499,8 +497,15 @@ static void hugetlbfs_evict_inode(struct inode *inode)
struct resv_map *resv_map;
remove_inode_hugepages(inode, 0, LLONG_MAX);
- resv_map = (struct resv_map *)inode->i_mapping->private_data;
- /* root inode doesn't have the resv_map, so we should check it */
+
+ /*
+ * Get the resv_map from the address space embedded in the inode.
+ * This is the address space which points to any resv_map allocated
+ * at inode creation time. If this is a device special inode,
+ * i_mapping may not point to the original address space.
+ */
+ resv_map = (struct resv_map *)(&inode->i_data)->private_data;
+ /* Only regular and link inodes have associated reserve maps */
if (resv_map)
resv_map_release(&resv_map->refs);
clear_inode(inode);
@@ -639,8 +644,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
addr = index * hpage_size;
/* mutex taken here, fault path and hole punch */
- hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
- index, addr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, index, addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/* See if already present in mapping to avoid alloc/free */
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 48ea3977012a..fdc18321d70c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2697,8 +2697,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
ret = 0;
down_read(&current->mm->mmap_sem);
- pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE,
- pages, vmas);
+ pret = get_user_pages(ubuf, nr_pages,
+ FOLL_WRITE | FOLL_LONGTERM,
+ pages, vmas);
if (pret == nr_pages) {
/* don't support file backed memory */
for (j = 0; j < nr_pages; j++) {
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c121abbdfc7d..85f21caaa6ec 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -69,10 +69,6 @@
#define NAMEI_RA_BLOCKS 4
#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-static unsigned char ocfs2_filetype_table[] = {
- DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-};
-
static int ocfs2_do_extend_dir(struct super_block *sb,
handle_t *handle,
struct inode *dir,
@@ -1718,7 +1714,7 @@ int __ocfs2_add_entry(handle_t *handle,
de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
de = de1;
}
- de->file_type = OCFS2_FT_UNKNOWN;
+ de->file_type = FT_UNKNOWN;
if (blkno) {
de->inode = cpu_to_le64(blkno);
ocfs2_set_de_type(de, inode->i_mode);
@@ -1803,13 +1799,9 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
}
offset += le16_to_cpu(de->rec_len);
if (le64_to_cpu(de->inode)) {
- unsigned char d_type = DT_UNKNOWN;
-
- if (de->file_type < OCFS2_FT_MAX)
- d_type = ocfs2_filetype_table[de->file_type];
-
if (!dir_emit(ctx, de->name, de->name_len,
- le64_to_cpu(de->inode), d_type))
+ le64_to_cpu(de->inode),
+ fs_ftype_to_dtype(de->file_type)))
goto out;
}
ctx->pos += le16_to_cpu(de->rec_len);
@@ -1900,14 +1892,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
break;
}
if (le64_to_cpu(de->inode)) {
- unsigned char d_type = DT_UNKNOWN;
-
- if (de->file_type < OCFS2_FT_MAX)
- d_type = ocfs2_filetype_table[de->file_type];
if (!dir_emit(ctx, de->name,
de->name_len,
le64_to_cpu(de->inode),
- d_type)) {
+ fs_ftype_to_dtype(de->file_type))) {
brelse(bh);
return 0;
}
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 4bf8d5854b27..af2888d23de3 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -148,16 +148,24 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
u64 blkno;
struct dentry *parent;
struct inode *dir = d_inode(child);
+ int set;
trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name,
(unsigned long long)OCFS2_I(dir)->ip_blkno);
+ status = ocfs2_nfs_sync_lock(OCFS2_SB(dir->i_sb), 1);
+ if (status < 0) {
+ mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status);
+ parent = ERR_PTR(status);
+ goto bail;
+ }
+
status = ocfs2_inode_lock(dir, NULL, 0);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
parent = ERR_PTR(status);
- goto bail;
+ goto unlock_nfs_sync;
}
status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno);
@@ -166,11 +174,31 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
goto bail_unlock;
}
+ status = ocfs2_test_inode_bit(OCFS2_SB(dir->i_sb), blkno, &set);
+ if (status < 0) {
+ if (status == -EINVAL) {
+ status = -ESTALE;
+ } else
+ mlog(ML_ERROR, "test inode bit failed %d\n", status);
+ parent = ERR_PTR(status);
+ goto bail_unlock;
+ }
+
+ trace_ocfs2_get_dentry_test_bit(status, set);
+ if (!set) {
+ status = -ESTALE;
+ parent = ERR_PTR(status);
+ goto bail_unlock;
+ }
+
parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0));
bail_unlock:
ocfs2_inode_unlock(dir, 0);
+unlock_nfs_sync:
+ ocfs2_nfs_sync_unlock(OCFS2_SB(dir->i_sb), 1);
+
bail:
trace_ocfs2_get_parent_end(parent);
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 7071ad0dec90..b86bf5e74348 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -392,21 +392,6 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
#define OCFS2_HB_GLOBAL "heartbeat=global"
/*
- * OCFS2 directory file types. Only the low 3 bits are used. The
- * other bits are reserved for now.
- */
-#define OCFS2_FT_UNKNOWN 0
-#define OCFS2_FT_REG_FILE 1
-#define OCFS2_FT_DIR 2
-#define OCFS2_FT_CHRDEV 3
-#define OCFS2_FT_BLKDEV 4
-#define OCFS2_FT_FIFO 5
-#define OCFS2_FT_SOCK 6
-#define OCFS2_FT_SYMLINK 7
-
-#define OCFS2_FT_MAX 8
-
-/*
* OCFS2_DIR_PAD defines the directory entries boundaries
*
* NOTE: It must be a multiple of 4
@@ -424,17 +409,6 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
#define OCFS2_LINKS_HI_SHIFT 16
#define OCFS2_DX_ENTRIES_MAX (0xffffffffU)
-#define S_SHIFT 12
-static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
- [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE,
- [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR,
- [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV,
- [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV,
- [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO,
- [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK,
- [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK,
-};
-
/*
* Convenience casts
@@ -1629,7 +1603,7 @@ static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
umode_t mode)
{
- de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ de->file_type = fs_umode_to_ftype(mode);
}
static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd)
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index d4811f981608..2bb916d68576 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -269,7 +269,7 @@ orangefs_bufmap_map(struct orangefs_bufmap *bufmap,
/* map the pages */
ret = get_user_pages_fast((unsigned long)user_desc->ptr,
- bufmap->page_count, 1, bufmap->page_array);
+ bufmap->page_count, FOLL_WRITE, bufmap->page_array);
if (ret < 0)
return ret;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 95ca1fe7283c..01d4eb0e6bd1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1169,7 +1169,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
break;
}
- mmu_notifier_range_init(&range, mm, 0, -1UL);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
+ 0, NULL, mm, 0, -1UL);
mmu_notifier_invalidate_range_start(&range);
}
walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
diff --git a/fs/sync.c b/fs/sync.c
index 01e82170545a..4d1ff010bc5a 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -292,8 +292,14 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
}
if (flags & SYNC_FILE_RANGE_WRITE) {
+ int sync_mode = WB_SYNC_NONE;
+
+ if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) ==
+ SYNC_FILE_RANGE_WRITE_AND_WAIT)
+ sync_mode = WB_SYNC_ALL;
+
ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
- WB_SYNC_NONE);
+ sync_mode);
if (ret < 0)
goto out;
}
@@ -306,9 +312,9 @@ out:
}
/*
- * sys_sync_file_range() permits finely controlled syncing over a segment of
+ * ksys_sync_file_range() permits finely controlled syncing over a segment of
* a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
- * zero then sys_sync_file_range() will operate from offset out to EOF.
+ * zero then ksys_sync_file_range() will operate from offset out to EOF.
*
* The flag bits are:
*
@@ -325,7 +331,7 @@ out:
* Useful combinations of the flag bits are:
*
* SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages
- * in the range which were dirty on entry to sys_sync_file_range() are placed
+ * in the range which were dirty on entry to ksys_sync_file_range() are placed
* under writeout. This is a start-write-for-data-integrity operation.
*
* SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which
@@ -337,10 +343,13 @@ out:
* earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait
* for that operation to complete and to return the result.
*
- * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER:
+ * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER
+ * (a.k.a. SYNC_FILE_RANGE_WRITE_AND_WAIT):
* a traditional sync() operation. This is a write-for-data-integrity operation
* which will ensure that all pages in the range which were dirty on entry to
- * sys_sync_file_range() are committed to disk.
+ * ksys_sync_file_range() are written to disk. It should be noted that disk
+ * caches are not flushed by this call, so there are no guarantees here that the
+ * data will be available on disk after a crash.
*
*
* SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index f5de1e726356..3b30301c90ec 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -30,6 +30,8 @@
#include <linux/security.h>
#include <linux/hugetlb.h>
+int sysctl_unprivileged_userfaultfd __read_mostly = 1;
+
static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
enum userfaultfd_state {
@@ -1930,6 +1932,9 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
struct userfaultfd_ctx *ctx;
int fd;
+ if (!sysctl_unprivileged_userfaultfd && !capable(CAP_SYS_PTRACE))
+ return -EPERM;
+
BUG_ON(!current->mm);
/* Check the UFFD_* constants for consistency. */
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index 71d7b77eea50..822f433ac95c 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -126,4 +126,11 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
}
#endif
+#ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED
+static inline bool gigantic_page_runtime_supported(void)
+{
+ return IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE);
+}
+#endif /* __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED */
+
#endif /* _ASM_GENERIC_HUGETLB_H */
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index f111c780ef1d..f31521dcb09a 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -151,21 +151,6 @@ static inline void balloon_page_delete(struct page *page)
list_del(&page->lru);
}
-static inline bool __is_movable_balloon_page(struct page *page)
-{
- return false;
-}
-
-static inline bool balloon_page_movable(struct page *page)
-{
- return false;
-}
-
-static inline bool isolated_balloon_page(struct page *page)
-{
- return false;
-}
-
static inline bool balloon_page_isolate(struct page *page)
{
return false;
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index fdab7de7490d..fb07b503dc45 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -585,12 +585,12 @@ static inline bool pm_suspended_storage(void)
}
#endif /* CONFIG_PM_SLEEP */
-#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
+#ifdef CONFIG_CONTIG_ALLOC
/* The below functions must be run on a range from a single zone. */
extern int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype, gfp_t gfp_mask);
-extern void free_contig_range(unsigned long pfn, unsigned nr_pages);
#endif
+void free_contig_range(unsigned long pfn, unsigned int nr_pages);
#ifdef CONFIG_CMA
/* CMA stuff */
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index ad50b7b4f141..51ec27a84668 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -77,8 +77,34 @@
#include <linux/migrate.h>
#include <linux/memremap.h>
#include <linux/completion.h>
+#include <linux/mmu_notifier.h>
-struct hmm;
+
+/*
+ * struct hmm - HMM per mm struct
+ *
+ * @mm: mm struct this HMM struct is bound to
+ * @lock: lock protecting ranges list
+ * @ranges: list of range being snapshotted
+ * @mirrors: list of mirrors for this mm
+ * @mmu_notifier: mmu notifier to track updates to CPU page table
+ * @mirrors_sem: read/write semaphore protecting the mirrors list
+ * @wq: wait queue for user waiting on a range invalidation
+ * @notifiers: count of active mmu notifiers
+ * @dead: is the mm dead ?
+ */
+struct hmm {
+ struct mm_struct *mm;
+ struct kref kref;
+ struct mutex lock;
+ struct list_head ranges;
+ struct list_head mirrors;
+ struct mmu_notifier mmu_notifier;
+ struct rw_semaphore mirrors_sem;
+ wait_queue_head_t wq;
+ long notifiers;
+ bool dead;
+};
/*
* hmm_pfn_flag_e - HMM flag enums
@@ -131,6 +157,7 @@ enum hmm_pfn_value_e {
/*
* struct hmm_range - track invalidation lock on virtual address range
*
+ * @hmm: the core HMM structure this range is active against
* @vma: the vm area struct for the range
* @list: all range lock are on a list
* @start: range virtual start address (inclusive)
@@ -138,10 +165,13 @@ enum hmm_pfn_value_e {
* @pfns: array of pfns (big enough for the range)
* @flags: pfn flags to match device driver page table
* @values: pfn value for some special case (none, special, error, ...)
+ * @default_flags: default flags for the range (write, read, ... see hmm doc)
+ * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
* @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)
* @valid: pfns array did not change since it has been fill by an HMM function
*/
struct hmm_range {
+ struct hmm *hmm;
struct vm_area_struct *vma;
struct list_head list;
unsigned long start;
@@ -149,41 +179,96 @@ struct hmm_range {
uint64_t *pfns;
const uint64_t *flags;
const uint64_t *values;
+ uint64_t default_flags;
+ uint64_t pfn_flags_mask;
+ uint8_t page_shift;
uint8_t pfn_shift;
bool valid;
};
/*
- * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn
- * @range: range use to decode HMM pfn value
- * @pfn: HMM pfn value to get corresponding struct page from
- * Returns: struct page pointer if pfn is a valid HMM pfn, NULL otherwise
+ * hmm_range_page_shift() - return the page shift for the range
+ * @range: range being queried
+ * Returns: page shift (page size = 1 << page shift) for the range
+ */
+static inline unsigned hmm_range_page_shift(const struct hmm_range *range)
+{
+ return range->page_shift;
+}
+
+/*
+ * hmm_range_page_size() - return the page size for the range
+ * @range: range being queried
+ * Returns: page size for the range in bytes
+ */
+static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
+{
+ return 1UL << hmm_range_page_shift(range);
+}
+
+/*
+ * hmm_range_wait_until_valid() - wait for range to be valid
+ * @range: range affected by invalidation to wait on
+ * @timeout: time out for wait in ms (ie abort wait after that period of time)
+ * Returns: true if the range is valid, false otherwise.
+ */
+static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
+ unsigned long timeout)
+{
+ /* Check if mm is dead ? */
+ if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) {
+ range->valid = false;
+ return false;
+ }
+ if (range->valid)
+ return true;
+ wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead,
+ msecs_to_jiffies(timeout));
+ /* Return current valid status just in case we get lucky */
+ return range->valid;
+}
+
+/*
+ * hmm_range_valid() - test if a range is valid or not
+ * @range: range
+ * Returns: true if the range is valid, false otherwise.
+ */
+static inline bool hmm_range_valid(struct hmm_range *range)
+{
+ return range->valid;
+}
+
+/*
+ * hmm_device_entry_to_page() - return struct page pointed to by a device entry
+ * @range: range use to decode device entry value
+ * @entry: device entry value to get corresponding struct page from
+ * Returns: struct page pointer if entry is a valid, NULL otherwise
*
- * If the HMM pfn is valid (ie valid flag set) then return the struct page
- * matching the pfn value stored in the HMM pfn. Otherwise return NULL.
+ * If the device entry is valid (ie valid flag set) then return the struct page
+ * matching the entry value. Otherwise return NULL.
*/
-static inline struct page *hmm_pfn_to_page(const struct hmm_range *range,
- uint64_t pfn)
+static inline struct page *hmm_device_entry_to_page(const struct hmm_range *range,
+ uint64_t entry)
{
- if (pfn == range->values[HMM_PFN_NONE])
+ if (entry == range->values[HMM_PFN_NONE])
return NULL;
- if (pfn == range->values[HMM_PFN_ERROR])
+ if (entry == range->values[HMM_PFN_ERROR])
return NULL;
- if (pfn == range->values[HMM_PFN_SPECIAL])
+ if (entry == range->values[HMM_PFN_SPECIAL])
return NULL;
- if (!(pfn & range->flags[HMM_PFN_VALID]))
+ if (!(entry & range->flags[HMM_PFN_VALID]))
return NULL;
- return pfn_to_page(pfn >> range->pfn_shift);
+ return pfn_to_page(entry >> range->pfn_shift);
}
/*
- * hmm_pfn_to_pfn() - return pfn value store in a HMM pfn
- * @range: range use to decode HMM pfn value
- * @pfn: HMM pfn value to extract pfn from
- * Returns: pfn value if HMM pfn is valid, -1UL otherwise
+ * hmm_device_entry_to_pfn() - return pfn value store in a device entry
+ * @range: range use to decode device entry value
+ * @entry: device entry to extract pfn from
+ * Returns: pfn value if device entry is valid, -1UL otherwise
*/
-static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
- uint64_t pfn)
+static inline unsigned long
+hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn)
{
if (pfn == range->values[HMM_PFN_NONE])
return -1UL;
@@ -197,31 +282,66 @@ static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
}
/*
- * hmm_pfn_from_page() - create a valid HMM pfn value from struct page
+ * hmm_device_entry_from_page() - create a valid device entry for a page
* @range: range use to encode HMM pfn value
- * @page: struct page pointer for which to create the HMM pfn
- * Returns: valid HMM pfn for the page
+ * @page: page for which to create the device entry
+ * Returns: valid device entry for the page
*/
-static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range,
- struct page *page)
+static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range,
+ struct page *page)
{
return (page_to_pfn(page) << range->pfn_shift) |
range->flags[HMM_PFN_VALID];
}
/*
- * hmm_pfn_from_pfn() - create a valid HMM pfn value from pfn
+ * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
* @range: range use to encode HMM pfn value
- * @pfn: pfn value for which to create the HMM pfn
- * Returns: valid HMM pfn for the pfn
+ * @pfn: pfn value for which to create the device entry
+ * Returns: valid device entry for the pfn
*/
-static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
- unsigned long pfn)
+static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
+ unsigned long pfn)
{
return (pfn << range->pfn_shift) |
range->flags[HMM_PFN_VALID];
}
+/*
+ * Old API:
+ * hmm_pfn_to_page()
+ * hmm_pfn_to_pfn()
+ * hmm_pfn_from_page()
+ * hmm_pfn_from_pfn()
+ *
+ * This are the OLD API please use new API, it is here to avoid cross-tree
+ * merge painfullness ie we convert things to new API in stages.
+ */
+static inline struct page *hmm_pfn_to_page(const struct hmm_range *range,
+ uint64_t pfn)
+{
+ return hmm_device_entry_to_page(range, pfn);
+}
+
+static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
+ uint64_t pfn)
+{
+ return hmm_device_entry_to_pfn(range, pfn);
+}
+
+static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range,
+ struct page *page)
+{
+ return hmm_device_entry_from_page(range, page);
+}
+
+static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
+ unsigned long pfn)
+{
+ return hmm_device_entry_from_pfn(range, pfn);
+}
+
+
#if IS_ENABLED(CONFIG_HMM_MIRROR)
/*
@@ -353,43 +473,113 @@ struct hmm_mirror {
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
void hmm_mirror_unregister(struct hmm_mirror *mirror);
-
/*
- * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device
- * driver lock that serializes device page table updates, then call
- * hmm_vma_range_done(), to check if the snapshot is still valid. The same
- * device driver page table update lock must also be used in the
- * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page
- * table invalidation serializes on it.
- *
- * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL
- * hmm_vma_get_pfns() WITHOUT ERROR !
- *
- * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID !
+ * hmm_mirror_mm_is_alive() - test if mm is still alive
+ * @mirror: the HMM mm mirror for which we want to lock the mmap_sem
+ * Returns: false if the mm is dead, true otherwise
+ *
+ * This is an optimization it will not accurately always return -EINVAL if the
+ * mm is dead ie there can be false negative (process is being kill but HMM is
+ * not yet inform of that). It is only intented to be use to optimize out case
+ * where driver is about to do something time consuming and it would be better
+ * to skip it if the mm is dead.
*/
-int hmm_vma_get_pfns(struct hmm_range *range);
-bool hmm_vma_range_done(struct hmm_range *range);
+static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror)
+{
+ struct mm_struct *mm;
+
+ if (!mirror || !mirror->hmm)
+ return false;
+ mm = READ_ONCE(mirror->hmm->mm);
+ if (mirror->hmm->dead || !mm)
+ return false;
+
+ return true;
+}
/*
- * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will
- * not migrate any device memory back to system memory. The HMM pfn array will
- * be updated with the fault result and current snapshot of the CPU page table
- * for the range.
- *
- * The mmap_sem must be taken in read mode before entering and it might be
- * dropped by the function if the block argument is false. In that case, the
- * function returns -EAGAIN.
- *
- * Return value does not reflect if the fault was successful for every single
- * address or not. Therefore, the caller must to inspect the HMM pfn array to
- * determine fault status for each address.
- *
- * Trying to fault inside an invalid vma will result in -EINVAL.
+ * Please see Documentation/vm/hmm.rst for how to use the range API.
+ */
+int hmm_range_register(struct hmm_range *range,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end,
+ unsigned page_shift);
+void hmm_range_unregister(struct hmm_range *range);
+long hmm_range_snapshot(struct hmm_range *range);
+long hmm_range_fault(struct hmm_range *range, bool block);
+long hmm_range_dma_map(struct hmm_range *range,
+ struct device *device,
+ dma_addr_t *daddrs,
+ bool block);
+long hmm_range_dma_unmap(struct hmm_range *range,
+ struct vm_area_struct *vma,
+ struct device *device,
+ dma_addr_t *daddrs,
+ bool dirty);
+
+/*
+ * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
*
- * See the function description in mm/hmm.c for further documentation.
+ * When waiting for mmu notifiers we need some kind of time out otherwise we
+ * could potentialy wait for ever, 1000ms ie 1s sounds like a long time to
+ * wait already.
*/
-int hmm_vma_fault(struct hmm_range *range, bool block);
+#define HMM_RANGE_DEFAULT_TIMEOUT 1000
+
+/* This is a temporary helper to avoid merge conflict between trees. */
+static inline bool hmm_vma_range_done(struct hmm_range *range)
+{
+ bool ret = hmm_range_valid(range);
+
+ hmm_range_unregister(range);
+ return ret;
+}
+
+/* This is a temporary helper to avoid merge conflict between trees. */
+static inline int hmm_vma_fault(struct hmm_range *range, bool block)
+{
+ long ret;
+
+ /*
+ * With the old API the driver must set each individual entries with
+ * the requested flags (valid, write, ...). So here we set the mask to
+ * keep intact the entries provided by the driver and zero out the
+ * default_flags.
+ */
+ range->default_flags = 0;
+ range->pfn_flags_mask = -1UL;
+
+ ret = hmm_range_register(range, range->vma->vm_mm,
+ range->start, range->end,
+ PAGE_SHIFT);
+ if (ret)
+ return (int)ret;
+
+ if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
+ /*
+ * The mmap_sem was taken by driver we release it here and
+ * returns -EAGAIN which correspond to mmap_sem have been
+ * drop in the old API.
+ */
+ up_read(&range->vma->vm_mm->mmap_sem);
+ return -EAGAIN;
+ }
+
+ ret = hmm_range_fault(range, block);
+ if (ret <= 0) {
+ if (ret == -EBUSY || !ret) {
+ /* Same as above drop mmap_sem to match old API. */
+ up_read(&range->vma->vm_mm->mmap_sem);
+ ret = -EBUSY;
+ } else if (ret == -EAGAIN)
+ ret = -EBUSY;
+ hmm_range_unregister(range);
+ return ret;
+ }
+ return 0;
+}
/* Below are for HMM internal use only! Not to be used by device driver! */
void hmm_mm_destroy(struct mm_struct *mm);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 381e872bfde0..7cd5c150c21d 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -47,10 +47,8 @@ extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot,
int prot_numa);
-vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, pfn_t pfn, bool write);
-vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, pfn_t pfn, bool write);
+vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
+vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 11943b60f208..edf476c8cfb9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -123,9 +123,7 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
void free_huge_page(struct page *page);
void hugetlb_fix_reserve_counts(struct inode *inode);
extern struct mutex *hugetlb_fault_mutex_table;
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address);
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
diff --git a/include/linux/list.h b/include/linux/list.h
index 58aa3adf94e6..9e9a6403dbe4 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -271,6 +271,24 @@ static inline void list_rotate_left(struct list_head *head)
}
/**
+ * list_rotate_to_front() - Rotate list to specific item.
+ * @list: The desired new front of the list.
+ * @head: The head of the list.
+ *
+ * Rotates list so that @list becomes the new front of the list.
+ */
+static inline void list_rotate_to_front(struct list_head *list,
+ struct list_head *head)
+{
+ /*
+ * Deletes the list head from the list denoted by @head and
+ * places it as the tail of @list, this effectively rotates the
+ * list so that @list is at the front.
+ */
+ list_move_tail(head, list);
+}
+
+/**
* list_is_singular - tests whether a list has just one entry.
* @head: the list to test.
*/
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 294d5d80e150..676d3900e1bd 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -96,13 +96,14 @@ struct memblock {
extern struct memblock memblock;
extern int memblock_debug;
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+#ifndef CONFIG_ARCH_KEEP_MEMBLOCK
#define __init_memblock __meminit
#define __initdata_memblock __meminitdata
void memblock_discard(void);
#else
#define __init_memblock
#define __initdata_memblock
+static inline void memblock_discard(void) {}
#endif
#define memblock_dbg(fmt, ...) \
@@ -240,6 +241,47 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
+ unsigned long *out_spfn,
+ unsigned long *out_epfn);
+/**
+ * for_each_free_mem_range_in_zone - iterate through zone specific free
+ * memblock areas
+ * @i: u64 used as loop variable
+ * @zone: zone in which all of the memory blocks reside
+ * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ *
+ * Walks over free (memory && !reserved) areas of memblock in a specific
+ * zone. Available once memblock and an empty zone is initialized. The main
+ * assumption is that the zone start, end, and pgdat have been associated.
+ * This way we can use the zone to determine NUMA node, and if a given part
+ * of the memblock is valid for the zone.
+ */
+#define for_each_free_mem_pfn_range_in_zone(i, zone, p_start, p_end) \
+ for (i = 0, \
+ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end); \
+ i != U64_MAX; \
+ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
+
+/**
+ * for_each_free_mem_range_in_zone_from - iterate through zone specific
+ * free memblock areas from a given point
+ * @i: u64 used as loop variable
+ * @zone: zone in which all of the memory blocks reside
+ * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ *
+ * Walks over free (memory && !reserved) areas of memblock in a specific
+ * zone, continuing from current position. Available as soon as memblock is
+ * initialized.
+ */
+#define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \
+ for (; i != U64_MAX; \
+ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
/**
* for_each_free_mem_range - iterate through free memblock areas
* @i: u64 used as loop variable
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index dbb6118370c1..30561a954ee0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -501,22 +501,6 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int zid, int nr_pages);
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask);
-
-static inline
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
-{
- struct mem_cgroup_per_node *mz;
- unsigned long nr_pages = 0;
- int zid;
-
- mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- for (zid = 0; zid < MAX_NR_ZONES; zid++)
- nr_pages += mz->lru_zone_size[zid][lru];
- return nr_pages;
-}
-
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
enum lru_list lru, int zone_idx)
@@ -960,11 +944,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
return true;
}
-static inline unsigned long
-mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
-{
- return 0;
-}
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
enum lru_list lru, int zone_idx)
@@ -972,13 +951,6 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
return 0;
}
-static inline unsigned long
-mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask)
-{
- return 0;
-}
-
static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
return 0;
@@ -1117,6 +1089,12 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
{
}
+static inline void __count_memcg_events(struct mem_cgroup *memcg,
+ enum vm_event_item idx,
+ unsigned long count)
+{
+}
+
static inline void count_memcg_page_event(struct page *page,
int idx)
{
diff --git a/include/linux/memory.h b/include/linux/memory.h
index a6ddefc60517..e1dc1bb2b787 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -113,7 +113,7 @@ extern int register_memory_isolate_notifier(struct notifier_block *nb);
extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
int hotplug_memory_register(int nid, struct mem_section *section);
#ifdef CONFIG_MEMORY_HOTREMOVE
-extern int unregister_memory_section(struct mem_section *);
+extern void unregister_memory_section(struct mem_section *);
#endif
extern int memory_dev_init(void);
extern int memory_notify(unsigned long val, void *v);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 8ade08c50d26..ae892eef8b82 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -54,6 +54,16 @@ enum {
};
/*
+ * Restrictions for the memory hotplug:
+ * flags: MHP_ flags
+ * altmap: alternative allocator for memmap array
+ */
+struct mhp_restrictions {
+ unsigned long flags;
+ struct vmem_altmap *altmap;
+};
+
+/*
* Zone resizing functions
*
* Note: any attempt to resize a zone should has pgdat_resize_lock()
@@ -87,7 +97,8 @@ extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
extern int online_pages(unsigned long, unsigned long, int);
extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
unsigned long *valid_start, unsigned long *valid_end);
-extern void __offline_isolated_pages(unsigned long, unsigned long);
+extern unsigned long __offline_isolated_pages(unsigned long start_pfn,
+ unsigned long end_pfn);
typedef void (*online_page_callback_t)(struct page *page, unsigned int order);
@@ -100,6 +111,8 @@ extern void __online_page_free(struct page *page);
extern int try_online_node(int nid);
+extern int arch_add_memory(int nid, u64 start, u64 size,
+ struct mhp_restrictions *restrictions);
extern u64 max_mem_size;
extern bool memhp_auto_online;
@@ -111,26 +124,33 @@ static inline bool movable_node_is_enabled(void)
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-extern int arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap);
-extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap);
+extern void arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap);
+extern void __remove_pages(struct zone *zone, unsigned long start_pfn,
+ unsigned long nr_pages, struct vmem_altmap *altmap);
#endif /* CONFIG_MEMORY_HOTREMOVE */
+/*
+ * Do we want sysfs memblock files created. This will allow userspace to online
+ * and offline memory explicitly. Lack of this bit means that the caller has to
+ * call move_pfn_range_to_zone to finish the initialization.
+ */
+
+#define MHP_MEMBLOCK_API (1<<0)
+
/* reasonably generic interface to expand the physical pages */
extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap, bool want_memblock);
+ struct mhp_restrictions *restrictions);
#ifndef CONFIG_ARCH_HAS_ADD_PAGES
static inline int add_pages(int nid, unsigned long start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap,
- bool want_memblock)
+ unsigned long nr_pages, struct mhp_restrictions *restrictions)
{
- return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+ return __add_pages(nid, start_pfn, nr_pages, restrictions);
}
#else /* ARCH_HAS_ADD_PAGES */
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
- struct vmem_altmap *altmap, bool want_memblock);
+ struct mhp_restrictions *restrictions);
#endif /* ARCH_HAS_ADD_PAGES */
#ifdef CONFIG_NUMA
@@ -331,8 +351,6 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
extern int __add_memory(int nid, u64 start, u64 size);
extern int add_memory(int nid, u64 start, u64 size);
extern int add_memory_resource(int nid, struct resource *resource);
-extern int arch_add_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap, bool want_memblock);
extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages, struct vmem_altmap *altmap);
extern bool is_memblock_offlined(struct memory_block *mem);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 083d7b4863ed..912614fbbef3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -124,10 +124,45 @@ extern int mmap_rnd_compat_bits __read_mostly;
/*
* On some architectures it is expensive to call memset() for small sizes.
- * Those architectures should provide their own implementation of "struct page"
- * zeroing by defining this macro in <asm/pgtable.h>.
+ * If an architecture decides to implement their own version of
+ * mm_zero_struct_page they should wrap the defines below in a #ifndef and
+ * define their own version of this macro in <asm/pgtable.h>
*/
-#ifndef mm_zero_struct_page
+#if BITS_PER_LONG == 64
+/* This function must be updated when the size of struct page grows above 80
+ * or reduces below 56. The idea that compiler optimizes out switch()
+ * statement, and only leaves move/store instructions. Also the compiler can
+ * combine write statments if they are both assignments and can be reordered,
+ * this can result in several of the writes here being dropped.
+ */
+#define mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
+static inline void __mm_zero_struct_page(struct page *page)
+{
+ unsigned long *_pp = (void *)page;
+
+ /* Check that struct page is either 56, 64, 72, or 80 bytes */
+ BUILD_BUG_ON(sizeof(struct page) & 7);
+ BUILD_BUG_ON(sizeof(struct page) < 56);
+ BUILD_BUG_ON(sizeof(struct page) > 80);
+
+ switch (sizeof(struct page)) {
+ case 80:
+ _pp[9] = 0; /* fallthrough */
+ case 72:
+ _pp[8] = 0; /* fallthrough */
+ case 64:
+ _pp[7] = 0; /* fallthrough */
+ case 56:
+ _pp[6] = 0;
+ _pp[5] = 0;
+ _pp[4] = 0;
+ _pp[3] = 0;
+ _pp[2] = 0;
+ _pp[1] = 0;
+ _pp[0] = 0;
+ }
+}
+#else
#define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page)))
#endif
@@ -1007,6 +1042,30 @@ static inline void put_page(struct page *page)
__put_page(page);
}
+/**
+ * put_user_page() - release a gup-pinned page
+ * @page: pointer to page to be released
+ *
+ * Pages that were pinned via get_user_pages*() must be released via
+ * either put_user_page(), or one of the put_user_pages*() routines
+ * below. This is so that eventually, pages that are pinned via
+ * get_user_pages*() can be separately tracked and uniquely handled. In
+ * particular, interactions with RDMA and filesystems need special
+ * handling.
+ *
+ * put_user_page() and put_page() are not interchangeable, despite this early
+ * implementation that makes them look the same. put_user_page() calls must
+ * be perfectly matched up with get_user_page() calls.
+ */
+static inline void put_user_page(struct page *page)
+{
+ put_page(page);
+}
+
+void put_user_pages_dirty(struct page **pages, unsigned long npages);
+void put_user_pages_dirty_lock(struct page **pages, unsigned long npages);
+void put_user_pages(struct page **pages, unsigned long npages);
+
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif
@@ -1505,21 +1564,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
-#if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA)
-long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas);
-#else
-static inline long get_user_pages_longterm(unsigned long start,
- unsigned long nr_pages, unsigned int gup_flags,
- struct page **pages, struct vm_area_struct **vmas)
-{
- return get_user_pages(start, nr_pages, gup_flags, pages, vmas);
-}
-#endif /* CONFIG_FS_DAX */
-
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages);
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages);
/* Container for pinned pfns / pages */
struct frame_vector {
@@ -2533,6 +2579,10 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t);
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
+int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num);
+int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num);
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn);
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
@@ -2583,6 +2633,34 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
#define FOLL_COW 0x4000 /* internal GUP flag */
#define FOLL_ANON 0x8000 /* don't do file mappings */
+#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */
+
+/*
+ * NOTE on FOLL_LONGTERM:
+ *
+ * FOLL_LONGTERM indicates that the page will be held for an indefinite time
+ * period _often_ under userspace control. This is contrasted with
+ * iov_iter_get_pages() where usages which are transient.
+ *
+ * FIXME: For pages which are part of a filesystem, mappings are subject to the
+ * lifetime enforced by the filesystem and we need guarantees that longterm
+ * users like RDMA and V4L2 only establish mappings which coordinate usage with
+ * the filesystem. Ideas for this coordination include revoking the longterm
+ * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was
+ * added after the problem with filesystems was found FS DAX VMAs are
+ * specifically failed. Filesystem pages are still subject to bugs and use of
+ * FOLL_LONGTERM should be avoided on those pages.
+ *
+ * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call.
+ * Currently only get_user_pages() and get_user_pages_fast() support this flag
+ * and calls to get_user_pages_[un]locked are specifically not allowed. This
+ * is due to an incompatibility with the FS DAX check and
+ * FAULT_FLAG_ALLOW_RETRY
+ *
+ * In the CMA case: longterm pins in a CMA region would unnecessarily fragment
+ * that region. And so CMA attempts to migrate the page before pinning when
+ * FOLL_LONGTERM is specified.
+ */
static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
{
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 04ec454d44ce..6f2fef7b0784 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -29,7 +29,7 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec,
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages);
+ __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
__mod_zone_page_state(&pgdat->node_zones[zid],
NR_ZONE_LRU_BASE + lru, nr_pages);
}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 4ef4bbe78a1d..e1f42a07d8f0 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -103,7 +103,7 @@ struct page {
};
struct { /* slab, slob and slub */
union {
- struct list_head slab_list; /* uses lru */
+ struct list_head slab_list;
struct { /* Partial pages */
struct page *next;
#ifdef CONFIG_64BIT
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 4050ec1c3b45..b6c004bd9f6a 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -10,6 +10,36 @@
struct mmu_notifier;
struct mmu_notifier_ops;
+/**
+ * enum mmu_notifier_event - reason for the mmu notifier callback
+ * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that
+ * move the range
+ *
+ * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like
+ * madvise() or replacing a page by another one, ...).
+ *
+ * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range
+ * ie using the vma access permission (vm_page_prot) to update the whole range
+ * is enough no need to inspect changes to the CPU page table (mprotect()
+ * syscall)
+ *
+ * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for
+ * pages in the range so to mirror those changes the user must inspect the CPU
+ * page table (from the end callback).
+ *
+ * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same
+ * access flags). User should soft dirty the page in the end callback to make
+ * sure that anyone relying on soft dirtyness catch pages that might be written
+ * through non CPU mappings.
+ */
+enum mmu_notifier_event {
+ MMU_NOTIFY_UNMAP = 0,
+ MMU_NOTIFY_CLEAR,
+ MMU_NOTIFY_PROTECTION_VMA,
+ MMU_NOTIFY_PROTECTION_PAGE,
+ MMU_NOTIFY_SOFT_DIRTY,
+};
+
#ifdef CONFIG_MMU_NOTIFIER
/*
@@ -25,11 +55,15 @@ struct mmu_notifier_mm {
spinlock_t lock;
};
+#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
+
struct mmu_notifier_range {
+ struct vm_area_struct *vma;
struct mm_struct *mm;
unsigned long start;
unsigned long end;
- bool blockable;
+ unsigned flags;
+ enum mmu_notifier_event event;
};
struct mmu_notifier_ops {
@@ -225,6 +259,14 @@ extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r,
bool only_end);
extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
unsigned long start, unsigned long end);
+extern bool
+mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range);
+
+static inline bool
+mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
+{
+ return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE);
+}
static inline void mmu_notifier_release(struct mm_struct *mm)
{
@@ -269,7 +311,7 @@ static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
if (mm_has_notifiers(range->mm)) {
- range->blockable = true;
+ range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE;
__mmu_notifier_invalidate_range_start(range);
}
}
@@ -278,7 +320,7 @@ static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
if (mm_has_notifiers(range->mm)) {
- range->blockable = false;
+ range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE;
return __mmu_notifier_invalidate_range_start(range);
}
return 0;
@@ -318,13 +360,19 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
+ enum mmu_notifier_event event,
+ unsigned flags,
+ struct vm_area_struct *vma,
struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
+ range->vma = vma;
+ range->event = event;
range->mm = mm;
range->start = start;
range->end = end;
+ range->flags = flags;
}
#define ptep_clear_flush_young_notify(__vma, __address, __ptep) \
@@ -452,9 +500,14 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
range->end = end;
}
-#define mmu_notifier_range_init(range, mm, start, end) \
+#define mmu_notifier_range_init(range,event,flags,vma,mm,start,end) \
_mmu_notifier_range_init(range, start, end)
+static inline bool
+mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
+{
+ return true;
+}
static inline int mm_has_notifiers(struct mm_struct *mm)
{
@@ -517,6 +570,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
{
}
+#define mmu_notifier_range_update_to_read_only(r) false
+
#define ptep_clear_flush_young_notify ptep_clear_flush_young
#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
#define ptep_clear_young_notify ptep_test_and_clear_young
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fba7741533be..5a4aedc160bd 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -247,11 +247,6 @@ struct lruvec {
#endif
};
-/* Mask used at gathering information at once (see memcontrol.c) */
-#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
-#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
-#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
-
/* Isolate unmapped file */
#define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2)
/* Isolate for asynchronous migration */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index bcf909d0de5f..9ec3544baee2 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -333,6 +333,19 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
mapping_gfp_mask(mapping));
}
+static inline struct page *find_subpage(struct page *page, pgoff_t offset)
+{
+ unsigned long mask;
+
+ if (PageHuge(page))
+ return page;
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
+ mask = (1UL << compound_order(page)) - 1;
+ return page + (offset & mask);
+}
+
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
@@ -360,9 +373,6 @@ static inline unsigned find_get_pages_tag(struct address_space *mapping,
return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag,
nr_pages, pages);
}
-unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
- xa_mark_t tag, unsigned int nr_entries,
- struct page **entries, pgoff_t *indices);
struct page *grab_cache_page_write_begin(struct address_space *mapping,
pgoff_t index, unsigned flags);
@@ -527,15 +537,7 @@ static inline int wait_on_page_locked_killable(struct page *page)
extern void put_and_wait_on_page_locked(struct page *page);
-/*
- * Wait for a page to complete writeback
- */
-static inline void wait_on_page_writeback(struct page *page)
-{
- if (PageWriteback(page))
- wait_on_page_bit(page, PG_writeback);
-}
-
+void wait_on_page_writeback(struct page *page);
extern void end_page_writeback(struct page *page);
void wait_for_stable_page(struct page *page);
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 37c9eba75c98..ac9d71e24b81 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -28,6 +28,8 @@
#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
+extern int sysctl_unprivileged_userfaultfd;
+
extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 2db8d60981fe..bdeda4b079fe 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -26,7 +26,7 @@ struct reclaim_stat {
unsigned nr_congested;
unsigned nr_writeback;
unsigned nr_immediate;
- unsigned nr_activate;
+ unsigned nr_activate[2];
unsigned nr_ref_keep;
unsigned nr_unmap_fail;
};
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index 6074eff3d766..e5bf6ee4e814 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -64,6 +64,7 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken)
);
+#ifdef CONFIG_COMPACTION
TRACE_EVENT(mm_compaction_migratepages,
TP_PROTO(unsigned long nr_all,
@@ -132,7 +133,6 @@ TRACE_EVENT(mm_compaction_begin,
__entry->sync ? "sync" : "async")
);
-#ifdef CONFIG_COMPACTION
TRACE_EVENT(mm_compaction_end,
TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn,
unsigned long free_pfn, unsigned long zone_end, bool sync,
@@ -166,7 +166,6 @@ TRACE_EVENT(mm_compaction_end,
__entry->sync ? "sync" : "async",
__print_symbolic(__entry->status, COMPACTION_STATUS))
);
-#endif
TRACE_EVENT(mm_compaction_try_to_compact_pages,
@@ -189,13 +188,12 @@ TRACE_EVENT(mm_compaction_try_to_compact_pages,
__entry->prio = prio;
),
- TP_printk("order=%d gfp_mask=0x%x priority=%d",
+ TP_printk("order=%d gfp_mask=%s priority=%d",
__entry->order,
- __entry->gfp_mask,
+ show_gfp_flags(__entry->gfp_mask),
__entry->prio)
);
-#ifdef CONFIG_COMPACTION
DECLARE_EVENT_CLASS(mm_compaction_suitable_template,
TP_PROTO(struct zone *zone,
@@ -296,7 +294,6 @@ DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset,
TP_ARGS(zone, order)
);
-#endif
TRACE_EVENT(mm_compaction_kcompactd_sleep,
@@ -352,6 +349,7 @@ DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake,
TP_ARGS(nid, order, classzone_idx)
);
+#endif
#endif /* _TRACE_COMPACTION_H */
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 252327dbfa51..a5ab2973e8dc 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -27,17 +27,11 @@
{RECLAIM_WB_ASYNC, "RECLAIM_WB_ASYNC"} \
) : "RECLAIM_WB_NONE"
-#define trace_reclaim_flags(page) ( \
- (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
+#define trace_reclaim_flags(file) ( \
+ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
(RECLAIM_WB_ASYNC) \
)
-#define trace_shrink_flags(file) \
- ( \
- (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
- (RECLAIM_WB_ASYNC) \
- )
-
TRACE_EVENT(mm_vmscan_kswapd_sleep,
TP_PROTO(int nid),
@@ -73,7 +67,9 @@ TRACE_EVENT(mm_vmscan_kswapd_wake,
__entry->order = order;
),
- TP_printk("nid=%d zid=%d order=%d", __entry->nid, __entry->zid, __entry->order)
+ TP_printk("nid=%d order=%d",
+ __entry->nid,
+ __entry->order)
);
TRACE_EVENT(mm_vmscan_wakeup_kswapd,
@@ -96,60 +92,53 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd,
__entry->gfp_flags = gfp_flags;
),
- TP_printk("nid=%d zid=%d order=%d gfp_flags=%s",
+ TP_printk("nid=%d order=%d gfp_flags=%s",
__entry->nid,
- __entry->zid,
__entry->order,
show_gfp_flags(__entry->gfp_flags))
);
DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template,
- TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
+ TP_PROTO(int order, gfp_t gfp_flags),
- TP_ARGS(order, may_writepage, gfp_flags, classzone_idx),
+ TP_ARGS(order, gfp_flags),
TP_STRUCT__entry(
__field( int, order )
- __field( int, may_writepage )
__field( gfp_t, gfp_flags )
- __field( int, classzone_idx )
),
TP_fast_assign(
__entry->order = order;
- __entry->may_writepage = may_writepage;
__entry->gfp_flags = gfp_flags;
- __entry->classzone_idx = classzone_idx;
),
- TP_printk("order=%d may_writepage=%d gfp_flags=%s classzone_idx=%d",
+ TP_printk("order=%d gfp_flags=%s",
__entry->order,
- __entry->may_writepage,
- show_gfp_flags(__entry->gfp_flags),
- __entry->classzone_idx)
+ show_gfp_flags(__entry->gfp_flags))
);
DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin,
- TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
+ TP_PROTO(int order, gfp_t gfp_flags),
- TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
+ TP_ARGS(order, gfp_flags)
);
#ifdef CONFIG_MEMCG
DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin,
- TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
+ TP_PROTO(int order, gfp_t gfp_flags),
- TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
+ TP_ARGS(order, gfp_flags)
);
DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin,
- TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
+ TP_PROTO(int order, gfp_t gfp_flags),
- TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
+ TP_ARGS(order, gfp_flags)
);
#endif /* CONFIG_MEMCG */
@@ -333,7 +322,8 @@ TRACE_EVENT(mm_vmscan_writepage,
TP_fast_assign(
__entry->pfn = page_to_pfn(page);
- __entry->reclaim_flags = trace_reclaim_flags(page);
+ __entry->reclaim_flags = trace_reclaim_flags(
+ page_is_file_cache(page));
),
TP_printk("page=%p pfn=%lu flags=%s",
@@ -358,7 +348,8 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
__field(unsigned long, nr_writeback)
__field(unsigned long, nr_congested)
__field(unsigned long, nr_immediate)
- __field(unsigned long, nr_activate)
+ __field(unsigned int, nr_activate0)
+ __field(unsigned int, nr_activate1)
__field(unsigned long, nr_ref_keep)
__field(unsigned long, nr_unmap_fail)
__field(int, priority)
@@ -373,20 +364,22 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
__entry->nr_writeback = stat->nr_writeback;
__entry->nr_congested = stat->nr_congested;
__entry->nr_immediate = stat->nr_immediate;
- __entry->nr_activate = stat->nr_activate;
+ __entry->nr_activate0 = stat->nr_activate[0];
+ __entry->nr_activate1 = stat->nr_activate[1];
__entry->nr_ref_keep = stat->nr_ref_keep;
__entry->nr_unmap_fail = stat->nr_unmap_fail;
__entry->priority = priority;
- __entry->reclaim_flags = trace_shrink_flags(file);
+ __entry->reclaim_flags = trace_reclaim_flags(file);
),
- TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate=%ld nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s",
+ TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s",
__entry->nid,
__entry->nr_scanned, __entry->nr_reclaimed,
__entry->nr_dirty, __entry->nr_writeback,
__entry->nr_congested, __entry->nr_immediate,
- __entry->nr_activate, __entry->nr_ref_keep,
- __entry->nr_unmap_fail, __entry->priority,
+ __entry->nr_activate0, __entry->nr_activate1,
+ __entry->nr_ref_keep, __entry->nr_unmap_fail,
+ __entry->priority,
show_reclaim_flags(__entry->reclaim_flags))
);
@@ -415,7 +408,7 @@ TRACE_EVENT(mm_vmscan_lru_shrink_active,
__entry->nr_deactivated = nr_deactivated;
__entry->nr_referenced = nr_referenced;
__entry->priority = priority;
- __entry->reclaim_flags = trace_shrink_flags(file);
+ __entry->reclaim_flags = trace_reclaim_flags(file);
),
TP_printk("nid=%d nr_taken=%ld nr_active=%ld nr_deactivated=%ld nr_referenced=%ld priority=%d flags=%s",
@@ -454,7 +447,8 @@ TRACE_EVENT(mm_vmscan_inactive_list_is_low,
__entry->total_active = total_active;
__entry->active = active;
__entry->ratio = ratio;
- __entry->reclaim_flags = trace_shrink_flags(file) & RECLAIM_WB_LRU;
+ __entry->reclaim_flags = trace_reclaim_flags(file) &
+ RECLAIM_WB_LRU;
),
TP_printk("nid=%d reclaim_idx=%d total_inactive=%ld inactive=%ld total_active=%ld active=%ld ratio=%ld flags=%s",
@@ -465,6 +459,38 @@ TRACE_EVENT(mm_vmscan_inactive_list_is_low,
__entry->ratio,
show_reclaim_flags(__entry->reclaim_flags))
);
+
+TRACE_EVENT(mm_vmscan_node_reclaim_begin,
+
+ TP_PROTO(int nid, int order, gfp_t gfp_flags),
+
+ TP_ARGS(nid, order, gfp_flags),
+
+ TP_STRUCT__entry(
+ __field(int, nid)
+ __field(int, order)
+ __field(gfp_t, gfp_flags)
+ ),
+
+ TP_fast_assign(
+ __entry->nid = nid;
+ __entry->order = order;
+ __entry->gfp_flags = gfp_flags;
+ ),
+
+ TP_printk("nid=%d order=%d gfp_flags=%s",
+ __entry->nid,
+ __entry->order,
+ show_gfp_flags(__entry->gfp_flags))
+);
+
+DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_node_reclaim_end,
+
+ TP_PROTO(unsigned long nr_reclaimed),
+
+ TP_ARGS(nr_reclaimed)
+);
+
#endif /* _TRACE_VMSCAN_H */
/* This part must be outside protection */
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 32db72c7c055..aa7f3aeac740 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -53,7 +53,7 @@ WB_WORK_REASON
struct wb_writeback_work;
-TRACE_EVENT(writeback_dirty_page,
+DECLARE_EVENT_CLASS(writeback_page_template,
TP_PROTO(struct page *page, struct address_space *mapping),
@@ -79,6 +79,20 @@ TRACE_EVENT(writeback_dirty_page,
)
);
+DEFINE_EVENT(writeback_page_template, writeback_dirty_page,
+
+ TP_PROTO(struct page *page, struct address_space *mapping),
+
+ TP_ARGS(page, mapping)
+);
+
+DEFINE_EVENT(writeback_page_template, wait_on_page_writeback,
+
+ TP_PROTO(struct page *page, struct address_space *mapping),
+
+ TP_ARGS(page, mapping)
+);
+
DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
TP_PROTO(struct inode *inode, int flags),
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 121e82ce296b..59c71fa8c553 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -320,6 +320,9 @@ struct fscrypt_key {
#define SYNC_FILE_RANGE_WAIT_BEFORE 1
#define SYNC_FILE_RANGE_WRITE 2
#define SYNC_FILE_RANGE_WAIT_AFTER 4
+#define SYNC_FILE_RANGE_WRITE_AND_WAIT (SYNC_FILE_RANGE_WRITE | \
+ SYNC_FILE_RANGE_WAIT_BEFORE | \
+ SYNC_FILE_RANGE_WAIT_AFTER)
/*
* Flags for preadv2/pwritev2:
diff --git a/init/initramfs.c b/init/initramfs.c
index 4749e1115eef..435a428c2af1 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -513,42 +513,55 @@ static int __init retain_initrd_param(char *str)
}
__setup("retain_initrd", retain_initrd_param);
+#ifdef CONFIG_ARCH_HAS_KEEPINITRD
+static int __init keepinitrd_setup(char *__unused)
+{
+ do_retain_initrd = 1;
+ return 1;
+}
+__setup("keepinitrd", keepinitrd_setup);
+#endif
+
extern char __initramfs_start[];
extern unsigned long __initramfs_size;
#include <linux/initrd.h>
#include <linux/kexec.h>
-static void __init free_initrd(void)
+void __weak free_initrd_mem(unsigned long start, unsigned long end)
{
+ free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
+ "initrd");
+}
+
#ifdef CONFIG_KEXEC_CORE
+static bool kexec_free_initrd(void)
+{
unsigned long crashk_start = (unsigned long)__va(crashk_res.start);
unsigned long crashk_end = (unsigned long)__va(crashk_res.end);
-#endif
- if (do_retain_initrd)
- goto skip;
-#ifdef CONFIG_KEXEC_CORE
/*
* If the initrd region is overlapped with crashkernel reserved region,
* free only memory that is not part of crashkernel region.
*/
- if (initrd_start < crashk_end && initrd_end > crashk_start) {
- /*
- * Initialize initrd memory region since the kexec boot does
- * not do.
- */
- memset((void *)initrd_start, 0, initrd_end - initrd_start);
- if (initrd_start < crashk_start)
- free_initrd_mem(initrd_start, crashk_start);
- if (initrd_end > crashk_end)
- free_initrd_mem(crashk_end, initrd_end);
- } else
-#endif
- free_initrd_mem(initrd_start, initrd_end);
-skip:
- initrd_start = 0;
- initrd_end = 0;
+ if (initrd_start >= crashk_end || initrd_end <= crashk_start)
+ return false;
+
+ /*
+ * Initialize initrd memory region since the kexec boot does not do.
+ */
+ memset((void *)initrd_start, 0, initrd_end - initrd_start);
+ if (initrd_start < crashk_start)
+ free_initrd_mem(initrd_start, crashk_start);
+ if (initrd_end > crashk_end)
+ free_initrd_mem(crashk_end, initrd_end);
+ return true;
+}
+#else
+static inline bool kexec_free_initrd(void)
+{
+ return false;
}
+#endif /* CONFIG_KEXEC_CORE */
#ifdef CONFIG_BLK_DEV_RAM
#define BUF_SIZE 1024
@@ -597,7 +610,38 @@ static void __init clean_rootfs(void)
ksys_close(fd);
kfree(buf);
}
-#endif
+#else
+static inline void clean_rootfs(void)
+{
+}
+#endif /* CONFIG_BLK_DEV_RAM */
+
+#ifdef CONFIG_BLK_DEV_RAM
+static void populate_initrd_image(char *err)
+{
+ ssize_t written;
+ int fd;
+
+ unpack_to_rootfs(__initramfs_start, __initramfs_size);
+
+ printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n",
+ err);
+ fd = ksys_open("/initrd.image", O_WRONLY | O_CREAT, 0700);
+ if (fd < 0)
+ return;
+
+ written = xwrite(fd, (char *)initrd_start, initrd_end - initrd_start);
+ if (written != initrd_end - initrd_start)
+ pr_err("/initrd.image: incomplete write (%zd != %ld)\n",
+ written, initrd_end - initrd_start);
+ ksys_close(fd);
+}
+#else
+static void populate_initrd_image(char *err)
+{
+ printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err);
+}
+#endif /* CONFIG_BLK_DEV_RAM */
static int __init populate_rootfs(void)
{
@@ -605,46 +649,31 @@ static int __init populate_rootfs(void)
char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
if (err)
panic("%s", err); /* Failed to decompress INTERNAL initramfs */
- /* If available load the bootloader supplied initrd */
- if (initrd_start && !IS_ENABLED(CONFIG_INITRAMFS_FORCE)) {
-#ifdef CONFIG_BLK_DEV_RAM
- int fd;
+
+ if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE))
+ goto done;
+
+ if (IS_ENABLED(CONFIG_BLK_DEV_RAM))
printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n");
- err = unpack_to_rootfs((char *)initrd_start,
- initrd_end - initrd_start);
- if (!err) {
- free_initrd();
- goto done;
- } else {
- clean_rootfs();
- unpack_to_rootfs(__initramfs_start, __initramfs_size);
- }
- printk(KERN_INFO "rootfs image is not initramfs (%s)"
- "; looks like an initrd\n", err);
- fd = ksys_open("/initrd.image",
- O_WRONLY|O_CREAT, 0700);
- if (fd >= 0) {
- ssize_t written = xwrite(fd, (char *)initrd_start,
- initrd_end - initrd_start);
-
- if (written != initrd_end - initrd_start)
- pr_err("/initrd.image: incomplete write (%zd != %ld)\n",
- written, initrd_end - initrd_start);
-
- ksys_close(fd);
- free_initrd();
- }
- done:
- /* empty statement */;
-#else
+ else
printk(KERN_INFO "Unpacking initramfs...\n");
- err = unpack_to_rootfs((char *)initrd_start,
- initrd_end - initrd_start);
- if (err)
- printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err);
- free_initrd();
-#endif
+
+ err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start);
+ if (err) {
+ clean_rootfs();
+ populate_initrd_image(err);
}
+
+done:
+ /*
+ * If the initrd region is overlapped with crashkernel reserved region,
+ * free only memory that is not part of crashkernel region.
+ */
+ if (!do_retain_initrd && !kexec_free_initrd())
+ free_initrd_mem(initrd_start, initrd_end);
+ initrd_start = 0;
+ initrd_end = 0;
+
flush_delayed_fput();
return 0;
}
diff --git a/init/main.c b/init/main.c
index 33c87e91dc37..5a2c69b4d7b3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1074,6 +1074,11 @@ static inline void mark_readonly(void)
}
#endif
+void __weak free_initmem(void)
+{
+ free_initmem_default(POISON_FREE_INITMEM);
+}
+
static int __ref kernel_init(void *unused)
{
int ret;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 4ca7364c956d..78f61bfc6b79 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct mmu_notifier_range range;
struct mem_cgroup *memcg;
- mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+ addr + PAGE_SIZE);
VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
diff --git a/kernel/futex.c b/kernel/futex.c
index 6262f1534ac9..2268b97d5439 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -543,7 +543,7 @@ again:
if (unlikely(should_fail_futex(fshared)))
return -EFAULT;
- err = get_user_pages_fast(address, 1, 1, &page);
+ err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
/*
* If write access is not required (eg. FUTEX_WAIT), try
* and get read-only access.
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index f7fb8f6a688f..072b6ee55e3f 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -500,13 +500,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg)
return locate_mem_hole_bottom_up(start, end, kbuf);
}
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-static int kexec_walk_memblock(struct kexec_buf *kbuf,
- int (*func)(struct resource *, void *))
-{
- return 0;
-}
-#else
+#ifdef CONFIG_ARCH_KEEP_MEMBLOCK
static int kexec_walk_memblock(struct kexec_buf *kbuf,
int (*func)(struct resource *, void *))
{
@@ -550,6 +544,12 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
return ret;
}
+#else
+static int kexec_walk_memblock(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ return 0;
+}
#endif
/**
@@ -589,7 +589,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN)
return 0;
- if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK))
+ if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
ret = kexec_walk_resources(kbuf, locate_mem_hole_callback);
else
ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback);
diff --git a/kernel/memremap.c b/kernel/memremap.c
index a856cb5ff192..1490e63f69a9 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -45,7 +45,6 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
*/
return devmem->page_fault(vma, addr, page, flags, pmdp);
}
-EXPORT_SYMBOL(device_private_entry_fault);
#endif /* CONFIG_DEVICE_PRIVATE */
static void pgmap_array_delete(struct resource *res)
@@ -148,6 +147,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
&pgmap->altmap : NULL;
struct resource *res = &pgmap->res;
struct dev_pagemap *conflict_pgmap;
+ struct mhp_restrictions restrictions = {
+ /*
+ * We do not want any optional features only our own memmap
+ */
+ .altmap = altmap,
+ };
pgprot_t pgprot = PAGE_KERNEL;
int error, nid, is_ram;
@@ -214,7 +219,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
*/
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
error = add_pages(nid, align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, NULL, false);
+ align_size >> PAGE_SHIFT, &restrictions);
} else {
error = kasan_add_zero_shadow(__va(align_start), align_size);
if (error) {
@@ -222,8 +227,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
goto err_kasan;
}
- error = arch_add_memory(nid, align_start, align_size, altmap,
- false);
+ error = arch_add_memory(nid, align_start, align_size,
+ &restrictions);
}
if (!error) {
diff --git a/kernel/sys.c b/kernel/sys.c
index 12df0e5434b8..bdbfe8d37418 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1924,7 +1924,7 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map)
((unsigned long)prctl_map->__m1 __op \
(unsigned long)prctl_map->__m2) ? 0 : -EINVAL
error = __prctl_check_order(start_code, <, end_code);
- error |= __prctl_check_order(start_data, <, end_data);
+ error |= __prctl_check_order(start_data,<=, end_data);
error |= __prctl_check_order(start_brk, <=, brk);
error |= __prctl_check_order(arg_start, <=, arg_end);
error |= __prctl_check_order(env_start, <=, env_end);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 599510a3355e..ba158f61aab4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -66,6 +66,7 @@
#include <linux/kexec.h>
#include <linux/bpf.h>
#include <linux/mount.h>
+#include <linux/userfaultfd_k.h>
#include "../lib/kstrtox.h"
@@ -1720,6 +1721,17 @@ static struct ctl_table vm_table[] = {
.extra2 = (void *)&mmap_rnd_compat_bits_max,
},
#endif
+#ifdef CONFIG_USERFAULTFD
+ {
+ .procname = "unprivileged_userfaultfd",
+ .data = &sysctl_unprivileged_userfaultfd,
+ .maxlen = sizeof(sysctl_unprivileged_userfaultfd),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{ }
};
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index b396d328a764..f74fa832f3aa 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1293,7 +1293,9 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
len = maxpages * PAGE_SIZE;
addr &= ~(PAGE_SIZE - 1);
n = DIV_ROUND_UP(len, PAGE_SIZE);
- res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, pages);
+ res = get_user_pages_fast(addr, n,
+ iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0,
+ pages);
if (unlikely(res < 0))
return res;
return (res == n ? len : res * PAGE_SIZE) - *start;
@@ -1374,7 +1376,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
p = get_pages_array(n);
if (!p)
return -ENOMEM;
- res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, p);
+ res = get_user_pages_fast(addr, n,
+ iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p);
if (unlikely(res < 0)) {
kvfree(p);
return res;
diff --git a/mm/Kconfig b/mm/Kconfig
index 25c71eb8a7db..ee8d1f311858 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -11,23 +11,24 @@ choice
default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
default FLATMEM_MANUAL
+ help
+ This option allows you to change some of the ways that
+ Linux manages its memory internally. Most users will
+ only have one option here selected by the architecture
+ configuration. This is normal.
config FLATMEM_MANUAL
bool "Flat Memory"
depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
help
- This option allows you to change some of the ways that
- Linux manages its memory internally. Most users will
- only have one option here: FLATMEM. This is normal
- and a correct option.
-
- Some users of more advanced features like NUMA and
- memory hotplug may have different options here.
- DISCONTIGMEM is a more mature, better tested system,
- but is incompatible with memory hotplug and may suffer
- decreased performance over SPARSEMEM. If unsure between
- "Sparse Memory" and "Discontiguous Memory", choose
- "Discontiguous Memory".
+ This option is best suited for non-NUMA systems with
+ flat address space. The FLATMEM is the most efficient
+ system in terms of performance and resource consumption
+ and it is the best option for smaller systems.
+
+ For systems that have holes in their physical address
+ spaces and for features like NUMA and memory hotplug,
+ choose "Sparse Memory"
If unsure, choose this option (Flat Memory) over any other.
@@ -38,29 +39,26 @@ config DISCONTIGMEM_MANUAL
This option provides enhanced support for discontiguous
memory systems, over FLATMEM. These systems have holes
in their physical address spaces, and this option provides
- more efficient handling of these holes. However, the vast
- majority of hardware has quite flat address spaces, and
- can have degraded performance from the extra overhead that
- this option imposes.
+ more efficient handling of these holes.
- Many NUMA configurations will have this as the only option.
+ Although "Discontiguous Memory" is still used by several
+ architectures, it is considered deprecated in favor of
+ "Sparse Memory".
- If unsure, choose "Flat Memory" over this option.
+ If unsure, choose "Sparse Memory" over this option.
config SPARSEMEM_MANUAL
bool "Sparse Memory"
depends on ARCH_SPARSEMEM_ENABLE
help
This will be the only option for some systems, including
- memory hotplug systems. This is normal.
+ memory hot-plug systems. This is normal.
- For many other systems, this will be an alternative to
- "Discontiguous Memory". This option provides some potential
- performance benefits, along with decreased code complexity,
- but it is newer, and more experimental.
+ This option provides efficient support for systems with
+ holes is their physical address space and allows memory
+ hot-plug and hot-remove.
- If unsure, choose "Discontiguous Memory" or "Flat Memory"
- over this option.
+ If unsure, choose "Flat Memory" over this option.
endchoice
@@ -136,7 +134,7 @@ config HAVE_MEMBLOCK_PHYS_MAP
config HAVE_GENERIC_GUP
bool
-config ARCH_DISCARD_MEMBLOCK
+config ARCH_KEEP_MEMBLOCK
bool
config MEMORY_ISOLATION
@@ -161,7 +159,6 @@ config MEMORY_HOTPLUG_SPARSE
config MEMORY_HOTPLUG_DEFAULT_ONLINE
bool "Online the newly added memory blocks by default"
- default n
depends on MEMORY_HOTPLUG
help
This option sets the default policy setting for memory hotplug
@@ -258,6 +255,9 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
config ARCH_ENABLE_THP_MIGRATION
bool
+config CONTIG_ALLOC
+ def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
+
config PHYS_ADDR_T_64BIT
def_bool 64BIT
@@ -436,7 +436,6 @@ config NEED_PER_CPU_KM
config CLEANCACHE
bool "Enable cleancache driver to cache clean pages if tmem is present"
- default n
help
Cleancache can be thought of as a page-granularity victim cache
for clean pages that the kernel's pageframe replacement algorithm
@@ -460,7 +459,6 @@ config CLEANCACHE
config FRONTSWAP
bool "Enable frontswap to cache swap pages if tmem is present"
depends on SWAP
- default n
help
Frontswap is so named because it can be thought of as the opposite
of a "backing" store for a swap device. The data is stored into
@@ -532,7 +530,6 @@ config ZSWAP
depends on FRONTSWAP && CRYPTO=y
select CRYPTO_LZO
select ZPOOL
- default n
help
A lightweight compressed cache for swap pages. It takes
pages that are in the process of being swapped out and attempts to
@@ -549,14 +546,12 @@ config ZSWAP
config ZPOOL
tristate "Common API for compressed memory storage"
- default n
help
Compressed memory storage API. This allows using either zbud or
zsmalloc.
config ZBUD
tristate "Low (Up to 2x) density storage for compressed pages"
- default n
help
A special purpose allocator for storing compressed pages.
It is designed to store up to two compressed pages per physical
@@ -567,7 +562,6 @@ config ZBUD
config Z3FOLD
tristate "Up to 3x density storage for compressed pages"
depends on ZPOOL
- default n
help
A special purpose allocator for storing compressed pages.
It is designed to store up to three compressed pages per physical
@@ -577,7 +571,6 @@ config Z3FOLD
config ZSMALLOC
tristate "Memory allocator for compressed pages"
depends on MMU
- default n
help
zsmalloc is a slab-based memory allocator designed to store
compressed RAM pages. zsmalloc uses virtual memory mapping
@@ -628,7 +621,6 @@ config MAX_STACK_SIZE_MB
config DEFERRED_STRUCT_PAGE_INIT
bool "Defer initialisation of struct pages to kthreads"
- default n
depends on SPARSEMEM
depends on !NEED_PER_CPU_KM
depends on 64BIT
@@ -676,6 +668,22 @@ config ZONE_DEVICE
If FS_DAX is enabled, then say Y.
+config ARCH_HAS_HMM_MIRROR
+ bool
+ default y
+ depends on (X86_64 || PPC64)
+ depends on MMU && 64BIT
+
+config ARCH_HAS_HMM_DEVICE
+ bool
+ default y
+ depends on (X86_64 || PPC64)
+ depends on MEMORY_HOTPLUG
+ depends on MEMORY_HOTREMOVE
+ depends on SPARSEMEM_VMEMMAP
+ depends on ARCH_HAS_ZONE_DEVICE
+ select XARRAY_MULTI
+
config ARCH_HAS_HMM
bool
default y
@@ -694,12 +702,12 @@ config DEV_PAGEMAP_OPS
config HMM
bool
+ select MMU_NOTIFIER
select MIGRATE_VMA_HELPER
config HMM_MIRROR
bool "HMM mirror CPU page table into a device page table"
depends on ARCH_HAS_HMM
- select MMU_NOTIFIER
select HMM
help
Select HMM_MIRROR if you want to mirror range of the CPU page table of a
@@ -740,7 +748,6 @@ config ARCH_HAS_PKEYS
config PERCPU_STATS
bool "Collect percpu memory statistics"
- default n
help
This feature collects and exposes statistics via debugfs. The
information includes global and per chunk statistics, which can
@@ -748,7 +755,6 @@ config PERCPU_STATS
config GUP_BENCHMARK
bool "Enable infrastructure for get_user_pages_fast() benchmarking"
- default n
help
Provides /sys/kernel/debug/gup_benchmark that helps with testing
performance of get_user_pages_fast().
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index e3df921208c0..e980ceb775a4 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -33,7 +33,6 @@ config DEBUG_PAGEALLOC
config DEBUG_PAGEALLOC_ENABLE_DEFAULT
bool "Enable debug page memory allocations by default?"
- default n
depends on DEBUG_PAGEALLOC
---help---
Enable debug page memory allocations by default? This value
diff --git a/mm/cma.c b/mm/cma.c
index bb2d333ffcb3..5e36d7418031 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -106,8 +106,10 @@ static int __init cma_activate_area(struct cma *cma)
cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
- if (!cma->bitmap)
+ if (!cma->bitmap) {
+ cma->count = 0;
return -ENOMEM;
+ }
WARN_ON_ONCE(!pfn_valid(pfn));
zone = page_zone(pfn_to_page(pfn));
@@ -367,23 +369,26 @@ err:
#ifdef CONFIG_CMA_DEBUG
static void cma_debug_show_areas(struct cma *cma)
{
- unsigned long next_zero_bit, next_set_bit;
+ unsigned long next_zero_bit, next_set_bit, nr_zero;
unsigned long start = 0;
- unsigned int nr_zero, nr_total = 0;
+ unsigned long nr_part, nr_total = 0;
+ unsigned long nbits = cma_bitmap_maxno(cma);
mutex_lock(&cma->lock);
pr_info("number of available pages: ");
for (;;) {
- next_zero_bit = find_next_zero_bit(cma->bitmap, cma->count, start);
- if (next_zero_bit >= cma->count)
+ next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start);
+ if (next_zero_bit >= nbits)
break;
- next_set_bit = find_next_bit(cma->bitmap, cma->count, next_zero_bit);
+ next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit);
nr_zero = next_set_bit - next_zero_bit;
- pr_cont("%s%u@%lu", nr_total ? "+" : "", nr_zero, next_zero_bit);
- nr_total += nr_zero;
+ nr_part = nr_zero << cma->order_per_bit;
+ pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part,
+ next_zero_bit);
+ nr_total += nr_part;
start = next_zero_bit + nr_zero;
}
- pr_cont("=> %u free of %lu total pages\n", nr_total, cma->count);
+ pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
mutex_unlock(&cma->lock);
}
#else
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 8d7b2fd52225..a7dd9e8e10d5 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -56,7 +56,7 @@ static int cma_maxchunk_get(void *data, u64 *val)
mutex_lock(&cma->lock);
for (;;) {
start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
- if (start >= cma->count)
+ if (start >= bitmap_maxno)
break;
end = find_next_bit(cma->bitmap, bitmap_maxno, start);
maxchunk = max(end - start, maxchunk);
diff --git a/mm/compaction.c b/mm/compaction.c
index 3319e0872d01..6cc4bea33dcb 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1164,7 +1164,9 @@ static bool suitable_migration_target(struct compact_control *cc,
static inline unsigned int
freelist_scan_limit(struct compact_control *cc)
{
- return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1;
+ unsigned short shift = BITS_PER_LONG - 1;
+
+ return (COMPACT_CLUSTER_MAX >> min(shift, cc->fast_search_fail)) + 1;
}
/*
diff --git a/mm/filemap.c b/mm/filemap.c
index d78f577baef2..c5af80c43d36 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -24,6 +24,7 @@
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
+#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
@@ -279,11 +280,11 @@ EXPORT_SYMBOL(delete_from_page_cache);
* @pvec: pagevec with pages to delete
*
* The function walks over mapping->i_pages and removes pages passed in @pvec
- * from the mapping. The function expects @pvec to be sorted by page index.
+ * from the mapping. The function expects @pvec to be sorted by page index
+ * and is optimised for it to be dense.
* It tolerates holes in @pvec (mapping entries at those indices are not
* modified). The function expects only THP head pages to be present in the
- * @pvec and takes care to delete all corresponding tail pages from the
- * mapping as well.
+ * @pvec.
*
* The function expects the i_pages lock to be held.
*/
@@ -292,40 +293,44 @@ static void page_cache_delete_batch(struct address_space *mapping,
{
XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
int total_pages = 0;
- int i = 0, tail_pages = 0;
+ int i = 0;
struct page *page;
mapping_set_update(&xas, mapping);
xas_for_each(&xas, page, ULONG_MAX) {
- if (i >= pagevec_count(pvec) && !tail_pages)
+ if (i >= pagevec_count(pvec))
break;
+
+ /* A swap/dax/shadow entry got inserted? Skip it. */
if (xa_is_value(page))
continue;
- if (!tail_pages) {
- /*
- * Some page got inserted in our range? Skip it. We
- * have our pages locked so they are protected from
- * being removed.
- */
- if (page != pvec->pages[i]) {
- VM_BUG_ON_PAGE(page->index >
- pvec->pages[i]->index, page);
- continue;
- }
- WARN_ON_ONCE(!PageLocked(page));
- if (PageTransHuge(page) && !PageHuge(page))
- tail_pages = HPAGE_PMD_NR - 1;
+ /*
+ * A page got inserted in our range? Skip it. We have our
+ * pages locked so they are protected from being removed.
+ * If we see a page whose index is higher than ours, it
+ * means our page has been removed, which shouldn't be
+ * possible because we're holding the PageLock.
+ */
+ if (page != pvec->pages[i]) {
+ VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
+ page);
+ continue;
+ }
+
+ WARN_ON_ONCE(!PageLocked(page));
+
+ if (page->index == xas.xa_index)
page->mapping = NULL;
- /*
- * Leave page->index set: truncation lookup relies
- * upon it
- */
+ /* Leave page->index set: truncation lookup relies on it */
+
+ /*
+ * Move to the next page in the vector if this is a regular
+ * page or the index is of the last sub-page of this compound
+ * page.
+ */
+ if (page->index + (1UL << compound_order(page)) - 1 ==
+ xas.xa_index)
i++;
- } else {
- VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages
- != pvec->pages[i]->index, page);
- tail_pages--;
- }
xas_store(&xas, NULL);
total_pages++;
}
@@ -878,6 +883,7 @@ error:
put_page(page);
return xas_error(&xas);
}
+ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
/**
* add_to_page_cache_locked - add a locked page to the pagecache
@@ -1440,7 +1446,7 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
EXPORT_SYMBOL(page_cache_next_miss);
/**
- * page_cache_prev_miss() - Find the next gap in the page cache.
+ * page_cache_prev_miss() - Find the previous gap in the page cache.
* @mapping: Mapping.
* @index: Index.
* @max_scan: Maximum range to search.
@@ -1491,7 +1497,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
XA_STATE(xas, &mapping->i_pages, offset);
- struct page *head, *page;
+ struct page *page;
rcu_read_lock();
repeat:
@@ -1506,25 +1512,19 @@ repeat:
if (!page || xa_is_value(page))
goto out;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto repeat;
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
-
/*
- * Has the page moved?
+ * Has the page moved or been split?
* This is part of the lockless pagecache protocol. See
* include/linux/pagemap.h for details.
*/
if (unlikely(page != xas_reload(&xas))) {
- put_page(head);
+ put_page(page);
goto repeat;
}
+ page = find_subpage(page, offset);
out:
rcu_read_unlock();
@@ -1706,7 +1706,6 @@ unsigned find_get_entries(struct address_space *mapping,
rcu_read_lock();
xas_for_each(&xas, page, ULONG_MAX) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
@@ -1717,17 +1716,13 @@ unsigned find_get_entries(struct address_space *mapping,
if (xa_is_value(page))
goto export;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
+ page = find_subpage(page, xas.xa_index);
export:
indices[ret] = xas.xa_index;
@@ -1736,7 +1731,7 @@ export:
break;
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -1778,33 +1773,27 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
rcu_read_lock();
xas_for_each(&xas, page, end) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/* Skip over shadow, swap and DAX entries */
if (xa_is_value(page))
continue;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages) {
*start = xas.xa_index + 1;
goto out;
}
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -1849,7 +1838,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
rcu_read_lock();
for (page = xas_load(&xas); page; page = xas_next(&xas)) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
@@ -1859,24 +1847,19 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
if (xa_is_value(page))
break;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages)
break;
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -1912,7 +1895,6 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
rcu_read_lock();
xas_for_each_marked(&xas, page, end, tag) {
- struct page *head;
if (xas_retry(&xas, page))
continue;
/*
@@ -1923,26 +1905,21 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
if (xa_is_value(page))
continue;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto put_page;
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages) {
*index = xas.xa_index + 1;
goto out;
}
continue;
put_page:
- put_page(head);
+ put_page(page);
retry:
xas_reset(&xas);
}
@@ -1964,72 +1941,6 @@ out:
}
EXPORT_SYMBOL(find_get_pages_range_tag);
-/**
- * find_get_entries_tag - find and return entries that match @tag
- * @mapping: the address_space to search
- * @start: the starting page cache index
- * @tag: the tag index
- * @nr_entries: the maximum number of entries
- * @entries: where the resulting entries are placed
- * @indices: the cache indices corresponding to the entries in @entries
- *
- * Like find_get_entries, except we only return entries which are tagged with
- * @tag.
- *
- * Return: the number of entries which were found.
- */
-unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
- xa_mark_t tag, unsigned int nr_entries,
- struct page **entries, pgoff_t *indices)
-{
- XA_STATE(xas, &mapping->i_pages, start);
- struct page *page;
- unsigned int ret = 0;
-
- if (!nr_entries)
- return 0;
-
- rcu_read_lock();
- xas_for_each_marked(&xas, page, ULONG_MAX, tag) {
- struct page *head;
- if (xas_retry(&xas, page))
- continue;
- /*
- * A shadow entry of a recently evicted page, a swap
- * entry from shmem/tmpfs or a DAX entry. Return it
- * without attempting to raise page count.
- */
- if (xa_is_value(page))
- goto export;
-
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
- goto retry;
-
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto put_page;
-
- /* Has the page moved? */
- if (unlikely(page != xas_reload(&xas)))
- goto put_page;
-
-export:
- indices[ret] = xas.xa_index;
- entries[ret] = page;
- if (++ret == nr_entries)
- break;
- continue;
-put_page:
- put_page(head);
-retry:
- xas_reset(&xas);
- }
- rcu_read_unlock();
- return ret;
-}
-EXPORT_SYMBOL(find_get_entries_tag);
-
/*
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
* a _large_ part of the i/o request. Imagine the worst scenario:
@@ -2691,7 +2602,7 @@ void filemap_map_pages(struct vm_fault *vmf,
pgoff_t last_pgoff = start_pgoff;
unsigned long max_idx;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
- struct page *head, *page;
+ struct page *page;
rcu_read_lock();
xas_for_each(&xas, page, end_pgoff) {
@@ -2700,24 +2611,19 @@ void filemap_map_pages(struct vm_fault *vmf,
if (xa_is_value(page))
goto next;
- head = compound_head(page);
-
/*
* Check for a locked page first, as a speculative
* reference may adversely influence page migration.
*/
- if (PageLocked(head))
+ if (PageLocked(page))
goto next;
- if (!page_cache_get_speculative(head))
+ if (!page_cache_get_speculative(page))
goto next;
- /* The page was split under us? */
- if (compound_head(page) != head)
- goto skip;
-
- /* Has the page moved? */
+ /* Has the page moved or been split? */
if (unlikely(page != xas_reload(&xas)))
goto skip;
+ page = find_subpage(page, xas.xa_index);
if (!PageUptodate(page) ||
PageReadahead(page) ||
diff --git a/mm/gup.c b/mm/gup.c
index 91819b8ad9cc..2c08248d4fa2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -28,6 +28,111 @@ struct follow_page_context {
unsigned int page_mask;
};
+typedef int (*set_dirty_func_t)(struct page *page);
+
+static void __put_user_pages_dirty(struct page **pages,
+ unsigned long npages,
+ set_dirty_func_t sdf)
+{
+ unsigned long index;
+
+ for (index = 0; index < npages; index++) {
+ struct page *page = compound_head(pages[index]);
+
+ /*
+ * Checking PageDirty at this point may race with
+ * clear_page_dirty_for_io(), but that's OK. Two key cases:
+ *
+ * 1) This code sees the page as already dirty, so it skips
+ * the call to sdf(). That could happen because
+ * clear_page_dirty_for_io() called page_mkclean(),
+ * followed by set_page_dirty(). However, now the page is
+ * going to get written back, which meets the original
+ * intention of setting it dirty, so all is well:
+ * clear_page_dirty_for_io() goes on to call
+ * TestClearPageDirty(), and write the page back.
+ *
+ * 2) This code sees the page as clean, so it calls sdf().
+ * The page stays dirty, despite being written back, so it
+ * gets written back again in the next writeback cycle.
+ * This is harmless.
+ */
+ if (!PageDirty(page))
+ sdf(page);
+
+ put_user_page(page);
+ }
+}
+
+/**
+ * put_user_pages_dirty() - release and dirty an array of gup-pinned pages
+ * @pages: array of pages to be marked dirty and released.
+ * @npages: number of pages in the @pages array.
+ *
+ * "gup-pinned page" refers to a page that has had one of the get_user_pages()
+ * variants called on that page.
+ *
+ * For each page in the @pages array, make that page (or its head page, if a
+ * compound page) dirty, if it was previously listed as clean. Then, release
+ * the page using put_user_page().
+ *
+ * Please see the put_user_page() documentation for details.
+ *
+ * set_page_dirty(), which does not lock the page, is used here.
+ * Therefore, it is the caller's responsibility to ensure that this is
+ * safe. If not, then put_user_pages_dirty_lock() should be called instead.
+ *
+ */
+void put_user_pages_dirty(struct page **pages, unsigned long npages)
+{
+ __put_user_pages_dirty(pages, npages, set_page_dirty);
+}
+EXPORT_SYMBOL(put_user_pages_dirty);
+
+/**
+ * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages
+ * @pages: array of pages to be marked dirty and released.
+ * @npages: number of pages in the @pages array.
+ *
+ * For each page in the @pages array, make that page (or its head page, if a
+ * compound page) dirty, if it was previously listed as clean. Then, release
+ * the page using put_user_page().
+ *
+ * Please see the put_user_page() documentation for details.
+ *
+ * This is just like put_user_pages_dirty(), except that it invokes
+ * set_page_dirty_lock(), instead of set_page_dirty().
+ *
+ */
+void put_user_pages_dirty_lock(struct page **pages, unsigned long npages)
+{
+ __put_user_pages_dirty(pages, npages, set_page_dirty_lock);
+}
+EXPORT_SYMBOL(put_user_pages_dirty_lock);
+
+/**
+ * put_user_pages() - release an array of gup-pinned pages.
+ * @pages: array of pages to be marked dirty and released.
+ * @npages: number of pages in the @pages array.
+ *
+ * For each page in the @pages array, release the page using put_user_page().
+ *
+ * Please see the put_user_page() documentation for details.
+ */
+void put_user_pages(struct page **pages, unsigned long npages)
+{
+ unsigned long index;
+
+ /*
+ * TODO: this can be optimized for huge pages: if a series of pages is
+ * physically contiguous and part of the same compound page, then a
+ * single operation to the head page should suffice.
+ */
+ for (index = 0; index < npages; index++)
+ put_user_page(pages[index]);
+}
+EXPORT_SYMBOL(put_user_pages);
+
static struct page *no_page_table(struct vm_area_struct *vma,
unsigned int flags)
{
@@ -1018,6 +1123,15 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
int *locked)
{
+ /*
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. As there are no users of this flag in this call we simply
+ * disallow this option for now.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return -EINVAL;
+
return __get_user_pages_locked(current, current->mm, start, nr_pages,
pages, NULL, locked,
gup_flags | FOLL_TOUCH);
@@ -1046,6 +1160,15 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
int locked = 1;
long ret;
+ /*
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. As there are no users of this flag in this call we simply
+ * disallow this option for now.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return -EINVAL;
+
down_read(&mm->mmap_sem);
ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
&locked, gup_flags | FOLL_TOUCH);
@@ -1116,32 +1239,22 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
{
+ /*
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. As there are no users of this flag in this call we simply
+ * disallow this option for now.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return -EINVAL;
+
return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
locked,
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);
-/*
- * This is the same as get_user_pages_remote(), just with a
- * less-flexible calling convention where we assume that the task
- * and mm being operated on are the current task's and don't allow
- * passing of a locked parameter. We also obviously don't pass
- * FOLL_REMOTE in here.
- */
-long get_user_pages(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
-{
- return __get_user_pages_locked(current, current->mm, start, nr_pages,
- pages, vmas, NULL,
- gup_flags | FOLL_TOUCH);
-}
-EXPORT_SYMBOL(get_user_pages);
-
#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
-
-#ifdef CONFIG_FS_DAX
static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
{
long i;
@@ -1160,12 +1273,6 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
}
return false;
}
-#else
-static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
-{
- return false;
-}
-#endif
#ifdef CONFIG_CMA
static struct page *new_non_cma_page(struct page *page, unsigned long private)
@@ -1219,10 +1326,13 @@ static struct page *new_non_cma_page(struct page *page, unsigned long private)
return __alloc_pages_node(nid, gfp_mask, 0);
}
-static long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
- unsigned int gup_flags,
+static long check_and_migrate_cma_pages(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
struct page **pages,
- struct vm_area_struct **vmas)
+ struct vm_area_struct **vmas,
+ unsigned int gup_flags)
{
long i;
bool drain_allow = true;
@@ -1278,10 +1388,14 @@ check_again:
putback_movable_pages(&cma_page_list);
}
/*
- * We did migrate all the pages, Try to get the page references again
- * migrating any new CMA pages which we failed to isolate earlier.
+ * We did migrate all the pages, Try to get the page references
+ * again migrating any new CMA pages which we failed to isolate
+ * earlier.
*/
- nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+ nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
+ pages, vmas, NULL,
+ gup_flags);
+
if ((nr_pages > 0) && migrate_allow) {
drain_allow = true;
goto check_again;
@@ -1291,66 +1405,101 @@ check_again:
return nr_pages;
}
#else
-static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages,
- unsigned int gup_flags,
- struct page **pages,
- struct vm_area_struct **vmas)
+static long check_and_migrate_cma_pages(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int gup_flags)
{
return nr_pages;
}
#endif
/*
- * This is the same as get_user_pages() in that it assumes we are
- * operating on the current task's mm, but it goes further to validate
- * that the vmas associated with the address range are suitable for
- * longterm elevated page reference counts. For example, filesystem-dax
- * mappings are subject to the lifetime enforced by the filesystem and
- * we need guarantees that longterm users like RDMA and V4L2 only
- * establish mappings that have a kernel enforced revocation mechanism.
- *
- * "longterm" == userspace controlled elevated page count lifetime.
- * Contrast this to iov_iter_get_pages() usages which are transient.
+ * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
+ * allows us to process the FOLL_LONGTERM flag.
*/
-long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas_arg)
+static long __gup_longterm_locked(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int gup_flags)
{
- struct vm_area_struct **vmas = vmas_arg;
- unsigned long flags;
+ struct vm_area_struct **vmas_tmp = vmas;
+ unsigned long flags = 0;
long rc, i;
- if (!pages)
- return -EINVAL;
-
- if (!vmas) {
- vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
- GFP_KERNEL);
- if (!vmas)
- return -ENOMEM;
+ if (gup_flags & FOLL_LONGTERM) {
+ if (!pages)
+ return -EINVAL;
+
+ if (!vmas_tmp) {
+ vmas_tmp = kcalloc(nr_pages,
+ sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+ if (!vmas_tmp)
+ return -ENOMEM;
+ }
+ flags = memalloc_nocma_save();
}
- flags = memalloc_nocma_save();
- rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
- memalloc_nocma_restore(flags);
- if (rc < 0)
- goto out;
+ rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
+ vmas_tmp, NULL, gup_flags);
- if (check_dax_vmas(vmas, rc)) {
- for (i = 0; i < rc; i++)
- put_page(pages[i]);
- rc = -EOPNOTSUPP;
- goto out;
+ if (gup_flags & FOLL_LONGTERM) {
+ memalloc_nocma_restore(flags);
+ if (rc < 0)
+ goto out;
+
+ if (check_dax_vmas(vmas_tmp, rc)) {
+ for (i = 0; i < rc; i++)
+ put_page(pages[i]);
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages,
+ vmas_tmp, gup_flags);
}
- rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas);
out:
- if (vmas != vmas_arg)
- kfree(vmas);
+ if (vmas_tmp != vmas)
+ kfree(vmas_tmp);
return rc;
}
-EXPORT_SYMBOL(get_user_pages_longterm);
-#endif /* CONFIG_FS_DAX */
+#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
+static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int flags)
+{
+ return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+ NULL, flags);
+}
+#endif /* CONFIG_FS_DAX || CONFIG_CMA */
+
+/*
+ * This is the same as get_user_pages_remote(), just with a
+ * less-flexible calling convention where we assume that the task
+ * and mm being operated on are the current task's and don't allow
+ * passing of a locked parameter. We also obviously don't pass
+ * FOLL_REMOTE in here.
+ */
+long get_user_pages(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ return __gup_longterm_locked(current, current->mm, start, nr_pages,
+ pages, vmas, gup_flags | FOLL_TOUCH);
+}
+EXPORT_SYMBOL(get_user_pages);
/**
* populate_vma_page_range() - populate a range of pages in the vma.
@@ -1571,7 +1720,7 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -1589,10 +1738,13 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
if (pte_protnone(pte))
goto pte_unmap;
- if (!pte_access_permitted(pte, write))
+ if (!pte_access_permitted(pte, flags & FOLL_WRITE))
goto pte_unmap;
if (pte_devmap(pte)) {
+ if (unlikely(flags & FOLL_LONGTERM))
+ goto pte_unmap;
+
pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, pages);
@@ -1641,7 +1793,7 @@ pte_unmap:
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
return 0;
}
@@ -1724,16 +1876,19 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
#endif
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags, struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
- if (!pmd_access_permitted(orig, write))
+ if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
- if (pmd_devmap(orig))
+ if (pmd_devmap(orig)) {
+ if (unlikely(flags & FOLL_LONGTERM))
+ return 0;
return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
+ }
refs = 0;
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1762,16 +1917,19 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
}
static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags, struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
- if (!pud_access_permitted(orig, write))
+ if (!pud_access_permitted(orig, flags & FOLL_WRITE))
return 0;
- if (pud_devmap(orig))
+ if (pud_devmap(orig)) {
+ if (unlikely(flags & FOLL_LONGTERM))
+ return 0;
return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
+ }
refs = 0;
page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
@@ -1800,13 +1958,13 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
}
static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
- unsigned long end, int write,
+ unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
int refs;
struct page *head, *page;
- if (!pgd_access_permitted(orig, write))
+ if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
BUILD_BUG_ON(pgd_devmap(orig));
@@ -1837,7 +1995,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
}
static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pmd_t *pmdp;
@@ -1860,7 +2018,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
if (pmd_protnone(pmd))
return 0;
- if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
+ if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
pages, nr))
return 0;
@@ -1870,9 +2028,9 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
* pmd format and THP pmd format
*/
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
- PMD_SHIFT, next, write, pages, nr))
+ PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+ } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
@@ -1880,7 +2038,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
}
static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;
@@ -1893,14 +2051,14 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
if (pud_none(pud))
return 0;
if (unlikely(pud_huge(pud))) {
- if (!gup_huge_pud(pud, pudp, addr, next, write,
+ if (!gup_huge_pud(pud, pudp, addr, next, flags,
pages, nr))
return 0;
} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
- PUD_SHIFT, next, write, pages, nr))
+ PUD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+ } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
return 0;
} while (pudp++, addr = next, addr != end);
@@ -1908,7 +2066,7 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
}
static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
p4d_t *p4dp;
@@ -1923,9 +2081,9 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
BUILD_BUG_ON(p4d_huge(p4d));
if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
- P4D_SHIFT, next, write, pages, nr))
+ P4D_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pud_range(p4d, addr, next, write, pages, nr))
+ } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr))
return 0;
} while (p4dp++, addr = next, addr != end);
@@ -1933,7 +2091,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
}
static void gup_pgd_range(unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pgd_t *pgdp;
@@ -1946,14 +2104,14 @@ static void gup_pgd_range(unsigned long addr, unsigned long end,
if (pgd_none(pgd))
return;
if (unlikely(pgd_huge(pgd))) {
- if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
+ if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
pages, nr))
return;
} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
- PGDIR_SHIFT, next, write, pages, nr))
+ PGDIR_SHIFT, next, flags, pages, nr))
return;
- } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr))
+ } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr))
return;
} while (pgdp++, addr = next, addr != end);
}
@@ -2007,18 +2165,41 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
if (gup_fast_permitted(start, nr_pages)) {
local_irq_save(flags);
- gup_pgd_range(start, end, write, pages, &nr);
+ gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr);
local_irq_restore(flags);
}
return nr;
}
+static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ int ret;
+
+ /*
+ * FIXME: FOLL_LONGTERM does not work with
+ * get_user_pages_unlocked() (see comments in that function)
+ */
+ if (gup_flags & FOLL_LONGTERM) {
+ down_read(&current->mm->mmap_sem);
+ ret = __gup_longterm_locked(current, current->mm,
+ start, nr_pages,
+ pages, NULL, gup_flags);
+ up_read(&current->mm->mmap_sem);
+ } else {
+ ret = get_user_pages_unlocked(start, nr_pages,
+ pages, gup_flags);
+ }
+
+ return ret;
+}
+
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
@@ -2030,8 +2211,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*/
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
{
unsigned long addr, len, end;
int nr = 0, ret = 0;
@@ -2049,7 +2230,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
if (gup_fast_permitted(start, nr_pages)) {
local_irq_disable();
- gup_pgd_range(addr, end, write, pages, &nr);
+ gup_pgd_range(addr, end, gup_flags, pages, &nr);
local_irq_enable();
ret = nr;
}
@@ -2059,8 +2240,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
start += nr << PAGE_SHIFT;
pages += nr;
- ret = get_user_pages_unlocked(start, nr_pages - nr, pages,
- write ? FOLL_WRITE : 0);
+ ret = __gup_longterm_unlocked(start, nr_pages - nr,
+ gup_flags, pages);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 6c0279e70cc4..7dd602d7f8db 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -54,8 +54,9 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
pages + i);
break;
case GUP_LONGTERM_BENCHMARK:
- nr = get_user_pages_longterm(addr, nr, gup->flags & 1,
- pages + i, NULL);
+ nr = get_user_pages(addr, nr,
+ (gup->flags & 1) | FOLL_LONGTERM,
+ pages + i, NULL);
break;
case GUP_BENCHMARK:
nr = get_user_pages(addr, nr, gup->flags & 1, pages + i,
diff --git a/mm/hmm.c b/mm/hmm.c
index fe1cd87e49ac..0db8491090b8 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -30,6 +30,7 @@
#include <linux/hugetlb.h>
#include <linux/memremap.h>
#include <linux/jump_label.h>
+#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
@@ -38,54 +39,48 @@
#if IS_ENABLED(CONFIG_HMM_MIRROR)
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
-/*
- * struct hmm - HMM per mm struct
- *
- * @mm: mm struct this HMM struct is bound to
- * @lock: lock protecting ranges list
- * @ranges: list of range being snapshotted
- * @mirrors: list of mirrors for this mm
- * @mmu_notifier: mmu notifier to track updates to CPU page table
- * @mirrors_sem: read/write semaphore protecting the mirrors list
- */
-struct hmm {
- struct mm_struct *mm;
- spinlock_t lock;
- struct list_head ranges;
- struct list_head mirrors;
- struct mmu_notifier mmu_notifier;
- struct rw_semaphore mirrors_sem;
-};
+static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
+{
+ struct hmm *hmm = READ_ONCE(mm->hmm);
-/*
- * hmm_register - register HMM against an mm (HMM internal)
+ if (hmm && kref_get_unless_zero(&hmm->kref))
+ return hmm;
+
+ return NULL;
+}
+
+/**
+ * hmm_get_or_create - register HMM against an mm (HMM internal)
*
* @mm: mm struct to attach to
+ * Returns: returns an HMM object, either by referencing the existing
+ * (per-process) object, or by creating a new one.
*
- * This is not intended to be used directly by device drivers. It allocates an
- * HMM struct if mm does not have one, and initializes it.
+ * This is not intended to be used directly by device drivers. If mm already
+ * has an HMM struct then it get a reference on it and returns it. Otherwise
+ * it allocates an HMM struct, initializes it, associate it with the mm and
+ * returns it.
*/
-static struct hmm *hmm_register(struct mm_struct *mm)
+static struct hmm *hmm_get_or_create(struct mm_struct *mm)
{
- struct hmm *hmm = READ_ONCE(mm->hmm);
+ struct hmm *hmm = mm_get_hmm(mm);
bool cleanup = false;
- /*
- * The hmm struct can only be freed once the mm_struct goes away,
- * hence we should always have pre-allocated an new hmm struct
- * above.
- */
if (hmm)
return hmm;
hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
if (!hmm)
return NULL;
+ init_waitqueue_head(&hmm->wq);
INIT_LIST_HEAD(&hmm->mirrors);
init_rwsem(&hmm->mirrors_sem);
hmm->mmu_notifier.ops = NULL;
INIT_LIST_HEAD(&hmm->ranges);
- spin_lock_init(&hmm->lock);
+ mutex_init(&hmm->lock);
+ kref_init(&hmm->kref);
+ hmm->notifiers = 0;
+ hmm->dead = false;
hmm->mm = mm;
spin_lock(&mm->page_table_lock);
@@ -106,7 +101,7 @@ static struct hmm *hmm_register(struct mm_struct *mm)
if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
goto error_mm;
- return mm->hmm;
+ return hmm;
error_mm:
spin_lock(&mm->page_table_lock);
@@ -118,54 +113,60 @@ error:
return NULL;
}
-void hmm_mm_destroy(struct mm_struct *mm)
+static void hmm_free(struct kref *kref)
{
- kfree(mm->hmm);
-}
+ struct hmm *hmm = container_of(kref, struct hmm, kref);
+ struct mm_struct *mm = hmm->mm;
-static int hmm_invalidate_range(struct hmm *hmm, bool device,
- const struct hmm_update *update)
-{
- struct hmm_mirror *mirror;
- struct hmm_range *range;
-
- spin_lock(&hmm->lock);
- list_for_each_entry(range, &hmm->ranges, list) {
- unsigned long addr, idx, npages;
+ mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
- if (update->end < range->start || update->start >= range->end)
- continue;
+ spin_lock(&mm->page_table_lock);
+ if (mm->hmm == hmm)
+ mm->hmm = NULL;
+ spin_unlock(&mm->page_table_lock);
- range->valid = false;
- addr = max(update->start, range->start);
- idx = (addr - range->start) >> PAGE_SHIFT;
- npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT;
- memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
- }
- spin_unlock(&hmm->lock);
+ kfree(hmm);
+}
- if (!device)
- return 0;
+static inline void hmm_put(struct hmm *hmm)
+{
+ kref_put(&hmm->kref, hmm_free);
+}
- down_read(&hmm->mirrors_sem);
- list_for_each_entry(mirror, &hmm->mirrors, list) {
- int ret;
+void hmm_mm_destroy(struct mm_struct *mm)
+{
+ struct hmm *hmm;
- ret = mirror->ops->sync_cpu_device_pagetables(mirror, update);
- if (!update->blockable && ret == -EAGAIN) {
- up_read(&hmm->mirrors_sem);
- return -EAGAIN;
- }
+ spin_lock(&mm->page_table_lock);
+ hmm = mm_get_hmm(mm);
+ mm->hmm = NULL;
+ if (hmm) {
+ hmm->mm = NULL;
+ hmm->dead = true;
+ spin_unlock(&mm->page_table_lock);
+ hmm_put(hmm);
+ return;
}
- up_read(&hmm->mirrors_sem);
- return 0;
+ spin_unlock(&mm->page_table_lock);
}
static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
+ struct hmm *hmm = mm_get_hmm(mm);
struct hmm_mirror *mirror;
- struct hmm *hmm = mm->hmm;
+ struct hmm_range *range;
+
+ /* Report this HMM as dying. */
+ hmm->dead = true;
+
+ /* Wake-up everyone waiting on any range. */
+ mutex_lock(&hmm->lock);
+ list_for_each_entry(range, &hmm->ranges, list) {
+ range->valid = false;
+ }
+ wake_up_all(&hmm->wq);
+ mutex_unlock(&hmm->lock);
down_write(&hmm->mirrors_sem);
mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
@@ -186,36 +187,86 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
struct hmm_mirror, list);
}
up_write(&hmm->mirrors_sem);
+
+ hmm_put(hmm);
}
static int hmm_invalidate_range_start(struct mmu_notifier *mn,
- const struct mmu_notifier_range *range)
+ const struct mmu_notifier_range *nrange)
{
+ struct hmm *hmm = mm_get_hmm(nrange->mm);
+ struct hmm_mirror *mirror;
struct hmm_update update;
- struct hmm *hmm = range->mm->hmm;
+ struct hmm_range *range;
+ int ret = 0;
VM_BUG_ON(!hmm);
- update.start = range->start;
- update.end = range->end;
+ update.start = nrange->start;
+ update.end = nrange->end;
update.event = HMM_UPDATE_INVALIDATE;
- update.blockable = range->blockable;
- return hmm_invalidate_range(hmm, true, &update);
+ update.blockable = mmu_notifier_range_blockable(nrange);
+
+ if (mmu_notifier_range_blockable(nrange))
+ mutex_lock(&hmm->lock);
+ else if (!mutex_trylock(&hmm->lock)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ hmm->notifiers++;
+ list_for_each_entry(range, &hmm->ranges, list) {
+ if (update.end < range->start || update.start >= range->end)
+ continue;
+
+ range->valid = false;
+ }
+ mutex_unlock(&hmm->lock);
+
+ if (mmu_notifier_range_blockable(nrange))
+ down_read(&hmm->mirrors_sem);
+ else if (!down_read_trylock(&hmm->mirrors_sem)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ list_for_each_entry(mirror, &hmm->mirrors, list) {
+ int ret;
+
+ ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update);
+ if (!update.blockable && ret == -EAGAIN) {
+ up_read(&hmm->mirrors_sem);
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
+ up_read(&hmm->mirrors_sem);
+
+out:
+ hmm_put(hmm);
+ return ret;
}
static void hmm_invalidate_range_end(struct mmu_notifier *mn,
- const struct mmu_notifier_range *range)
+ const struct mmu_notifier_range *nrange)
{
- struct hmm_update update;
- struct hmm *hmm = range->mm->hmm;
+ struct hmm *hmm = mm_get_hmm(nrange->mm);
VM_BUG_ON(!hmm);
- update.start = range->start;
- update.end = range->end;
- update.event = HMM_UPDATE_INVALIDATE;
- update.blockable = true;
- hmm_invalidate_range(hmm, false, &update);
+ mutex_lock(&hmm->lock);
+ hmm->notifiers--;
+ if (!hmm->notifiers) {
+ struct hmm_range *range;
+
+ list_for_each_entry(range, &hmm->ranges, list) {
+ if (range->valid)
+ continue;
+ range->valid = true;
+ }
+ wake_up_all(&hmm->wq);
+ }
+ mutex_unlock(&hmm->lock);
+
+ hmm_put(hmm);
}
static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
@@ -241,24 +292,13 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
if (!mm || !mirror || !mirror->ops)
return -EINVAL;
-again:
- mirror->hmm = hmm_register(mm);
+ mirror->hmm = hmm_get_or_create(mm);
if (!mirror->hmm)
return -ENOMEM;
down_write(&mirror->hmm->mirrors_sem);
- if (mirror->hmm->mm == NULL) {
- /*
- * A racing hmm_mirror_unregister() is about to destroy the hmm
- * struct. Try again to allocate a new one.
- */
- up_write(&mirror->hmm->mirrors_sem);
- mirror->hmm = NULL;
- goto again;
- } else {
- list_add(&mirror->list, &mirror->hmm->mirrors);
- up_write(&mirror->hmm->mirrors_sem);
- }
+ list_add(&mirror->list, &mirror->hmm->mirrors);
+ up_write(&mirror->hmm->mirrors_sem);
return 0;
}
@@ -273,38 +313,24 @@ EXPORT_SYMBOL(hmm_mirror_register);
*/
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
- bool should_unregister = false;
- struct mm_struct *mm;
- struct hmm *hmm;
+ struct hmm *hmm = READ_ONCE(mirror->hmm);
- if (mirror->hmm == NULL)
+ if (hmm == NULL)
return;
- hmm = mirror->hmm;
down_write(&hmm->mirrors_sem);
list_del_init(&mirror->list);
- should_unregister = list_empty(&hmm->mirrors);
+ /* To protect us against double unregister ... */
mirror->hmm = NULL;
- mm = hmm->mm;
- hmm->mm = NULL;
up_write(&hmm->mirrors_sem);
- if (!should_unregister || mm == NULL)
- return;
-
- mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
-
- spin_lock(&mm->page_table_lock);
- if (mm->hmm == hmm)
- mm->hmm = NULL;
- spin_unlock(&mm->page_table_lock);
-
- kfree(hmm);
+ hmm_put(hmm);
}
EXPORT_SYMBOL(hmm_mirror_unregister);
struct hmm_vma_walk {
struct hmm_range *range;
+ struct dev_pagemap *pgmap;
unsigned long last;
bool fault;
bool block;
@@ -323,13 +349,13 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
flags |= write_fault ? FAULT_FLAG_WRITE : 0;
ret = handle_mm_fault(vma, addr, flags);
if (ret & VM_FAULT_RETRY)
- return -EBUSY;
+ return -EAGAIN;
if (ret & VM_FAULT_ERROR) {
*pfn = range->values[HMM_PFN_ERROR];
return -EFAULT;
}
- return -EAGAIN;
+ return -EBUSY;
}
static int hmm_pfns_bad(unsigned long addr,
@@ -355,7 +381,7 @@ static int hmm_pfns_bad(unsigned long addr,
* @fault: should we fault or not ?
* @write_fault: write fault ?
* @walk: mm_walk structure
- * Returns: 0 on success, -EAGAIN after page fault, or page fault error
+ * Returns: 0 on success, -EBUSY after page fault, or page fault error
*
* This function will be called whenever pmd_none() or pte_none() returns true,
* or whenever there is no page directory covering the virtual address range.
@@ -367,23 +393,25 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
uint64_t *pfns = range->pfns;
- unsigned long i;
+ unsigned long i, page_size;
hmm_vma_walk->last = addr;
- i = (addr - range->start) >> PAGE_SHIFT;
- for (; addr < end; addr += PAGE_SIZE, i++) {
+ page_size = hmm_range_page_size(range);
+ i = (addr - range->start) >> range->page_shift;
+
+ for (; addr < end; addr += page_size, i++) {
pfns[i] = range->values[HMM_PFN_NONE];
if (fault || write_fault) {
int ret;
ret = hmm_vma_do_fault(walk, addr, write_fault,
&pfns[i]);
- if (ret != -EAGAIN)
+ if (ret != -EBUSY)
return ret;
}
}
- return (fault || write_fault) ? -EAGAIN : 0;
+ return (fault || write_fault) ? -EBUSY : 0;
}
static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
@@ -392,10 +420,21 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
{
struct hmm_range *range = hmm_vma_walk->range;
- *fault = *write_fault = false;
if (!hmm_vma_walk->fault)
return;
+ /*
+ * So we not only consider the individual per page request we also
+ * consider the default flags requested for the range. The API can
+ * be use in 2 fashions. The first one where the HMM user coalesce
+ * multiple page fault into one request and set flags per pfns for
+ * of those faults. The second one where the HMM user want to pre-
+ * fault a range with specific flags. For the latter one it is a
+ * waste to have the user pre-fill the pfn arrays with a default
+ * flags value.
+ */
+ pfns = (pfns & range->pfn_flags_mask) | range->default_flags;
+
/* We aren't ask to do anything ... */
if (!(pfns & range->flags[HMM_PFN_VALID]))
return;
@@ -431,10 +470,11 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
return;
}
+ *fault = *write_fault = false;
for (i = 0; i < npages; ++i) {
hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
fault, write_fault);
- if ((*fault) || (*write_fault))
+ if ((*write_fault))
return;
}
}
@@ -465,12 +505,22 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
range->flags[HMM_PFN_VALID];
}
+static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+{
+ if (!pud_present(pud))
+ return 0;
+ return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
+ range->flags[HMM_PFN_WRITE] :
+ range->flags[HMM_PFN_VALID];
+}
+
static int hmm_vma_handle_pmd(struct mm_walk *walk,
unsigned long addr,
unsigned long end,
uint64_t *pfns,
pmd_t pmd)
{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
unsigned long pfn, npages, i;
@@ -486,10 +536,25 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
pfn = pmd_pfn(pmd) + pte_index(addr);
- for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
- pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ if (pmd_devmap(pmd)) {
+ hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+ hmm_vma_walk->pgmap);
+ if (unlikely(!hmm_vma_walk->pgmap))
+ return -EBUSY;
+ }
+ pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
+ }
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
hmm_vma_walk->last = end;
return 0;
+#else
+ /* If THP is not enabled then we should never reach that code ! */
+ return -EINVAL;
+#endif
}
static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
@@ -514,11 +579,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
uint64_t orig_pfn = *pfn;
*pfn = range->values[HMM_PFN_NONE];
- cpu_flags = pte_to_hmm_pfn_flags(range, pte);
- hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
- &fault, &write_fault);
+ fault = write_fault = false;
if (pte_none(pte)) {
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
+ &fault, &write_fault);
if (fault || write_fault)
goto fault;
return 0;
@@ -546,7 +611,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
&fault, &write_fault);
if (fault || write_fault)
goto fault;
- *pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
+ *pfn = hmm_device_entry_from_pfn(range,
+ swp_offset(entry));
*pfn |= cpu_flags;
return 0;
}
@@ -557,7 +623,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
hmm_vma_walk->last = addr;
migration_entry_wait(vma->vm_mm,
pmdp, addr);
- return -EAGAIN;
+ return -EBUSY;
}
return 0;
}
@@ -565,15 +631,33 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
/* Report error for everything else */
*pfn = range->values[HMM_PFN_ERROR];
return -EFAULT;
+ } else {
+ cpu_flags = pte_to_hmm_pfn_flags(range, pte);
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+ &fault, &write_fault);
}
if (fault || write_fault)
goto fault;
- *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
+ if (pte_devmap(pte)) {
+ hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
+ hmm_vma_walk->pgmap);
+ if (unlikely(!hmm_vma_walk->pgmap))
+ return -EBUSY;
+ } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
+ *pfn = range->values[HMM_PFN_SPECIAL];
+ return -EFAULT;
+ }
+
+ *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
return 0;
fault:
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
pte_unmap(ptep);
/* Fault any virtual address we were asked to fault */
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
@@ -615,7 +699,7 @@ again:
if (fault || write_fault) {
hmm_vma_walk->last = addr;
pmd_migration_entry_wait(vma->vm_mm, pmdp);
- return -EAGAIN;
+ return -EBUSY;
}
return 0;
} else if (!pmd_present(pmd))
@@ -661,12 +745,158 @@ again:
return r;
}
}
+ if (hmm_vma_walk->pgmap) {
+ /*
+ * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
+ * so that we can leverage get_dev_pagemap() optimization which
+ * will not re-take a reference on a pgmap if we already have
+ * one.
+ */
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
pte_unmap(ptep - 1);
hmm_vma_walk->last = addr;
return 0;
}
+static int hmm_vma_walk_pud(pud_t *pudp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ unsigned long addr = start, next;
+ pmd_t *pmdp;
+ pud_t pud;
+ int ret;
+
+again:
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
+ return hmm_vma_walk_hole(start, end, walk);
+
+ if (pud_huge(pud) && pud_devmap(pud)) {
+ unsigned long i, npages, pfn;
+ uint64_t *pfns, cpu_flags;
+ bool fault, write_fault;
+
+ if (!pud_present(pud))
+ return hmm_vma_walk_hole(start, end, walk);
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ npages = (end - addr) >> PAGE_SHIFT;
+ pfns = &range->pfns[i];
+
+ cpu_flags = pud_to_hmm_pfn_flags(range, pud);
+ hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+ cpu_flags, &fault, &write_fault);
+ if (fault || write_fault)
+ return hmm_vma_walk_hole_(addr, end, fault,
+ write_fault, walk);
+
+#ifdef CONFIG_HUGETLB_PAGE
+ pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ for (i = 0; i < npages; ++i, ++pfn) {
+ hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+ hmm_vma_walk->pgmap);
+ if (unlikely(!hmm_vma_walk->pgmap))
+ return -EBUSY;
+ pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
+ cpu_flags;
+ }
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
+ hmm_vma_walk->last = end;
+ return 0;
+#else
+ return -EINVAL;
+#endif
+ }
+
+ split_huge_pud(walk->vma, pudp, addr);
+ if (pud_none(*pudp))
+ goto again;
+
+ pmdp = pmd_offset(pudp, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
+ if (ret)
+ return ret;
+ } while (pmdp++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
+ unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ unsigned long addr = start, i, pfn, mask, size, pfn_inc;
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
+ struct hstate *h = hstate_vma(vma);
+ uint64_t orig_pfn, cpu_flags;
+ bool fault, write_fault;
+ spinlock_t *ptl;
+ pte_t entry;
+ int ret = 0;
+
+ size = 1UL << huge_page_shift(h);
+ mask = size - 1;
+ if (range->page_shift != PAGE_SHIFT) {
+ /* Make sure we are looking at full page. */
+ if (start & mask)
+ return -EINVAL;
+ if (end < (start + size))
+ return -EINVAL;
+ pfn_inc = size >> PAGE_SHIFT;
+ } else {
+ pfn_inc = 1;
+ size = PAGE_SIZE;
+ }
+
+
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+ entry = huge_ptep_get(pte);
+
+ i = (start - range->start) >> range->page_shift;
+ orig_pfn = range->pfns[i];
+ range->pfns[i] = range->values[HMM_PFN_NONE];
+ cpu_flags = pte_to_hmm_pfn_flags(range, entry);
+ fault = write_fault = false;
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+ &fault, &write_fault);
+ if (fault || write_fault) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+
+ pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift);
+ for (; addr < end; addr += size, i++, pfn += pfn_inc)
+ range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
+ cpu_flags;
+ hmm_vma_walk->last = end;
+
+unlock:
+ spin_unlock(ptl);
+
+ if (ret == -ENOENT)
+ return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+
+ return ret;
+#else /* CONFIG_HUGETLB_PAGE */
+ return -EINVAL;
+#endif
+}
+
static void hmm_pfns_clear(struct hmm_range *range,
uint64_t *pfns,
unsigned long addr,
@@ -676,279 +906,437 @@ static void hmm_pfns_clear(struct hmm_range *range,
*pfns = range->values[HMM_PFN_NONE];
}
-static void hmm_pfns_special(struct hmm_range *range)
-{
- unsigned long addr = range->start, i = 0;
-
- for (; addr < range->end; addr += PAGE_SIZE, i++)
- range->pfns[i] = range->values[HMM_PFN_SPECIAL];
-}
-
/*
- * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
- * @range: range being snapshotted
- * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
- * vma permission, 0 success
- *
- * This snapshots the CPU page table for a range of virtual addresses. Snapshot
- * validity is tracked by range struct. See hmm_vma_range_done() for further
- * information.
- *
- * The range struct is initialized here. It tracks the CPU page table, but only
- * if the function returns success (0), in which case the caller must then call
- * hmm_vma_range_done() to stop CPU page table update tracking on this range.
+ * hmm_range_register() - start tracking change to CPU page table over a range
+ * @range: range
+ * @mm: the mm struct for the range of virtual address
+ * @start: start virtual address (inclusive)
+ * @end: end virtual address (exclusive)
+ * @page_shift: expect page shift for the range
+ * Returns 0 on success, -EFAULT if the address space is no longer valid
*
- * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
- * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
+ * Track updates to the CPU page table see include/linux/hmm.h
*/
-int hmm_vma_get_pfns(struct hmm_range *range)
+int hmm_range_register(struct hmm_range *range,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end,
+ unsigned page_shift)
{
- struct vm_area_struct *vma = range->vma;
- struct hmm_vma_walk hmm_vma_walk;
- struct mm_walk mm_walk;
- struct hmm *hmm;
+ unsigned long mask = ((1UL << page_shift) - 1UL);
+
+ range->valid = false;
+ range->hmm = NULL;
- /* Sanity check, this really should not happen ! */
- if (range->start < vma->vm_start || range->start >= vma->vm_end)
+ if ((start & mask) || (end & mask))
return -EINVAL;
- if (range->end < vma->vm_start || range->end > vma->vm_end)
+ if (start >= end)
return -EINVAL;
- hmm = hmm_register(vma->vm_mm);
- if (!hmm)
- return -ENOMEM;
- /* Caller must have registered a mirror, via hmm_mirror_register() ! */
- if (!hmm->mmu_notifier.ops)
- return -EINVAL;
+ range->page_shift = page_shift;
+ range->start = start;
+ range->end = end;
- /* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
- vma_is_dax(vma)) {
- hmm_pfns_special(range);
- return -EINVAL;
- }
+ range->hmm = hmm_get_or_create(mm);
+ if (!range->hmm)
+ return -EFAULT;
- if (!(vma->vm_flags & VM_READ)) {
- /*
- * If vma do not allow read access, then assume that it does
- * not allow write access, either. Architecture that allow
- * write without read access are not supported by HMM, because
- * operations such has atomic access would not work.
- */
- hmm_pfns_clear(range, range->pfns, range->start, range->end);
- return -EPERM;
+ /* Check if hmm_mm_destroy() was call. */
+ if (range->hmm->mm == NULL || range->hmm->dead) {
+ hmm_put(range->hmm);
+ return -EFAULT;
}
/* Initialize range to track CPU page table update */
- spin_lock(&hmm->lock);
- range->valid = true;
- list_add_rcu(&range->list, &hmm->ranges);
- spin_unlock(&hmm->lock);
-
- hmm_vma_walk.fault = false;
- hmm_vma_walk.range = range;
- mm_walk.private = &hmm_vma_walk;
-
- mm_walk.vma = vma;
- mm_walk.mm = vma->vm_mm;
- mm_walk.pte_entry = NULL;
- mm_walk.test_walk = NULL;
- mm_walk.hugetlb_entry = NULL;
- mm_walk.pmd_entry = hmm_vma_walk_pmd;
- mm_walk.pte_hole = hmm_vma_walk_hole;
-
- walk_page_range(range->start, range->end, &mm_walk);
+ mutex_lock(&range->hmm->lock);
+
+ list_add_rcu(&range->list, &range->hmm->ranges);
+
+ /*
+ * If there are any concurrent notifiers we have to wait for them for
+ * the range to be valid (see hmm_range_wait_until_valid()).
+ */
+ if (!range->hmm->notifiers)
+ range->valid = true;
+ mutex_unlock(&range->hmm->lock);
+
return 0;
}
-EXPORT_SYMBOL(hmm_vma_get_pfns);
+EXPORT_SYMBOL(hmm_range_register);
/*
- * hmm_vma_range_done() - stop tracking change to CPU page table over a range
- * @range: range being tracked
- * Returns: false if range data has been invalidated, true otherwise
+ * hmm_range_unregister() - stop tracking change to CPU page table over a range
+ * @range: range
*
* Range struct is used to track updates to the CPU page table after a call to
- * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
- * using the data, or wants to lock updates to the data it got from those
- * functions, it must call the hmm_vma_range_done() function, which will then
- * stop tracking CPU page table updates.
- *
- * Note that device driver must still implement general CPU page table update
- * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using
- * the mmu_notifier API directly.
- *
- * CPU page table update tracking done through hmm_range is only temporary and
- * to be used while trying to duplicate CPU page table contents for a range of
- * virtual addresses.
- *
- * There are two ways to use this :
- * again:
- * hmm_vma_get_pfns(range); or hmm_vma_fault(...);
- * trans = device_build_page_table_update_transaction(pfns);
- * device_page_table_lock();
- * if (!hmm_vma_range_done(range)) {
- * device_page_table_unlock();
- * goto again;
- * }
- * device_commit_transaction(trans);
- * device_page_table_unlock();
+ * hmm_range_register(). See include/linux/hmm.h for how to use it.
+ */
+void hmm_range_unregister(struct hmm_range *range)
+{
+ /* Sanity check this really should not happen. */
+ if (range->hmm == NULL || range->end <= range->start)
+ return;
+
+ mutex_lock(&range->hmm->lock);
+ list_del_rcu(&range->list);
+ mutex_unlock(&range->hmm->lock);
+
+ /* Drop reference taken by hmm_range_register() */
+ range->valid = false;
+ hmm_put(range->hmm);
+ range->hmm = NULL;
+}
+EXPORT_SYMBOL(hmm_range_unregister);
+
+/*
+ * hmm_range_snapshot() - snapshot CPU page table for a range
+ * @range: range
+ * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
+ * permission (for instance asking for write and range is read only),
+ * -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid
+ * vma or it is illegal to access that range), number of valid pages
+ * in range->pfns[] (from range start address).
*
- * Or:
- * hmm_vma_get_pfns(range); or hmm_vma_fault(...);
- * device_page_table_lock();
- * hmm_vma_range_done(range);
- * device_update_page_table(range->pfns);
- * device_page_table_unlock();
+ * This snapshots the CPU page table for a range of virtual addresses. Snapshot
+ * validity is tracked by range struct. See in include/linux/hmm.h for example
+ * on how to use.
*/
-bool hmm_vma_range_done(struct hmm_range *range)
+long hmm_range_snapshot(struct hmm_range *range)
{
- unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
- struct hmm *hmm;
+ const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
+ unsigned long start = range->start, end;
+ struct hmm_vma_walk hmm_vma_walk;
+ struct hmm *hmm = range->hmm;
+ struct vm_area_struct *vma;
+ struct mm_walk mm_walk;
- if (range->end <= range->start) {
- BUG();
- return false;
- }
+ /* Check if hmm_mm_destroy() was call. */
+ if (hmm->mm == NULL || hmm->dead)
+ return -EFAULT;
- hmm = hmm_register(range->vma->vm_mm);
- if (!hmm) {
- memset(range->pfns, 0, sizeof(*range->pfns) * npages);
- return false;
- }
+ do {
+ /* If range is no longer valid force retry. */
+ if (!range->valid)
+ return -EAGAIN;
- spin_lock(&hmm->lock);
- list_del_rcu(&range->list);
- spin_unlock(&hmm->lock);
+ vma = find_vma(hmm->mm, start);
+ if (vma == NULL || (vma->vm_flags & device_vma))
+ return -EFAULT;
+
+ if (is_vm_hugetlb_page(vma)) {
+ struct hstate *h = hstate_vma(vma);
- return range->valid;
+ if (huge_page_shift(h) != range->page_shift &&
+ range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
+ } else {
+ if (range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
+ }
+
+ if (!(vma->vm_flags & VM_READ)) {
+ /*
+ * If vma do not allow read access, then assume that it
+ * does not allow write access, either. HMM does not
+ * support architecture that allow write without read.
+ */
+ hmm_pfns_clear(range, range->pfns,
+ range->start, range->end);
+ return -EPERM;
+ }
+
+ range->vma = vma;
+ hmm_vma_walk.pgmap = NULL;
+ hmm_vma_walk.last = start;
+ hmm_vma_walk.fault = false;
+ hmm_vma_walk.range = range;
+ mm_walk.private = &hmm_vma_walk;
+ end = min(range->end, vma->vm_end);
+
+ mm_walk.vma = vma;
+ mm_walk.mm = vma->vm_mm;
+ mm_walk.pte_entry = NULL;
+ mm_walk.test_walk = NULL;
+ mm_walk.hugetlb_entry = NULL;
+ mm_walk.pud_entry = hmm_vma_walk_pud;
+ mm_walk.pmd_entry = hmm_vma_walk_pmd;
+ mm_walk.pte_hole = hmm_vma_walk_hole;
+ mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
+
+ walk_page_range(start, end, &mm_walk);
+ start = end;
+ } while (start < range->end);
+
+ return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
-EXPORT_SYMBOL(hmm_vma_range_done);
+EXPORT_SYMBOL(hmm_range_snapshot);
/*
- * hmm_vma_fault() - try to fault some address in a virtual address range
+ * hmm_range_fault() - try to fault some address in a virtual address range
* @range: range being faulted
* @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop)
+ * Returns: number of valid pages in range->pfns[] (from range start
+ * address). This may be zero. If the return value is negative,
+ * then one of the following values may be returned:
+ *
+ * -EINVAL invalid arguments or mm or virtual address are in an
+ * invalid vma (for instance device file vma).
+ * -ENOMEM: Out of memory.
+ * -EPERM: Invalid permission (for instance asking for write and
+ * range is read only).
+ * -EAGAIN: If you need to retry and mmap_sem was drop. This can only
+ * happens if block argument is false.
+ * -EBUSY: If the the range is being invalidated and you should wait
+ * for invalidation to finish.
+ * -EFAULT: Invalid (ie either no valid vma or it is illegal to access
+ * that range), number of valid pages in range->pfns[] (from
+ * range start address).
*
* This is similar to a regular CPU page fault except that it will not trigger
- * any memory migration if the memory being faulted is not accessible by CPUs.
+ * any memory migration if the memory being faulted is not accessible by CPUs
+ * and caller does not ask for migration.
*
* On error, for one virtual address in the range, the function will mark the
* corresponding HMM pfn entry with an error flag.
- *
- * Expected use pattern:
- * retry:
- * down_read(&mm->mmap_sem);
- * // Find vma and address device wants to fault, initialize hmm_pfn_t
- * // array accordingly
- * ret = hmm_vma_fault(range, write, block);
- * switch (ret) {
- * case -EAGAIN:
- * hmm_vma_range_done(range);
- * // You might want to rate limit or yield to play nicely, you may
- * // also commit any valid pfn in the array assuming that you are
- * // getting true from hmm_vma_range_monitor_end()
- * goto retry;
- * case 0:
- * break;
- * case -ENOMEM:
- * case -EINVAL:
- * case -EPERM:
- * default:
- * // Handle error !
- * up_read(&mm->mmap_sem)
- * return;
- * }
- * // Take device driver lock that serialize device page table update
- * driver_lock_device_page_table_update();
- * hmm_vma_range_done(range);
- * // Commit pfns we got from hmm_vma_fault()
- * driver_unlock_device_page_table_update();
- * up_read(&mm->mmap_sem)
- *
- * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0)
- * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
- *
- * YOU HAVE BEEN WARNED !
*/
-int hmm_vma_fault(struct hmm_range *range, bool block)
+long hmm_range_fault(struct hmm_range *range, bool block)
{
- struct vm_area_struct *vma = range->vma;
- unsigned long start = range->start;
+ const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
+ unsigned long start = range->start, end;
struct hmm_vma_walk hmm_vma_walk;
+ struct hmm *hmm = range->hmm;
+ struct vm_area_struct *vma;
struct mm_walk mm_walk;
- struct hmm *hmm;
int ret;
- /* Sanity check, this really should not happen ! */
- if (range->start < vma->vm_start || range->start >= vma->vm_end)
- return -EINVAL;
- if (range->end < vma->vm_start || range->end > vma->vm_end)
- return -EINVAL;
+ /* Check if hmm_mm_destroy() was call. */
+ if (hmm->mm == NULL || hmm->dead)
+ return -EFAULT;
- hmm = hmm_register(vma->vm_mm);
- if (!hmm) {
- hmm_pfns_clear(range, range->pfns, range->start, range->end);
- return -ENOMEM;
- }
- /* Caller must have registered a mirror using hmm_mirror_register() */
- if (!hmm->mmu_notifier.ops)
- return -EINVAL;
+ do {
+ /* If range is no longer valid force retry. */
+ if (!range->valid) {
+ up_read(&hmm->mm->mmap_sem);
+ return -EAGAIN;
+ }
- /* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
- vma_is_dax(vma)) {
- hmm_pfns_special(range);
- return -EINVAL;
- }
+ vma = find_vma(hmm->mm, start);
+ if (vma == NULL || (vma->vm_flags & device_vma))
+ return -EFAULT;
+
+ if (is_vm_hugetlb_page(vma)) {
+ if (huge_page_shift(hstate_vma(vma)) !=
+ range->page_shift &&
+ range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
+ } else {
+ if (range->page_shift != PAGE_SHIFT)
+ return -EINVAL;
+ }
+
+ if (!(vma->vm_flags & VM_READ)) {
+ /*
+ * If vma do not allow read access, then assume that it
+ * does not allow write access, either. HMM does not
+ * support architecture that allow write without read.
+ */
+ hmm_pfns_clear(range, range->pfns,
+ range->start, range->end);
+ return -EPERM;
+ }
+
+ range->vma = vma;
+ hmm_vma_walk.pgmap = NULL;
+ hmm_vma_walk.last = start;
+ hmm_vma_walk.fault = true;
+ hmm_vma_walk.block = block;
+ hmm_vma_walk.range = range;
+ mm_walk.private = &hmm_vma_walk;
+ end = min(range->end, vma->vm_end);
+
+ mm_walk.vma = vma;
+ mm_walk.mm = vma->vm_mm;
+ mm_walk.pte_entry = NULL;
+ mm_walk.test_walk = NULL;
+ mm_walk.hugetlb_entry = NULL;
+ mm_walk.pud_entry = hmm_vma_walk_pud;
+ mm_walk.pmd_entry = hmm_vma_walk_pmd;
+ mm_walk.pte_hole = hmm_vma_walk_hole;
+ mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
+
+ do {
+ ret = walk_page_range(start, end, &mm_walk);
+ start = hmm_vma_walk.last;
+
+ /* Keep trying while the range is valid. */
+ } while (ret == -EBUSY && range->valid);
+
+ if (ret) {
+ unsigned long i;
+
+ i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+ hmm_pfns_clear(range, &range->pfns[i],
+ hmm_vma_walk.last, range->end);
+ return ret;
+ }
+ start = end;
+
+ } while (start < range->end);
+
+ return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+}
+EXPORT_SYMBOL(hmm_range_fault);
+
+/**
+ * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one.
+ * @range: range being faulted
+ * @device: device against to dma map page to
+ * @daddrs: dma address of mapped pages
+ * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
+ * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been
+ * drop and you need to try again, some other error value otherwise
+ *
+ * Note same usage pattern as hmm_range_fault().
+ */
+long hmm_range_dma_map(struct hmm_range *range,
+ struct device *device,
+ dma_addr_t *daddrs,
+ bool block)
+{
+ unsigned long i, npages, mapped;
+ long ret;
+
+ ret = hmm_range_fault(range, block);
+ if (ret <= 0)
+ return ret ? ret : -EBUSY;
+
+ npages = (range->end - range->start) >> PAGE_SHIFT;
+ for (i = 0, mapped = 0; i < npages; ++i) {
+ enum dma_data_direction dir = DMA_TO_DEVICE;
+ struct page *page;
- if (!(vma->vm_flags & VM_READ)) {
/*
- * If vma do not allow read access, then assume that it does
- * not allow write access, either. Architecture that allow
- * write without read access are not supported by HMM, because
- * operations such has atomic access would not work.
+ * FIXME need to update DMA API to provide invalid DMA address
+ * value instead of a function to test dma address value. This
+ * would remove lot of dumb code duplicated accross many arch.
+ *
+ * For now setting it to 0 here is good enough as the pfns[]
+ * value is what is use to check what is valid and what isn't.
*/
- hmm_pfns_clear(range, range->pfns, range->start, range->end);
- return -EPERM;
+ daddrs[i] = 0;
+
+ page = hmm_device_entry_to_page(range, range->pfns[i]);
+ if (page == NULL)
+ continue;
+
+ /* Check if range is being invalidated */
+ if (!range->valid) {
+ ret = -EBUSY;
+ goto unmap;
+ }
+
+ /* If it is read and write than map bi-directional. */
+ if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
+ dir = DMA_BIDIRECTIONAL;
+
+ daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
+ if (dma_mapping_error(device, daddrs[i])) {
+ ret = -EFAULT;
+ goto unmap;
+ }
+
+ mapped++;
}
- /* Initialize range to track CPU page table update */
- spin_lock(&hmm->lock);
- range->valid = true;
- list_add_rcu(&range->list, &hmm->ranges);
- spin_unlock(&hmm->lock);
-
- hmm_vma_walk.fault = true;
- hmm_vma_walk.block = block;
- hmm_vma_walk.range = range;
- mm_walk.private = &hmm_vma_walk;
- hmm_vma_walk.last = range->start;
-
- mm_walk.vma = vma;
- mm_walk.mm = vma->vm_mm;
- mm_walk.pte_entry = NULL;
- mm_walk.test_walk = NULL;
- mm_walk.hugetlb_entry = NULL;
- mm_walk.pmd_entry = hmm_vma_walk_pmd;
- mm_walk.pte_hole = hmm_vma_walk_hole;
+ return mapped;
- do {
- ret = walk_page_range(start, range->end, &mm_walk);
- start = hmm_vma_walk.last;
- } while (ret == -EAGAIN);
+unmap:
+ for (npages = i, i = 0; (i < npages) && mapped; ++i) {
+ enum dma_data_direction dir = DMA_TO_DEVICE;
+ struct page *page;
- if (ret) {
- unsigned long i;
+ page = hmm_device_entry_to_page(range, range->pfns[i]);
+ if (page == NULL)
+ continue;
+
+ if (dma_mapping_error(device, daddrs[i]))
+ continue;
- i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
- hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
- range->end);
- hmm_vma_range_done(range);
+ /* If it is read and write than map bi-directional. */
+ if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
+ dir = DMA_BIDIRECTIONAL;
+
+ dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
+ mapped--;
}
+
return ret;
}
-EXPORT_SYMBOL(hmm_vma_fault);
+EXPORT_SYMBOL(hmm_range_dma_map);
+
+/**
+ * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map()
+ * @range: range being unmapped
+ * @vma: the vma against which the range (optional)
+ * @device: device against which dma map was done
+ * @daddrs: dma address of mapped pages
+ * @dirty: dirty page if it had the write flag set
+ * Returns: number of page unmapped on success, -EINVAL otherwise
+ *
+ * Note that caller MUST abide by mmu notifier or use HMM mirror and abide
+ * to the sync_cpu_device_pagetables() callback so that it is safe here to
+ * call set_page_dirty(). Caller must also take appropriate locks to avoid
+ * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress.
+ */
+long hmm_range_dma_unmap(struct hmm_range *range,
+ struct vm_area_struct *vma,
+ struct device *device,
+ dma_addr_t *daddrs,
+ bool dirty)
+{
+ unsigned long i, npages;
+ long cpages = 0;
+
+ /* Sanity check. */
+ if (range->end <= range->start)
+ return -EINVAL;
+ if (!daddrs)
+ return -EINVAL;
+ if (!range->pfns)
+ return -EINVAL;
+
+ npages = (range->end - range->start) >> PAGE_SHIFT;
+ for (i = 0; i < npages; ++i) {
+ enum dma_data_direction dir = DMA_TO_DEVICE;
+ struct page *page;
+
+ page = hmm_device_entry_to_page(range, range->pfns[i]);
+ if (page == NULL)
+ continue;
+
+ /* If it is read and write than map bi-directional. */
+ if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) {
+ dir = DMA_BIDIRECTIONAL;
+
+ /*
+ * See comments in function description on why it is
+ * safe here to call set_page_dirty()
+ */
+ if (dirty)
+ set_page_dirty(page);
+ }
+
+ /* Unmap and clear pfns/dma address */
+ dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
+ range->pfns[i] = range->values[HMM_PFN_NONE];
+ /* FIXME see comments in hmm_vma_dma_map() */
+ daddrs[i] = 0;
+ cpages++;
+ }
+
+ return cpages;
+}
+EXPORT_SYMBOL(hmm_range_dma_unmap);
#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b6a34b32d8ac..9f8bce9a6b32 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -509,7 +509,7 @@ void prep_transhuge_page(struct page *page)
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
-unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
+static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
loff_t off, unsigned long flags, unsigned long size)
{
unsigned long addr;
@@ -793,11 +793,13 @@ out_unlock:
pte_free(mm, pgtable);
}
-vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, pfn_t pfn, bool write)
+vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{
+ unsigned long addr = vmf->address & PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
pgtable_t pgtable = NULL;
+
/*
* If we had pmd_special, we could avoid all these restrictions,
* but we need to be consistent with PTEs and architectures that
@@ -820,7 +822,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
track_pfn_insert(vma, &pgprot, pfn);
- insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable);
+ insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
@@ -869,10 +871,12 @@ out_unlock:
spin_unlock(ptl);
}
-vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, pfn_t pfn, bool write)
+vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
{
+ unsigned long addr = vmf->address & PUD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
+
/*
* If we had pud_special, we could avoid all these restrictions,
* but we need to be consistent with PTEs and architectures that
@@ -889,7 +893,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
track_pfn_insert(vma, &pgprot, pfn);
- insert_pfn_pud(vma, addr, pud, pfn, pgprot, write);
+ insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
@@ -1220,8 +1224,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
cond_resched();
}
- mmu_notifier_range_init(&range, vma->vm_mm, haddr,
- haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ haddr, haddr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
@@ -1384,8 +1388,8 @@ alloc:
vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
- mmu_notifier_range_init(&range, vma->vm_mm, haddr,
- haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ haddr, haddr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
spin_lock(vmf->ptl);
@@ -2060,7 +2064,8 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
spinlock_t *ptl;
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address & HPAGE_PUD_MASK,
(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptl = pud_lock(vma->vm_mm, pud);
@@ -2278,7 +2283,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
spinlock_t *ptl;
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address & HPAGE_PMD_MASK,
(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptl = pmd_lock(vma->vm_mm, pmd);
@@ -2492,6 +2498,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
shmem_uncharge(head->mapping->host, 1);
put_page(head + i);
+ } else if (!PageAnon(page)) {
+ __xa_store(&head->mapping->i_pages, head[i].index,
+ head + i, 0);
}
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 641cedfc8c0f..81718c56b8f5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -740,7 +740,15 @@ void resv_map_release(struct kref *ref)
static inline struct resv_map *inode_resv_map(struct inode *inode)
{
- return inode->i_mapping->private_data;
+ /*
+ * At inode evict time, i_mapping may not point to the original
+ * address space within the inode. This original address space
+ * contains the pointer to the resv_map. So, always use the
+ * address space embedded within the inode.
+ * The VERY common case is inode->mapping == &inode->i_data but,
+ * this may not be true for device special inodes.
+ */
+ return (struct resv_map *)(&inode->i_data)->private_data;
}
static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
@@ -1059,6 +1067,7 @@ static void free_gigantic_page(struct page *page, unsigned int order)
free_contig_range(page_to_pfn(page), 1 << order);
}
+#ifdef CONFIG_CONTIG_ALLOC
static int __alloc_gigantic_page(unsigned long start_pfn,
unsigned long nr_pages, gfp_t gfp_mask)
{
@@ -1143,11 +1152,20 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
static void prep_compound_gigantic_page(struct page *page, unsigned int order);
+#else /* !CONFIG_CONTIG_ALLOC */
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
+{
+ return NULL;
+}
+#endif /* CONFIG_CONTIG_ALLOC */
#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
-static inline bool gigantic_page_supported(void) { return false; }
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nodemask) { return NULL; }
+ int nid, nodemask_t *nodemask)
+{
+ return NULL;
+}
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
unsigned int order) { }
@@ -1157,7 +1175,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
{
int i;
- if (hstate_is_gigantic(h) && !gigantic_page_supported())
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
h->nr_huge_pages--;
@@ -1258,12 +1276,23 @@ void free_huge_page(struct page *page)
ClearPagePrivate(page);
/*
- * A return code of zero implies that the subpool will be under its
- * minimum size if the reservation is not restored after page is free.
- * Therefore, force restore_reserve operation.
+ * If PagePrivate() was set on page, page allocation consumed a
+ * reservation. If the page was associated with a subpool, there
+ * would have been a page reserved in the subpool before allocation
+ * via hugepage_subpool_get_pages(). Since we are 'restoring' the
+ * reservtion, do not call hugepage_subpool_put_pages() as this will
+ * remove the reserved page from the subpool.
*/
- if (hugepage_subpool_put_pages(spool, 1) == 0)
- restore_reserve = true;
+ if (!restore_reserve) {
+ /*
+ * A return code of zero implies that the subpool will be
+ * under its minimum size if the reservation is not restored
+ * after page is free. Therefore, force restore_reserve
+ * operation.
+ */
+ if (hugepage_subpool_put_pages(spool, 1) == 0)
+ restore_reserve = true;
+ }
spin_lock(&hugetlb_lock);
clear_page_huge_active(page);
@@ -1574,8 +1603,9 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
*/
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
SetPageHugeTemporary(page);
+ spin_unlock(&hugetlb_lock);
put_page(page);
- page = NULL;
+ return NULL;
} else {
h->surplus_huge_pages++;
h->surplus_huge_pages_node[page_to_nid(page)]++;
@@ -2277,13 +2307,47 @@ found:
}
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
-static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
- nodemask_t *nodes_allowed)
+static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
+ nodemask_t *nodes_allowed)
{
unsigned long min_count, ret;
- if (hstate_is_gigantic(h) && !gigantic_page_supported())
- return h->max_huge_pages;
+ spin_lock(&hugetlb_lock);
+
+ /*
+ * Check for a node specific request.
+ * Changing node specific huge page count may require a corresponding
+ * change to the global count. In any case, the passed node mask
+ * (nodes_allowed) will restrict alloc/free to the specified node.
+ */
+ if (nid != NUMA_NO_NODE) {
+ unsigned long old_count = count;
+
+ count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
+ /*
+ * User may have specified a large count value which caused the
+ * above calculation to overflow. In this case, they wanted
+ * to allocate as many huge pages as possible. Set count to
+ * largest possible value to align with their intention.
+ */
+ if (count < old_count)
+ count = ULONG_MAX;
+ }
+
+ /*
+ * Gigantic pages runtime allocation depend on the capability for large
+ * page range allocation.
+ * If the system does not provide this feature, return an error when
+ * the user tries to allocate gigantic pages but let the user free the
+ * boottime allocated gigantic pages.
+ */
+ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
+ if (count > persistent_huge_pages(h)) {
+ spin_unlock(&hugetlb_lock);
+ return -EINVAL;
+ }
+ /* Fall through to decrease pool */
+ }
/*
* Increase the pool size
@@ -2296,7 +2360,6 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* pool might be one hugepage larger than it needs to be, but
* within all the constraints specified by the sysctls.
*/
- spin_lock(&hugetlb_lock);
while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, nodes_allowed, -1))
break;
@@ -2351,9 +2414,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
break;
}
out:
- ret = persistent_huge_pages(h);
+ h->max_huge_pages = persistent_huge_pages(h);
spin_unlock(&hugetlb_lock);
- return ret;
+
+ return 0;
}
#define HSTATE_ATTR_RO(_name) \
@@ -2403,41 +2467,32 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
unsigned long count, size_t len)
{
int err;
- NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
+ nodemask_t nodes_allowed, *n_mask;
- if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
- err = -EINVAL;
- goto out;
- }
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+ return -EINVAL;
if (nid == NUMA_NO_NODE) {
/*
* global hstate attribute
*/
if (!(obey_mempolicy &&
- init_nodemask_of_mempolicy(nodes_allowed))) {
- NODEMASK_FREE(nodes_allowed);
- nodes_allowed = &node_states[N_MEMORY];
- }
- } else if (nodes_allowed) {
+ init_nodemask_of_mempolicy(&nodes_allowed)))
+ n_mask = &node_states[N_MEMORY];
+ else
+ n_mask = &nodes_allowed;
+ } else {
/*
- * per node hstate attribute: adjust count to global,
- * but restrict alloc/free to the specified node.
+ * Node specific request. count adjustment happens in
+ * set_max_huge_pages() after acquiring hugetlb_lock.
*/
- count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
- init_nodemask_of_node(nodes_allowed, nid);
- } else
- nodes_allowed = &node_states[N_MEMORY];
-
- h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
+ init_nodemask_of_node(&nodes_allowed, nid);
+ n_mask = &nodes_allowed;
+ }
- if (nodes_allowed != &node_states[N_MEMORY])
- NODEMASK_FREE(nodes_allowed);
+ err = set_max_huge_pages(h, count, nid, n_mask);
- return len;
-out:
- NODEMASK_FREE(nodes_allowed);
- return err;
+ return err ? err : len;
}
static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
@@ -3247,7 +3302,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
if (cow) {
- mmu_notifier_range_init(&range, src, vma->vm_start,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
+ vma->vm_start,
vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
}
@@ -3359,7 +3415,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
/*
* If sharing possible, alert mmu notifiers of worst case.
*/
- mmu_notifier_range_init(&range, mm, start, end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
+ end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
mmu_notifier_invalidate_range_start(&range);
address = start;
@@ -3626,7 +3683,8 @@ retry_avoidcopy:
pages_per_huge_page(h));
__SetPageUptodate(new_page);
- mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h));
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
+ haddr + huge_page_size(h));
mmu_notifier_invalidate_range_start(&range);
/*
@@ -3777,8 +3835,7 @@ retry:
* handling userfault. Reacquire after handling
* fault to make calling code simpler.
*/
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
- idx, haddr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -3886,21 +3943,14 @@ backout_unlocked:
}
#ifdef CONFIG_SMP
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address)
{
unsigned long key[2];
u32 hash;
- if (vma->vm_flags & VM_SHARED) {
- key[0] = (unsigned long) mapping;
- key[1] = idx;
- } else {
- key[0] = (unsigned long) mm;
- key[1] = address >> huge_page_shift(h);
- }
+ key[0] = (unsigned long) mapping;
+ key[1] = idx;
hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
@@ -3911,9 +3961,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
* For uniprocesor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address)
{
return 0;
@@ -3958,7 +4006,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
@@ -4371,7 +4419,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* start/end. Set range.start/range.end to cover the maximum possible
* range if PMD sharing is possible.
*/
- mmu_notifier_range_init(&range, mm, start, end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
+ 0, vma, mm, start, end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
BUG_ON(address >= end);
@@ -4477,6 +4526,11 @@ int hugetlb_reserve_pages(struct inode *inode,
* called to make the mapping read-write. Assume !vma is a shm mapping
*/
if (!vma || vma->vm_flags & VM_MAYSHARE) {
+ /*
+ * resv_map can not be NULL as hugetlb_reserve_pages is only
+ * called for inodes for which resv_maps were created (see
+ * hugetlbfs_get_inode).
+ */
resv_map = inode_resv_map(inode);
chg = region_chg(resv_map, from, to);
@@ -4568,6 +4622,10 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
struct hugepage_subpool *spool = subpool_inode(inode);
long gbl_reserve;
+ /*
+ * Since this routine can be called in the evict inode path for all
+ * hugetlbfs inodes, resv_map could be NULL.
+ */
if (resv_map) {
chg = region_del(resv_map, start, end);
/*
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 449044378782..a335f7c1fac4 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1016,7 +1016,8 @@ static void collapse_huge_page(struct mm_struct *mm,
pte = pte_offset_map(pmd, address);
pte_ptl = pte_lockptr(mm, pmd);
- mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
+ address, address + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
@@ -1374,7 +1375,7 @@ static void collapse_shmem(struct mm_struct *mm,
result = SCAN_FAIL;
goto xa_locked;
}
- xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
+ xas_store(&xas, new_page);
nr_none++;
continue;
}
@@ -1450,7 +1451,7 @@ static void collapse_shmem(struct mm_struct *mm,
list_add_tail(&page->lru, &pagelist);
/* Finally, replace with the new page. */
- xas_store(&xas, new_page + (index % HPAGE_PMD_NR));
+ xas_store(&xas, new_page);
continue;
out_unlock:
unlock_page(page);
diff --git a/mm/ksm.c b/mm/ksm.c
index fc64874dc6f4..81c20ed57bf6 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1066,7 +1066,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
BUG_ON(PageTransCompound(page));
- mmu_notifier_range_init(&range, mm, pvmw.address,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ pvmw.address,
pvmw.address + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
@@ -1154,7 +1155,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
if (!pmd)
goto out;
- mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+ addr + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
diff --git a/mm/madvise.c b/mm/madvise.c
index bb3a4554d5d5..628022e674a7 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -472,7 +472,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
range.end = min(vma->vm_end, end_addr);
if (range.end <= vma->vm_start)
return -EINVAL;
- mmu_notifier_range_init(&range, mm, range.start, range.end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ range.start, range.end);
lru_add_drain();
tlb_gather_mmu(&tlb, mm, range.start, range.end);
diff --git a/mm/memblock.c b/mm/memblock.c
index a48f520c2d01..6bbad46f4d2c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -94,7 +94,7 @@
* :c:func:`mem_init` function frees all the memory to the buddy page
* allocator.
*
- * If an architecure enables %CONFIG_ARCH_DISCARD_MEMBLOCK, the
+ * Unless an architecure enables %CONFIG_ARCH_KEEP_MEMBLOCK, the
* memblock data structures will be discarded after the system
* initialization compltes.
*/
@@ -375,7 +375,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
}
}
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+#ifndef CONFIG_ARCH_KEEP_MEMBLOCK
/**
* memblock_discard - discard memory and reserved arrays if they were allocated
*/
@@ -1255,6 +1255,70 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
return 0;
}
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+/**
+ * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone()
+ *
+ * @idx: pointer to u64 loop variable
+ * @zone: zone in which all of the memory blocks reside
+ * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL
+ * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL
+ *
+ * This function is meant to be a zone/pfn specific wrapper for the
+ * for_each_mem_range type iterators. Specifically they are used in the
+ * deferred memory init routines and as such we were duplicating much of
+ * this logic throughout the code. So instead of having it in multiple
+ * locations it seemed like it would make more sense to centralize this to
+ * one new iterator that does everything they need.
+ */
+void __init_memblock
+__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
+ unsigned long *out_spfn, unsigned long *out_epfn)
+{
+ int zone_nid = zone_to_nid(zone);
+ phys_addr_t spa, epa;
+ int nid;
+
+ __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
+ &memblock.memory, &memblock.reserved,
+ &spa, &epa, &nid);
+
+ while (*idx != U64_MAX) {
+ unsigned long epfn = PFN_DOWN(epa);
+ unsigned long spfn = PFN_UP(spa);
+
+ /*
+ * Verify the end is at least past the start of the zone and
+ * that we have at least one PFN to initialize.
+ */
+ if (zone->zone_start_pfn < epfn && spfn < epfn) {
+ /* if we went too far just stop searching */
+ if (zone_end_pfn(zone) <= spfn) {
+ *idx = U64_MAX;
+ break;
+ }
+
+ if (out_spfn)
+ *out_spfn = max(zone->zone_start_pfn, spfn);
+ if (out_epfn)
+ *out_epfn = min(zone_end_pfn(zone), epfn);
+
+ return;
+ }
+
+ __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
+ &memblock.memory, &memblock.reserved,
+ &spa, &epa, &nid);
+ }
+
+ /* signal end of iteration */
+ if (out_spfn)
+ *out_spfn = ULONG_MAX;
+ if (out_epfn)
+ *out_epfn = 0;
+}
+
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
/**
* memblock_alloc_range_nid - allocate boot memory block
@@ -1923,7 +1987,7 @@ unsigned long __init memblock_free_all(void)
return pages;
}
-#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK)
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK)
static int memblock_debug_show(struct seq_file *m, void *private)
{
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 81a0d3914ec9..287933005e11 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -725,34 +725,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
}
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask)
-{
- struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
- unsigned long nr = 0;
- enum lru_list lru;
-
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
-
- for_each_lru(lru) {
- if (!(BIT(lru) & lru_mask))
- continue;
- nr += mem_cgroup_get_lru_size(lruvec, lru);
- }
- return nr;
-}
-
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
- unsigned int lru_mask)
-{
- unsigned long nr = 0;
- int nid;
-
- for_each_node_state(nid, N_MEMORY)
- nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
- return nr;
-}
-
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
enum mem_cgroup_events_target target)
{
@@ -1358,7 +1330,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
for (i = 0; i < NR_LRU_LISTS; i++)
pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
- K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
+ K(memcg_page_state(iter, NR_LRU_BASE + i)));
pr_cont("\n");
}
@@ -1422,11 +1394,15 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
int nid, bool noswap)
{
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
+ struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
+
+ if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
+ lruvec_page_state(lruvec, NR_ACTIVE_FILE))
return true;
if (noswap || !total_swap_pages)
return false;
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
+ if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
+ lruvec_page_state(lruvec, NR_ACTIVE_ANON))
return true;
return false;
@@ -2990,8 +2966,8 @@ static void accumulate_memcg_tree(struct mem_cgroup *memcg,
acc->events_array ? acc->events_array[i] : i);
for (i = 0; i < NR_LRU_LISTS; i++)
- acc->lru_pages[i] +=
- mem_cgroup_nr_lru_pages(mi, BIT(i));
+ acc->lru_pages[i] += memcg_page_state(mi,
+ NR_LRU_BASE + i);
}
}
@@ -3331,6 +3307,42 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
#endif
#ifdef CONFIG_NUMA
+
+#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
+#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
+#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
+
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid, unsigned int lru_mask)
+{
+ struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
+ unsigned long nr = 0;
+ enum lru_list lru;
+
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
+
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
+ }
+ return nr;
+}
+
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
+ unsigned int lru_mask)
+{
+ unsigned long nr = 0;
+ enum lru_list lru;
+
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
+ }
+ return nr;
+}
+
static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
struct numa_stat {
@@ -3421,7 +3433,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
- mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
+ memcg_page_state(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
@@ -3927,8 +3940,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
/* this should eventually include NR_UNSTABLE_NFS */
*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
- *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
- (1 << LRU_ACTIVE_FILE));
+ *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
+ memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
*pheadroom = PAGE_COUNTER_MAX;
while ((parent = parent_mem_cgroup(memcg))) {
diff --git a/mm/memfd.c b/mm/memfd.c
index 650e65a46b9c..2647c898990c 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -39,6 +39,7 @@ static void memfd_tag_pins(struct xa_state *xas)
xas_for_each(xas, page, ULONG_MAX) {
if (xa_is_value(page))
continue;
+ page = find_subpage(page, xas->xa_index);
if (page_count(page) - page_mapcount(page) > 1)
xas_set_mark(xas, MEMFD_TAG_PINNED);
@@ -88,6 +89,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
bool clear = true;
if (xa_is_value(page))
continue;
+ page = find_subpage(page, xas.xa_index);
if (page_count(page) - page_mapcount(page) != 1) {
/*
* On the last scan, we clean up all those tags
diff --git a/mm/memory.c b/mm/memory.c
index f7d962d7de19..96f1d473c89a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1010,7 +1010,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
is_cow = is_cow_mapping(vma->vm_flags);
if (is_cow) {
- mmu_notifier_range_init(&range, src_mm, addr, end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
+ 0, vma, src_mm, addr, end);
mmu_notifier_invalidate_range_start(&range);
}
@@ -1334,7 +1335,8 @@ void unmap_vmas(struct mmu_gather *tlb,
{
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+ start_addr, end_addr);
mmu_notifier_invalidate_range_start(&range);
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
@@ -1356,7 +1358,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
struct mmu_gather tlb;
lru_add_drain();
- mmu_notifier_range_init(&range, vma->vm_mm, start, start + size);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ start, start + size);
tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
@@ -1382,7 +1385,8 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
struct mmu_gather tlb;
lru_add_drain();
- mmu_notifier_range_init(&range, vma->vm_mm, address, address + size);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address, address + size);
tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
@@ -1523,6 +1527,87 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vm_insert_page);
+/*
+ * __vm_map_pages - maps range of kernel pages into user vma
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ * @offset: user's requested vm_pgoff
+ *
+ * This allows drivers to map range of kernel pages into a user vma.
+ *
+ * Return: 0 on success and error code otherwise.
+ */
+static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num, unsigned long offset)
+{
+ unsigned long count = vma_pages(vma);
+ unsigned long uaddr = vma->vm_start;
+ int ret, i;
+
+ /* Fail if the user requested offset is beyond the end of the object */
+ if (offset > num)
+ return -ENXIO;
+
+ /* Fail if the user requested size exceeds available object size */
+ if (count > num - offset)
+ return -ENXIO;
+
+ for (i = 0; i < count; i++) {
+ ret = vm_insert_page(vma, uaddr, pages[offset + i]);
+ if (ret < 0)
+ return ret;
+ uaddr += PAGE_SIZE;
+ }
+
+ return 0;
+}
+
+/**
+ * vm_map_pages - maps range of kernel pages starts with non zero offset
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ *
+ * Maps an object consisting of @num pages, catering for the user's
+ * requested vm_pgoff
+ *
+ * If we fail to insert any page into the vma, the function will return
+ * immediately leaving any previously inserted pages present. Callers
+ * from the mmap handler may immediately return the error as their caller
+ * will destroy the vma, removing any successfully inserted pages. Other
+ * callers should make their own arrangements for calling unmap_region().
+ *
+ * Context: Process context. Called by mmap handlers.
+ * Return: 0 on success and error code otherwise.
+ */
+int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
+}
+EXPORT_SYMBOL(vm_map_pages);
+
+/**
+ * vm_map_pages_zero - map range of kernel pages starts with zero offset
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ *
+ * Similar to vm_map_pages(), except that it explicitly sets the offset
+ * to 0. This function is intended for the drivers that did not consider
+ * vm_pgoff.
+ *
+ * Context: Process context. Called by mmap handlers.
+ * Return: 0 on success and error code otherwise.
+ */
+int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return __vm_map_pages(vma, pages, num, 0);
+}
+EXPORT_SYMBOL(vm_map_pages_zero);
+
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn, pgprot_t prot, bool mkwrite)
{
@@ -2279,7 +2364,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
__SetPageUptodate(new_page);
- mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ vmf->address & PAGE_MASK,
(vmf->address & PAGE_MASK) + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
@@ -4104,8 +4190,9 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
goto out;
if (range) {
- mmu_notifier_range_init(range, mm, address & PMD_MASK,
- (address & PMD_MASK) + PMD_SIZE);
+ mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
+ NULL, mm, address & PMD_MASK,
+ (address & PMD_MASK) + PMD_SIZE);
mmu_notifier_invalidate_range_start(range);
}
*ptlp = pmd_lock(mm, pmd);
@@ -4122,8 +4209,9 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
goto out;
if (range) {
- mmu_notifier_range_init(range, mm, address & PAGE_MASK,
- (address & PAGE_MASK) + PAGE_SIZE);
+ mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
+ address & PAGE_MASK,
+ (address & PAGE_MASK) + PAGE_SIZE);
mmu_notifier_invalidate_range_start(range);
}
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b236069ff0d8..6c0c4f48638e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -273,12 +273,12 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
* add the new pages.
*/
int __ref __add_pages(int nid, unsigned long phys_start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap,
- bool want_memblock)
+ unsigned long nr_pages, struct mhp_restrictions *restrictions)
{
unsigned long i;
int err = 0;
int start_sec, end_sec;
+ struct vmem_altmap *altmap = restrictions->altmap;
/* during initialize mem_map, align hot-added range to section */
start_sec = pfn_to_section_nr(phys_start_pfn);
@@ -299,7 +299,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
for (i = start_sec; i <= end_sec; i++) {
err = __add_section(nid, section_nr_to_pfn(i), altmap,
- want_memblock);
+ restrictions->flags & MHP_MEMBLOCK_API);
/*
* EEXIST is finally dealt with by ioresource collision
@@ -516,26 +516,23 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
pgdat_resize_unlock(zone->zone_pgdat, &flags);
}
-static int __remove_section(struct zone *zone, struct mem_section *ms,
- unsigned long map_offset, struct vmem_altmap *altmap)
+static void __remove_section(struct zone *zone, struct mem_section *ms,
+ unsigned long map_offset,
+ struct vmem_altmap *altmap)
{
unsigned long start_pfn;
int scn_nr;
- int ret = -EINVAL;
- if (!valid_section(ms))
- return ret;
+ if (WARN_ON_ONCE(!valid_section(ms)))
+ return;
- ret = unregister_memory_section(ms);
- if (ret)
- return ret;
+ unregister_memory_section(ms);
scn_nr = __section_nr(ms);
start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
__remove_zone(zone, start_pfn);
sparse_remove_one_section(zone, ms, map_offset, altmap);
- return 0;
}
/**
@@ -550,31 +547,17 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
* sure that pages are marked reserved and zones are adjust properly by
* calling offline_pages().
*/
-int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap)
+void __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
+ unsigned long nr_pages, struct vmem_altmap *altmap)
{
unsigned long i;
unsigned long map_offset = 0;
- int sections_to_remove, ret = 0;
+ int sections_to_remove;
/* In the ZONE_DEVICE case device driver owns the memory region */
if (is_dev_zone(zone)) {
if (altmap)
map_offset = vmem_altmap_offset(altmap);
- } else {
- resource_size_t start, size;
-
- start = phys_start_pfn << PAGE_SHIFT;
- size = nr_pages * PAGE_SIZE;
-
- ret = release_mem_region_adjustable(&iomem_resource, start,
- size);
- if (ret) {
- resource_size_t endres = start + size - 1;
-
- pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
- &start, &endres, ret);
- }
}
clear_zone_contiguous(zone);
@@ -590,16 +573,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
cond_resched();
- ret = __remove_section(zone, __pfn_to_section(pfn), map_offset,
- altmap);
+ __remove_section(zone, __pfn_to_section(pfn), map_offset,
+ altmap);
map_offset = 0;
- if (ret)
- break;
}
set_zone_contiguous(zone);
-
- return ret;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
@@ -714,7 +693,7 @@ static void node_states_check_changes_online(unsigned long nr_pages,
if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
arg->status_change_nid_normal = nid;
#ifdef CONFIG_HIGHMEM
- if (zone_idx(zone) <= N_HIGH_MEMORY && !node_state(nid, N_HIGH_MEMORY))
+ if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
arg->status_change_nid_high = nid;
#endif
}
@@ -1097,6 +1076,9 @@ static int online_memory_block(struct memory_block *mem, void *arg)
*/
int __ref add_memory_resource(int nid, struct resource *res)
{
+ struct mhp_restrictions restrictions = {
+ .flags = MHP_MEMBLOCK_API,
+ };
u64 start, size;
bool new_node = false;
int ret;
@@ -1124,7 +1106,7 @@ int __ref add_memory_resource(int nid, struct resource *res)
new_node = ret;
/* call arch's memory hotadd */
- ret = arch_add_memory(nid, start, size, NULL, true);
+ ret = arch_add_memory(nid, start, size, &restrictions);
if (ret < 0)
goto error;
@@ -1341,8 +1323,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
if (!PageHuge(page))
continue;
head = compound_head(page);
- if (hugepage_migration_supported(page_hstate(head)) &&
- page_huge_active(head))
+ if (page_huge_active(head))
return pfn;
skip = (1 << compound_order(head)) - (page - head);
pfn += skip - 1;
@@ -1382,10 +1363,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (PageHuge(page)) {
struct page *head = compound_head(page);
- if (compound_order(head) > PFN_SECTION_SHIFT) {
- ret = -EBUSY;
- break;
- }
pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
isolate_huge_page(head, &source);
continue;
@@ -1454,15 +1431,10 @@ static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
void *data)
{
- __offline_isolated_pages(start, start + nr_pages);
- return 0;
-}
+ unsigned long *offlined_pages = (unsigned long *)data;
-static void
-offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
-{
- walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
- offline_isolated_pages_cb);
+ *offlined_pages += __offline_isolated_pages(start, start + nr_pages);
+ return 0;
}
/*
@@ -1472,26 +1444,7 @@ static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
void *data)
{
- int ret;
- long offlined = *(long *)data;
- ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
- offlined = nr_pages;
- if (!ret)
- *(long *)data += offlined;
- return ret;
-}
-
-static long
-check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
-{
- long offlined = 0;
- int ret;
-
- ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
- check_pages_isolated_cb);
- if (ret < 0)
- offlined = (long)ret;
- return offlined;
+ return test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
}
static int __init cmdline_parse_movable_node(char *p)
@@ -1576,7 +1529,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
unsigned long end_pfn)
{
unsigned long pfn, nr_pages;
- long offlined_pages;
+ unsigned long offlined_pages = 0;
int ret, node, nr_isolate_pageblock;
unsigned long flags;
unsigned long valid_start, valid_end;
@@ -1652,14 +1605,15 @@ static int __ref __offline_pages(unsigned long start_pfn,
goto failed_removal_isolated;
}
/* check again */
- offlined_pages = check_pages_isolated(start_pfn, end_pfn);
- } while (offlined_pages < 0);
+ ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
+ NULL, check_pages_isolated_cb);
+ } while (ret);
- pr_info("Offlined Pages %ld\n", offlined_pages);
/* Ok, all of our target is isolated.
We cannot do rollback at this point. */
- offline_isolated_pages(start_pfn, end_pfn);
-
+ walk_system_ram_range(start_pfn, end_pfn - start_pfn,
+ &offlined_pages, offline_isolated_pages_cb);
+ pr_info("Offlined Pages %ld\n", offlined_pages);
/*
* Onlining will reset pagetype flags and makes migrate type
* MOVABLE, so just need to decrease the number of isolated
@@ -1843,6 +1797,26 @@ void try_offline_node(int nid)
}
EXPORT_SYMBOL(try_offline_node);
+static void __release_memory_resource(resource_size_t start,
+ resource_size_t size)
+{
+ int ret;
+
+ /*
+ * When removing memory in the same granularity as it was added,
+ * this function never fails. It might only fail if resources
+ * have to be adjusted or split. We'll ignore the error, as
+ * removing of memory cannot fail.
+ */
+ ret = release_mem_region_adjustable(&iomem_resource, start, size);
+ if (ret) {
+ resource_size_t endres = start + size - 1;
+
+ pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
+ &start, &endres, ret);
+ }
+}
+
/**
* remove_memory
* @nid: the node ID
@@ -1877,6 +1851,7 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
memblock_remove(start, size);
arch_remove_memory(nid, start, size, NULL);
+ __release_memory_resource(start, size);
try_offline_node(nid);
diff --git a/mm/migrate.c b/mm/migrate.c
index 663a5449367a..f2ecc2855a12 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -463,7 +463,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
for (i = 1; i < HPAGE_PMD_NR; i++) {
xas_next(&xas);
- xas_store(&xas, newpage + i);
+ xas_store(&xas, newpage);
}
}
@@ -2356,7 +2356,8 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
mm_walk.mm = migrate->vma->vm_mm;
mm_walk.private = migrate;
- mmu_notifier_range_init(&range, mm_walk.mm, migrate->start,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm_walk.mm,
+ migrate->start,
migrate->end);
mmu_notifier_invalidate_range_start(&range);
walk_page_range(migrate->start, migrate->end, &mm_walk);
@@ -2764,6 +2765,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
notified = true;
mmu_notifier_range_init(&range,
+ MMU_NOTIFY_CLEAR, 0,
+ NULL,
migrate->vma->vm_mm,
addr, migrate->end);
mmu_notifier_invalidate_range_start(&range);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 9c884abc7850..ee36068077b6 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -180,7 +180,7 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
if (_ret) {
pr_info("%pS callback failed with %d in %sblockable context.\n",
mn->ops->invalidate_range_start, _ret,
- !range->blockable ? "non-" : "");
+ !mmu_notifier_range_blockable(range) ? "non-" : "");
ret = _ret;
}
}
@@ -395,3 +395,13 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn,
mmdrop(mm);
}
EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release);
+
+bool
+mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range)
+{
+ if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA)
+ return false;
+ /* Return true if the vma still have the read flag set. */
+ return range->vma->vm_flags & VM_READ;
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 028c724dcb1a..bf38dfbbb4b4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -39,7 +39,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable, int prot_numa)
{
- struct mm_struct *mm = vma->vm_mm;
pte_t *pte, oldpte;
spinlock_t *ptl;
unsigned long pages = 0;
@@ -136,7 +135,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
newpte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(oldpte))
newpte = pte_swp_mksoft_dirty(newpte);
- set_pte_at(mm, addr, pte, newpte);
+ set_pte_at(vma->vm_mm, addr, pte, newpte);
pages++;
}
@@ -150,7 +149,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
*/
make_device_private_entry_read(&entry);
newpte = swp_entry_to_pte(entry);
- set_pte_at(mm, addr, pte, newpte);
+ set_pte_at(vma->vm_mm, addr, pte, newpte);
pages++;
}
@@ -185,7 +184,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
/* invoke the mmu notifier if the pmd is populated */
if (!range.start) {
- mmu_notifier_range_init(&range, vma->vm_mm, addr, end);
+ mmu_notifier_range_init(&range,
+ MMU_NOTIFY_PROTECTION_VMA, 0,
+ vma, vma->vm_mm, addr, end);
mmu_notifier_invalidate_range_start(&range);
}
diff --git a/mm/mremap.c b/mm/mremap.c
index e3edef6b7a12..fc241d23cd97 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -249,7 +249,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
- mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+ old_addr, old_end);
mmu_notifier_invalidate_range_start(&range);
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
diff --git a/mm/nommu.c b/mm/nommu.c
index 749276beb109..b492fd1fcf9f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -473,6 +473,20 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vm_insert_page);
+int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL(vm_map_pages);
+
+int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL(vm_map_pages_zero);
+
/*
* sys_brk() for the most part doesn't need the global kernel
* lock, except when an application is doing something nasty
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3a2484884cfd..539c91d0b26a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -531,7 +531,8 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
struct mmu_notifier_range range;
struct mmu_gather tlb;
- mmu_notifier_range_init(&range, mm, vma->vm_start,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
+ vma, mm, vma->vm_start,
vma->vm_end);
tlb_gather_mmu(&tlb, mm, range.start, range.end);
if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 9f61dfec6a1f..07656485c0e6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2808,6 +2808,18 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
}
EXPORT_SYMBOL(__test_set_page_writeback);
+/*
+ * Wait for a page to complete writeback
+ */
+void wait_on_page_writeback(struct page *page)
+{
+ if (PageWriteback(page)) {
+ trace_wait_on_page_writeback(page, page_mapping(page));
+ wait_on_page_bit(page, PG_writeback);
+ }
+}
+EXPORT_SYMBOL_GPL(wait_on_page_writeback);
+
/**
* wait_for_stable_page() - wait for writeback to finish, if necessary.
* @page: The page to wait on.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 59661106da16..f2f3fb4921d1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1416,36 +1416,22 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
#endif
#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-static inline bool __meminit __maybe_unused
-meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
+/* Only safe to use early in boot when initialisation is single-threaded */
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
int nid;
- nid = __early_pfn_to_nid(pfn, state);
+ nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
if (nid >= 0 && nid != node)
return false;
return true;
}
-/* Only safe to use early in boot when initialisation is single-threaded */
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
- return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
-}
-
#else
-
static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
return true;
}
-static inline bool __meminit __maybe_unused
-meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
-{
- return true;
-}
#endif
@@ -1574,21 +1560,13 @@ static inline void __init pgdat_init_report_one_done(void)
*
* Then, we check if a current large page is valid by only checking the validity
* of the head pfn.
- *
- * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
- * within a node: a pfn is between start and end of a node, but does not belong
- * to this memory node.
*/
-static inline bool __init
-deferred_pfn_valid(int nid, unsigned long pfn,
- struct mminit_pfnnid_cache *nid_init_state)
+static inline bool __init deferred_pfn_valid(unsigned long pfn)
{
if (!pfn_valid_within(pfn))
return false;
if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
return false;
- if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
- return false;
return true;
}
@@ -1596,15 +1574,14 @@ deferred_pfn_valid(int nid, unsigned long pfn,
* Free pages to buddy allocator. Try to free aligned pages in
* pageblock_nr_pages sizes.
*/
-static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
+static void __init deferred_free_pages(unsigned long pfn,
unsigned long end_pfn)
{
- struct mminit_pfnnid_cache nid_init_state = { };
unsigned long nr_pgmask = pageblock_nr_pages - 1;
unsigned long nr_free = 0;
for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+ if (!deferred_pfn_valid(pfn)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 0;
} else if (!(pfn & nr_pgmask)) {
@@ -1624,17 +1601,18 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
* by performing it only once every pageblock_nr_pages.
* Return number of pages initialized.
*/
-static unsigned long __init deferred_init_pages(int nid, int zid,
+static unsigned long __init deferred_init_pages(struct zone *zone,
unsigned long pfn,
unsigned long end_pfn)
{
- struct mminit_pfnnid_cache nid_init_state = { };
unsigned long nr_pgmask = pageblock_nr_pages - 1;
+ int nid = zone_to_nid(zone);
unsigned long nr_pages = 0;
+ int zid = zone_idx(zone);
struct page *page = NULL;
for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+ if (!deferred_pfn_valid(pfn)) {
page = NULL;
continue;
} else if (!page || !(pfn & nr_pgmask)) {
@@ -1649,18 +1627,100 @@ static unsigned long __init deferred_init_pages(int nid, int zid,
return (nr_pages);
}
+/*
+ * This function is meant to pre-load the iterator for the zone init.
+ * Specifically it walks through the ranges until we are caught up to the
+ * first_init_pfn value and exits there. If we never encounter the value we
+ * return false indicating there are no valid ranges left.
+ */
+static bool __init
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
+ unsigned long *spfn, unsigned long *epfn,
+ unsigned long first_init_pfn)
+{
+ u64 j;
+
+ /*
+ * Start out by walking through the ranges in this zone that have
+ * already been initialized. We don't need to do anything with them
+ * so we just need to flush them out of the system.
+ */
+ for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
+ if (*epfn <= first_init_pfn)
+ continue;
+ if (*spfn < first_init_pfn)
+ *spfn = first_init_pfn;
+ *i = j;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Initialize and free pages. We do it in two loops: first we initialize
+ * struct page, then free to buddy allocator, because while we are
+ * freeing pages we can access pages that are ahead (computing buddy
+ * page in __free_one_page()).
+ *
+ * In order to try and keep some memory in the cache we have the loop
+ * broken along max page order boundaries. This way we will not cause
+ * any issues with the buddy page computation.
+ */
+static unsigned long __init
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
+ unsigned long *end_pfn)
+{
+ unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
+ unsigned long spfn = *start_pfn, epfn = *end_pfn;
+ unsigned long nr_pages = 0;
+ u64 j = *i;
+
+ /* First we loop through and initialize the page values */
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
+ unsigned long t;
+
+ if (mo_pfn <= *start_pfn)
+ break;
+
+ t = min(mo_pfn, *end_pfn);
+ nr_pages += deferred_init_pages(zone, *start_pfn, t);
+
+ if (mo_pfn < *end_pfn) {
+ *start_pfn = mo_pfn;
+ break;
+ }
+ }
+
+ /* Reset values and now loop through freeing pages as needed */
+ swap(j, *i);
+
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
+ unsigned long t;
+
+ if (mo_pfn <= spfn)
+ break;
+
+ t = min(mo_pfn, epfn);
+ deferred_free_pages(spfn, t);
+
+ if (mo_pfn <= epfn)
+ break;
+ }
+
+ return nr_pages;
+}
+
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
{
pg_data_t *pgdat = data;
- int nid = pgdat->node_id;
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ unsigned long spfn = 0, epfn = 0, nr_pages = 0;
+ unsigned long first_init_pfn, flags;
unsigned long start = jiffies;
- unsigned long nr_pages = 0;
- unsigned long spfn, epfn, first_init_pfn, flags;
- phys_addr_t spa, epa;
- int zid;
struct zone *zone;
- const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ int zid;
u64 i;
/* Bind memory initialisation thread to a local node if possible */
@@ -1686,31 +1746,27 @@ static int __init deferred_init_memmap(void *data)
if (first_init_pfn < zone_end_pfn(zone))
break;
}
- first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
+
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_init_pfn))
+ goto zone_empty;
/*
- * Initialize and free pages. We do it in two loops: first we initialize
- * struct page, than free to buddy allocator, because while we are
- * freeing pages we can access pages that are ahead (computing buddy
- * page in __free_one_page()).
+ * Initialize and free pages in MAX_ORDER sized increments so
+ * that we can avoid introducing any issues with the buddy
+ * allocator.
*/
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
- nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
- }
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
- deferred_free_pages(nid, zid, spfn, epfn);
- }
+ while (spfn < epfn)
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+zone_empty:
pgdat_resize_unlock(pgdat, &flags);
/* Sanity check that the next zone really is unpopulated */
WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
- pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
- jiffies_to_msecs(jiffies - start));
+ pr_info("node %d initialised, %lu pages in %ums\n",
+ pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start));
pgdat_init_report_one_done();
return 0;
@@ -1734,14 +1790,11 @@ static int __init deferred_init_memmap(void *data)
static noinline bool __init
deferred_grow_zone(struct zone *zone, unsigned int order)
{
- int zid = zone_idx(zone);
- int nid = zone_to_nid(zone);
- pg_data_t *pgdat = NODE_DATA(nid);
unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
- unsigned long nr_pages = 0;
- unsigned long first_init_pfn, spfn, epfn, t, flags;
+ pg_data_t *pgdat = zone->zone_pgdat;
unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
- phys_addr_t spa, epa;
+ unsigned long spfn, epfn, flags;
+ unsigned long nr_pages = 0;
u64 i;
/* Only the last zone may have deferred pages */
@@ -1770,38 +1823,35 @@ deferred_grow_zone(struct zone *zone, unsigned int order)
return true;
}
- first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
-
- if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_deferred_pfn)) {
+ pgdat->first_deferred_pfn = ULONG_MAX;
pgdat_resize_unlock(pgdat, &flags);
- return false;
+ return true;
}
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+ /*
+ * Initialize and free pages in MAX_ORDER sized increments so
+ * that we can avoid introducing any issues with the buddy
+ * allocator.
+ */
+ while (spfn < epfn) {
+ /* update our first deferred PFN for this section */
+ first_deferred_pfn = spfn;
- while (spfn < epfn && nr_pages < nr_pages_needed) {
- t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
- first_deferred_pfn = min(t, epfn);
- nr_pages += deferred_init_pages(nid, zid, spfn,
- first_deferred_pfn);
- spfn = first_deferred_pfn;
- }
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+
+ /* We should only stop along section boundaries */
+ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
+ continue;
+ /* If our quota has been met we can stop here */
if (nr_pages >= nr_pages_needed)
break;
}
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
- deferred_free_pages(nid, zid, spfn, epfn);
-
- if (first_deferred_pfn == epfn)
- break;
- }
- pgdat->first_deferred_pfn = first_deferred_pfn;
+ pgdat->first_deferred_pfn = spfn;
pgdat_resize_unlock(pgdat, &flags);
return nr_pages > 0;
@@ -1846,10 +1896,9 @@ void __init page_alloc_init_late(void)
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
#endif
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+
/* Discard memblock private memory */
memblock_discard();
-#endif
for_each_populated_zone(zone)
set_zone_contiguous(zone);
@@ -3120,9 +3169,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
- struct zone *zone, unsigned int order,
- gfp_t gfp_flags, int migratetype,
- unsigned int alloc_flags)
+ struct zone *zone, gfp_t gfp_flags,
+ int migratetype, unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
@@ -3134,7 +3182,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
list = &pcp->lists[migratetype];
page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
if (page) {
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone);
}
local_irq_restore(flags);
@@ -3154,8 +3202,8 @@ struct page *rmqueue(struct zone *preferred_zone,
struct page *page;
if (likely(order == 0)) {
- page = rmqueue_pcplist(preferred_zone, zone, order,
- gfp_flags, migratetype, alloc_flags);
+ page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
+ migratetype, alloc_flags);
goto out;
}
@@ -4821,7 +4869,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
/**
* alloc_pages_exact - allocate an exact number physically-contiguous pages.
* @size: the number of bytes to allocate
- * @gfp_mask: GFP flags for the allocation
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
*
* This function is similar to alloc_pages(), except that it allocates the
* minimum number of pages to satisfy the request. alloc_pages() can only
@@ -4838,6 +4886,9 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
unsigned int order = get_order(size);
unsigned long addr;
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
+ gfp_mask &= ~__GFP_COMP;
+
addr = __get_free_pages(gfp_mask, order);
return make_alloc_exact(addr, order, size);
}
@@ -4848,7 +4899,7 @@ EXPORT_SYMBOL(alloc_pages_exact);
* pages on a node.
* @nid: the preferred node ID where memory should be allocated
* @size: the number of bytes to allocate
- * @gfp_mask: GFP flags for the allocation
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
*
* Like alloc_pages_exact(), but try to allocate on node nid first before falling
* back.
@@ -4858,7 +4909,12 @@ EXPORT_SYMBOL(alloc_pages_exact);
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
unsigned int order = get_order(size);
- struct page *p = alloc_pages_node(nid, gfp_mask, order);
+ struct page *p;
+
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
+ gfp_mask &= ~__GFP_COMP;
+
+ p = alloc_pages_node(nid, gfp_mask, order);
if (!p)
return NULL;
return make_alloc_exact((unsigned long)page_address(p), order, size);
@@ -6247,13 +6303,15 @@ static unsigned long __init zone_spanned_pages_in_node(int nid,
unsigned long *zone_end_pfn,
unsigned long *ignored)
{
+ unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+ unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
/* Get the start and end of the zone */
- *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
- *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+ *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+ *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
zone_start_pfn, zone_end_pfn);
@@ -8129,8 +8187,7 @@ unmovable:
return true;
}
-#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
-
+#ifdef CONFIG_CONTIG_ALLOC
static unsigned long pfn_max_align_down(unsigned long pfn)
{
return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
@@ -8339,8 +8396,9 @@ done:
pfn_max_align_up(end), migratetype);
return ret;
}
+#endif /* CONFIG_CONTIG_ALLOC */
-void free_contig_range(unsigned long pfn, unsigned nr_pages)
+void free_contig_range(unsigned long pfn, unsigned int nr_pages)
{
unsigned int count = 0;
@@ -8352,7 +8410,6 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
}
WARN(count != 0, "%d pages are still in use!\n", count);
}
-#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/*
@@ -8394,7 +8451,7 @@ void zone_pcp_reset(struct zone *zone)
* All pages in the range must be in a single zone and isolated
* before calling this.
*/
-void
+unsigned long
__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
struct page *page;
@@ -8402,12 +8459,15 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
unsigned int order, i;
unsigned long pfn;
unsigned long flags;
+ unsigned long offlined_pages = 0;
+
/* find the first valid pfn */
for (pfn = start_pfn; pfn < end_pfn; pfn++)
if (pfn_valid(pfn))
break;
if (pfn == end_pfn)
- return;
+ return offlined_pages;
+
offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
spin_lock_irqsave(&zone->lock, flags);
@@ -8425,12 +8485,14 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
pfn++;
SetPageReserved(page);
+ offlined_pages++;
continue;
}
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
order = page_order(page);
+ offlined_pages += 1 << order;
#ifdef CONFIG_DEBUG_VM
pr_info("remove from free list %lx %d %lx\n",
pfn, 1 << order, end_pfn);
@@ -8443,6 +8505,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
+
+ return offlined_pages;
}
#endif
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 019280712e1b..e3638a5bafff 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -151,8 +151,6 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
for (i = 0; i < nr_pages; i++) {
struct page *page;
- if (!pfn_valid_within(pfn + i))
- continue;
page = pfn_to_online_page(pfn + i);
if (!page)
continue;
diff --git a/mm/rmap.c b/mm/rmap.c
index b30c7c71d1d9..e5dfe2ae6b0d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -850,7 +850,7 @@ int page_referenced(struct page *page,
};
*vm_flags = 0;
- if (!page_mapped(page))
+ if (!pra.mapcount)
return 0;
if (!page_rmapping(page))
@@ -896,7 +896,8 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
* We have to assume the worse case ie pmd for invalidation. Note that
* the page can not be free from this function.
*/
- mmu_notifier_range_init(&range, vma->vm_mm, address,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
+ 0, vma, vma->vm_mm, address,
min(vma->vm_end, address +
(PAGE_SIZE << compound_order(page))));
mmu_notifier_invalidate_range_start(&range);
@@ -928,7 +929,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
continue;
flush_cache_page(vma, address, page_to_pfn(page));
- entry = pmdp_huge_clear_flush(vma, address, pmd);
+ entry = pmdp_invalidate(vma, address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
@@ -1371,7 +1372,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* Note that the page can not be free in this function as call of
* try_to_unmap() must hold a reference on the page.
*/
- mmu_notifier_range_init(&range, vma->vm_mm, address,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address,
min(vma->vm_end, address +
(PAGE_SIZE << compound_order(page))));
if (PageHuge(page)) {
diff --git a/mm/shmem.c b/mm/shmem.c
index f4dce9c8670d..1bb3b8dc8bb2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -614,7 +614,7 @@ static int shmem_add_to_page_cache(struct page *page,
if (xas_error(&xas))
goto unlock;
next:
- xas_store(&xas, page + i);
+ xas_store(&xas, page);
if (++i < nr) {
xas_next(&xas);
goto next;
diff --git a/mm/slab.c b/mm/slab.c
index 284ab737faee..2915d912e89a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -990,10 +990,8 @@ static void cpuup_canceled(long cpu)
/* cpu is dead; no one can alloc from it. */
nc = per_cpu_ptr(cachep->cpu_cache, cpu);
- if (nc) {
- free_block(cachep, nc->entry, nc->avail, node, &list);
- nc->avail = 0;
- }
+ free_block(cachep, nc->entry, nc->avail, node, &list);
+ nc->avail = 0;
if (!cpumask_empty(mask)) {
spin_unlock_irq(&n->list_lock);
@@ -1674,8 +1672,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
{
struct page *page, *n;
- list_for_each_entry_safe(page, n, list, lru) {
- list_del(&page->lru);
+ list_for_each_entry_safe(page, n, list, slab_list) {
+ list_del(&page->slab_list);
slab_destroy(cachep, page);
}
}
@@ -2231,8 +2229,8 @@ static int drain_freelist(struct kmem_cache *cache,
goto out;
}
- page = list_entry(p, struct page, lru);
- list_del(&page->lru);
+ page = list_entry(p, struct page, slab_list);
+ list_del(&page->slab_list);
n->free_slabs--;
n->total_slabs--;
/*
@@ -2691,13 +2689,13 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
if (!page)
return;
- INIT_LIST_HEAD(&page->lru);
+ INIT_LIST_HEAD(&page->slab_list);
n = get_node(cachep, page_to_nid(page));
spin_lock(&n->list_lock);
n->total_slabs++;
if (!page->active) {
- list_add_tail(&page->lru, &(n->slabs_free));
+ list_add_tail(&page->slab_list, &n->slabs_free);
n->free_slabs++;
} else
fixup_slab_list(cachep, n, page, &list);
@@ -2806,9 +2804,9 @@ static inline void fixup_slab_list(struct kmem_cache *cachep,
void **list)
{
/* move slabp to correct slabp list: */
- list_del(&page->lru);
+ list_del(&page->slab_list);
if (page->active == cachep->num) {
- list_add(&page->lru, &n->slabs_full);
+ list_add(&page->slab_list, &n->slabs_full);
if (OBJFREELIST_SLAB(cachep)) {
#if DEBUG
/* Poisoning will be done without holding the lock */
@@ -2822,7 +2820,7 @@ static inline void fixup_slab_list(struct kmem_cache *cachep,
page->freelist = NULL;
}
} else
- list_add(&page->lru, &n->slabs_partial);
+ list_add(&page->slab_list, &n->slabs_partial);
}
/* Try to find non-pfmemalloc slab if needed */
@@ -2845,20 +2843,20 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
}
/* Move pfmemalloc slab to the end of list to speed up next search */
- list_del(&page->lru);
+ list_del(&page->slab_list);
if (!page->active) {
- list_add_tail(&page->lru, &n->slabs_free);
+ list_add_tail(&page->slab_list, &n->slabs_free);
n->free_slabs++;
} else
- list_add_tail(&page->lru, &n->slabs_partial);
+ list_add_tail(&page->slab_list, &n->slabs_partial);
- list_for_each_entry(page, &n->slabs_partial, lru) {
+ list_for_each_entry(page, &n->slabs_partial, slab_list) {
if (!PageSlabPfmemalloc(page))
return page;
}
n->free_touched = 1;
- list_for_each_entry(page, &n->slabs_free, lru) {
+ list_for_each_entry(page, &n->slabs_free, slab_list) {
if (!PageSlabPfmemalloc(page)) {
n->free_slabs--;
return page;
@@ -2873,11 +2871,12 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
struct page *page;
assert_spin_locked(&n->list_lock);
- page = list_first_entry_or_null(&n->slabs_partial, struct page, lru);
+ page = list_first_entry_or_null(&n->slabs_partial, struct page,
+ slab_list);
if (!page) {
n->free_touched = 1;
page = list_first_entry_or_null(&n->slabs_free, struct page,
- lru);
+ slab_list);
if (page)
n->free_slabs--;
}
@@ -3378,29 +3377,29 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
objp = objpp[i];
page = virt_to_head_page(objp);
- list_del(&page->lru);
+ list_del(&page->slab_list);
check_spinlock_acquired_node(cachep, node);
slab_put_obj(cachep, page, objp);
STATS_DEC_ACTIVE(cachep);
/* fixup slab chains */
if (page->active == 0) {
- list_add(&page->lru, &n->slabs_free);
+ list_add(&page->slab_list, &n->slabs_free);
n->free_slabs++;
} else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
*/
- list_add_tail(&page->lru, &n->slabs_partial);
+ list_add_tail(&page->slab_list, &n->slabs_partial);
}
}
while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
n->free_objects -= cachep->num;
- page = list_last_entry(&n->slabs_free, struct page, lru);
- list_move(&page->lru, list);
+ page = list_last_entry(&n->slabs_free, struct page, slab_list);
+ list_move(&page->slab_list, list);
n->free_slabs--;
n->total_slabs--;
}
@@ -3438,7 +3437,7 @@ free_done:
int i = 0;
struct page *page;
- list_for_each_entry(page, &n->slabs_free, lru) {
+ list_for_each_entry(page, &n->slabs_free, slab_list) {
BUG_ON(page->active);
i++;
@@ -4292,8 +4291,12 @@ static int leaks_show(struct seq_file *m, void *p)
* whole processing.
*/
do {
- set_store_user_clean(cachep);
drain_cpu_caches(cachep);
+ /*
+ * drain_cpu_caches() could make kmemleak_object and
+ * debug_objects_cache dirty, so reset afterwards.
+ */
+ set_store_user_clean(cachep);
x[1] = 0;
@@ -4302,9 +4305,9 @@ static int leaks_show(struct seq_file *m, void *p)
check_irq_on();
spin_lock_irq(&n->list_lock);
- list_for_each_entry(page, &n->slabs_full, lru)
+ list_for_each_entry(page, &n->slabs_full, slab_list)
handle_slab(x, cachep, page);
- list_for_each_entry(page, &n->slabs_partial, lru)
+ list_for_each_entry(page, &n->slabs_partial, slab_list)
handle_slab(x, cachep, page);
spin_unlock_irq(&n->list_lock);
}
diff --git a/mm/slob.c b/mm/slob.c
index 307c2c9feb44..84aefd9b91ee 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -112,13 +112,13 @@ static inline int slob_page_free(struct page *sp)
static void set_slob_page_free(struct page *sp, struct list_head *list)
{
- list_add(&sp->lru, list);
+ list_add(&sp->slab_list, list);
__SetPageSlobFree(sp);
}
static inline void clear_slob_page_free(struct page *sp)
{
- list_del(&sp->lru);
+ list_del(&sp->slab_list);
__ClearPageSlobFree(sp);
}
@@ -213,13 +213,26 @@ static void slob_free_pages(void *b, int order)
}
/*
- * Allocate a slob block within a given slob_page sp.
+ * slob_page_alloc() - Allocate a slob block within a given slob_page sp.
+ * @sp: Page to look in.
+ * @size: Size of the allocation.
+ * @align: Allocation alignment.
+ * @page_removed_from_list: Return parameter.
+ *
+ * Tries to find a chunk of memory at least @size bytes big within @page.
+ *
+ * Return: Pointer to memory if allocated, %NULL otherwise. If the
+ * allocation fills up @page then the page is removed from the
+ * freelist, in this case @page_removed_from_list will be set to
+ * true (set to false otherwise).
*/
-static void *slob_page_alloc(struct page *sp, size_t size, int align)
+static void *slob_page_alloc(struct page *sp, size_t size, int align,
+ bool *page_removed_from_list)
{
slob_t *prev, *cur, *aligned = NULL;
int delta = 0, units = SLOB_UNITS(size);
+ *page_removed_from_list = false;
for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
slobidx_t avail = slob_units(cur);
@@ -254,8 +267,10 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align)
}
sp->units -= units;
- if (!sp->units)
+ if (!sp->units) {
clear_slob_page_free(sp);
+ *page_removed_from_list = true;
+ }
return cur;
}
if (slob_last(cur))
@@ -269,10 +284,10 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align)
static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
{
struct page *sp;
- struct list_head *prev;
struct list_head *slob_list;
slob_t *b = NULL;
unsigned long flags;
+ bool _unused;
if (size < SLOB_BREAK1)
slob_list = &free_slob_small;
@@ -283,7 +298,8 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
spin_lock_irqsave(&slob_lock, flags);
/* Iterate through each partially free page, try to find room */
- list_for_each_entry(sp, slob_list, lru) {
+ list_for_each_entry(sp, slob_list, slab_list) {
+ bool page_removed_from_list = false;
#ifdef CONFIG_NUMA
/*
* If there's a node specification, search for a partial
@@ -296,18 +312,25 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
if (sp->units < SLOB_UNITS(size))
continue;
- /* Attempt to alloc */
- prev = sp->lru.prev;
- b = slob_page_alloc(sp, size, align);
+ b = slob_page_alloc(sp, size, align, &page_removed_from_list);
if (!b)
continue;
- /* Improve fragment distribution and reduce our average
- * search time by starting our next search here. (see
- * Knuth vol 1, sec 2.5, pg 449) */
- if (prev != slob_list->prev &&
- slob_list->next != prev->next)
- list_move_tail(slob_list, prev->next);
+ /*
+ * If slob_page_alloc() removed sp from the list then we
+ * cannot call list functions on sp. If so allocation
+ * did not fragment the page anyway so optimisation is
+ * unnecessary.
+ */
+ if (!page_removed_from_list) {
+ /*
+ * Improve fragment distribution and reduce our average
+ * search time by starting our next search here. (see
+ * Knuth vol 1, sec 2.5, pg 449)
+ */
+ if (!list_is_first(&sp->slab_list, slob_list))
+ list_rotate_to_front(&sp->slab_list, slob_list);
+ }
break;
}
spin_unlock_irqrestore(&slob_lock, flags);
@@ -323,10 +346,10 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
spin_lock_irqsave(&slob_lock, flags);
sp->units = SLOB_UNITS(PAGE_SIZE);
sp->freelist = b;
- INIT_LIST_HEAD(&sp->lru);
+ INIT_LIST_HEAD(&sp->slab_list);
set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
set_slob_page_free(sp, slob_list);
- b = slob_page_alloc(sp, size, align);
+ b = slob_page_alloc(sp, size, align, &_unused);
BUG_ON(!b);
spin_unlock_irqrestore(&slob_lock, flags);
}
diff --git a/mm/slub.c b/mm/slub.c
index 6b28cd2b5a58..cd04dbd2b5d0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -58,10 +58,11 @@
* D. page->frozen -> frozen state
*
* If a slab is frozen then it is exempt from list management. It is not
- * on any list. The processor that froze the slab is the one who can
- * perform list operations on the page. Other processors may put objects
- * onto the freelist but the processor that froze the slab is the only
- * one that can retrieve the objects from the page's freelist.
+ * on any list except per cpu partial list. The processor that froze the
+ * slab is the one who can perform list operations on the page. Other
+ * processors may put objects onto the freelist but the processor that
+ * froze the slab is the only one that can retrieve the objects from the
+ * page's freelist.
*
* The list_lock protects the partial and full list on each node and
* the partial slab counter. If taken then no new slabs may be added or
@@ -1014,7 +1015,7 @@ static void add_full(struct kmem_cache *s,
return;
lockdep_assert_held(&n->list_lock);
- list_add(&page->lru, &n->full);
+ list_add(&page->slab_list, &n->full);
}
static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
@@ -1023,7 +1024,7 @@ static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct
return;
lockdep_assert_held(&n->list_lock);
- list_del(&page->lru);
+ list_del(&page->slab_list);
}
/* Tracking of the number of slabs for debugging purposes */
@@ -1764,9 +1765,9 @@ __add_partial(struct kmem_cache_node *n, struct page *page, int tail)
{
n->nr_partial++;
if (tail == DEACTIVATE_TO_TAIL)
- list_add_tail(&page->lru, &n->partial);
+ list_add_tail(&page->slab_list, &n->partial);
else
- list_add(&page->lru, &n->partial);
+ list_add(&page->slab_list, &n->partial);
}
static inline void add_partial(struct kmem_cache_node *n,
@@ -1780,7 +1781,7 @@ static inline void remove_partial(struct kmem_cache_node *n,
struct page *page)
{
lockdep_assert_held(&n->list_lock);
- list_del(&page->lru);
+ list_del(&page->slab_list);
n->nr_partial--;
}
@@ -1854,7 +1855,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
return NULL;
spin_lock(&n->list_lock);
- list_for_each_entry_safe(page, page2, &n->partial, lru) {
+ list_for_each_entry_safe(page, page2, &n->partial, slab_list) {
void *t;
if (!pfmemalloc_match(page, flags))
@@ -1942,7 +1943,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
}
}
} while (read_mems_allowed_retry(cpuset_mems_cookie));
-#endif
+#endif /* CONFIG_NUMA */
return NULL;
}
@@ -2240,7 +2241,7 @@ static void unfreeze_partials(struct kmem_cache *s,
discard_slab(s, page);
stat(s, FREE_SLAB);
}
-#endif
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
}
/*
@@ -2299,7 +2300,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
local_irq_restore(flags);
}
preempt_enable();
-#endif
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
}
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
@@ -2398,7 +2399,7 @@ static unsigned long count_partial(struct kmem_cache_node *n,
struct page *page;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, lru)
+ list_for_each_entry(page, &n->partial, slab_list)
x += get_count(page);
spin_unlock_irqrestore(&n->list_lock, flags);
return x;
@@ -2804,7 +2805,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
}
EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
#endif
-#endif
+#endif /* CONFIG_NUMA */
/*
* Slow path handling. This may still be called frequently since objects
@@ -2903,8 +2904,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
* then add it.
*/
if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
- if (kmem_cache_debug(s))
- remove_full(s, n, page);
+ remove_full(s, n, page);
add_partial(n, page, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
@@ -3696,10 +3696,10 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
BUG_ON(irqs_disabled());
spin_lock_irq(&n->list_lock);
- list_for_each_entry_safe(page, h, &n->partial, lru) {
+ list_for_each_entry_safe(page, h, &n->partial, slab_list) {
if (!page->inuse) {
remove_partial(n, page);
- list_add(&page->lru, &discard);
+ list_add(&page->slab_list, &discard);
} else {
list_slab_objects(s, page,
"Objects remaining in %s on __kmem_cache_shutdown()");
@@ -3707,7 +3707,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
}
spin_unlock_irq(&n->list_lock);
- list_for_each_entry_safe(page, h, &discard, lru)
+ list_for_each_entry_safe(page, h, &discard, slab_list)
discard_slab(s, page);
}
@@ -3839,7 +3839,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
return ret;
}
EXPORT_SYMBOL(__kmalloc_node);
-#endif
+#endif /* CONFIG_NUMA */
#ifdef CONFIG_HARDENED_USERCOPY
/*
@@ -3987,7 +3987,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
* Note that concurrent frees may occur while we hold the
* list_lock. page->inuse here is the upper limit.
*/
- list_for_each_entry_safe(page, t, &n->partial, lru) {
+ list_for_each_entry_safe(page, t, &n->partial, slab_list) {
int free = page->objects - page->inuse;
/* Do not reread page->inuse */
@@ -3997,10 +3997,10 @@ int __kmem_cache_shrink(struct kmem_cache *s)
BUG_ON(free <= 0);
if (free == page->objects) {
- list_move(&page->lru, &discard);
+ list_move(&page->slab_list, &discard);
n->nr_partial--;
} else if (free <= SHRINK_PROMOTE_MAX)
- list_move(&page->lru, promote + free - 1);
+ list_move(&page->slab_list, promote + free - 1);
}
/*
@@ -4013,7 +4013,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
spin_unlock_irqrestore(&n->list_lock, flags);
/* Release empty slabs */
- list_for_each_entry_safe(page, t, &discard, lru)
+ list_for_each_entry_safe(page, t, &discard, slab_list)
discard_slab(s, page);
if (slabs_node(s, node))
@@ -4057,7 +4057,7 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s)
*/
slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu);
}
-#endif
+#endif /* CONFIG_MEMCG */
static int slab_mem_going_offline_callback(void *arg)
{
@@ -4205,11 +4205,11 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
for_each_kmem_cache_node(s, node, n) {
struct page *p;
- list_for_each_entry(p, &n->partial, lru)
+ list_for_each_entry(p, &n->partial, slab_list)
p->slab_cache = s;
#ifdef CONFIG_SLUB_DEBUG
- list_for_each_entry(p, &n->full, lru)
+ list_for_each_entry(p, &n->full, slab_list)
p->slab_cache = s;
#endif
}
@@ -4426,7 +4426,7 @@ static int validate_slab_node(struct kmem_cache *s,
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, lru) {
+ list_for_each_entry(page, &n->partial, slab_list) {
validate_slab_slab(s, page, map);
count++;
}
@@ -4437,7 +4437,7 @@ static int validate_slab_node(struct kmem_cache *s,
if (!(s->flags & SLAB_STORE_USER))
goto out;
- list_for_each_entry(page, &n->full, lru) {
+ list_for_each_entry(page, &n->full, slab_list) {
validate_slab_slab(s, page, map);
count++;
}
@@ -4633,9 +4633,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
continue;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, lru)
+ list_for_each_entry(page, &n->partial, slab_list)
process_slab(&t, s, page, alloc, map);
- list_for_each_entry(page, &n->full, lru)
+ list_for_each_entry(page, &n->full, slab_list)
process_slab(&t, s, page, alloc, map);
spin_unlock_irqrestore(&n->list_lock, flags);
}
@@ -4690,7 +4690,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
len += sprintf(buf, "No data\n");
return len;
}
-#endif
+#endif /* CONFIG_SLUB_DEBUG */
#ifdef SLUB_RESILIENCY_TEST
static void __init resiliency_test(void)
@@ -4750,7 +4750,7 @@ static void __init resiliency_test(void)
#ifdef CONFIG_SYSFS
static void resiliency_test(void) {};
#endif
-#endif
+#endif /* SLUB_RESILIENCY_TEST */
#ifdef CONFIG_SYSFS
enum slab_stat_type {
@@ -5407,7 +5407,7 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
-#endif
+#endif /* CONFIG_SLUB_STATS */
static struct attribute *slab_attrs[] = {
&slab_size_attr.attr,
@@ -5608,7 +5608,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
if (buffer)
free_page((unsigned long)buffer);
-#endif
+#endif /* CONFIG_MEMCG */
}
static void kmem_cache_release(struct kobject *k)
diff --git a/mm/sparse.c b/mm/sparse.c
index 56e057c432f9..fd13166949b5 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -684,10 +684,18 @@ static void free_map_bootmem(struct page *memmap)
#endif /* CONFIG_MEMORY_HOTREMOVE */
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
-/*
- * returns the number of sections whose mem_maps were properly
- * set. If this is <=0, then that means that the passed-in
- * map was not consumed and must be freed.
+/**
+ * sparse_add_one_section - add a memory section
+ * @nid: The node to add section on
+ * @start_pfn: start pfn of the memory range
+ * @altmap: device page map
+ *
+ * This is only intended for hotplug.
+ *
+ * Return:
+ * * 0 - On success.
+ * * -EEXIST - Section has been present.
+ * * -ENOMEM - Out of memory.
*/
int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
struct vmem_altmap *altmap)
diff --git a/mm/swap.c b/mm/swap.c
index 301ed4e04320..3a75722e68a9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -867,7 +867,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
SetPageLRU(page);
/*
* Page becomes evictable in two ways:
- * 1) Within LRU lock [munlock_vma_pages() and __munlock_pagevec()].
+ * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
* 2) Before acquiring LRU lock to put the page to correct LRU and then
* a) do PageLRU check with lock [check_move_unevictable_pages]
* b) do PageLRU check before lock [clear_page_mlock]
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 85245fdec8d9..eb714165afd2 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -132,7 +132,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
for (i = 0; i < nr; i++) {
VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
set_page_private(page + i, entry.val + i);
- xas_store(&xas, page + i);
+ xas_store(&xas, page);
xas_next(&xas);
}
address_space->nrpages += nr;
@@ -167,7 +167,7 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
for (i = 0; i < nr; i++) {
void *entry = xas_store(&xas, NULL);
- VM_BUG_ON_PAGE(entry != page + i, entry);
+ VM_BUG_ON_PAGE(entry != page, entry);
set_page_private(page + i, 0);
xas_next(&xas);
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index d59b5a73dfb3..9932d5755e4c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -271,8 +271,7 @@ retry:
*/
idx = linear_page_index(dst_vma, dst_addr);
mapping = dst_vma->vm_file->f_mapping;
- hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
- idx, dst_addr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
err = -ENOMEM;
diff --git a/mm/util.c b/mm/util.c
index 43a2984bccaa..e2e4f8c3fa12 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -318,7 +318,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
@@ -339,10 +339,10 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
* were pinned, returns -errno.
*/
int __weak get_user_pages_fast(unsigned long start,
- int nr_pages, int write, struct page **pages)
+ int nr_pages, unsigned int gup_flags,
+ struct page **pages)
{
- return get_user_pages_unlocked(start, nr_pages, pages,
- write ? FOLL_WRITE : 0);
+ return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
@@ -652,7 +652,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- long free, allowed, reserve;
+ long allowed;
VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
-(s64)vm_committed_as_batch * num_online_cpus(),
@@ -667,51 +667,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- free = global_zone_page_state(NR_FREE_PAGES);
- free += global_node_page_state(NR_FILE_PAGES);
-
- /*
- * shmem pages shouldn't be counted as free in this
- * case, they can't be purged, only swapped out, and
- * that won't affect the overall amount of available
- * memory in the system.
- */
- free -= global_node_page_state(NR_SHMEM);
-
- free += get_nr_swap_pages();
-
- /*
- * Any slabs which are created with the
- * SLAB_RECLAIM_ACCOUNT flag claim to have contents
- * which are reclaimable, under pressure. The dentry
- * cache and most inode caches should fall into this
- */
- free += global_node_page_state(NR_SLAB_RECLAIMABLE);
-
- /*
- * Part of the kernel memory, which can be released
- * under memory pressure.
- */
- free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
-
- /*
- * Leave reserved pages. The pages are not for anonymous pages.
- */
- if (free <= totalreserve_pages)
+ if (pages > totalram_pages() + total_swap_pages)
goto error;
- else
- free -= totalreserve_pages;
-
- /*
- * Reserve some for root
- */
- if (!cap_sys_admin)
- free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- if (free > pages)
- return 0;
-
- goto error;
+ return 0;
}
allowed = vm_commit_limit();
@@ -725,7 +683,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
* Don't let a single process grow so big a user can't recover
*/
if (mm) {
- reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+ long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+
allowed -= min_t(long, mm->total_vm / 32, reserve);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fd9de504e516..d96c54703948 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -346,7 +346,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
int zid;
if (!mem_cgroup_disabled())
- lru_size = mem_cgroup_get_lru_size(lruvec, lru);
+ lru_size = lruvec_page_state(lruvec, NR_LRU_BASE + lru);
else
lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
@@ -1107,6 +1107,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
unsigned nr_reclaimed = 0;
+ unsigned pgactivate = 0;
memset(stat, 0, sizeof(*stat));
cond_resched();
@@ -1466,8 +1467,10 @@ activate_locked:
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
if (!PageMlocked(page)) {
+ int type = page_is_file_cache(page);
SetPageActive(page);
- stat->nr_activate++;
+ pgactivate++;
+ stat->nr_activate[type] += hpage_nr_pages(page);
count_memcg_page_event(page, PGACTIVATE);
}
keep_locked:
@@ -1482,7 +1485,7 @@ keep:
free_unref_page_list(&free_pages);
list_splice(&ret_pages, page_list);
- count_vm_events(PGACTIVATE, stat->nr_activate);
+ count_vm_events(PGACTIVATE, pgactivate);
return nr_reclaimed;
}
@@ -1804,40 +1807,54 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
return isolated > inactive;
}
-static noinline_for_stack void
-putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
+/*
+ * This moves pages from @list to corresponding LRU list.
+ *
+ * We move them the other way if the page is referenced by one or more
+ * processes, from rmap.
+ *
+ * If the pages are mostly unmapped, the processing is fast and it is
+ * appropriate to hold zone_lru_lock across the whole operation. But if
+ * the pages are mapped, the processing is slow (page_referenced()) so we
+ * should drop zone_lru_lock around each page. It's impossible to balance
+ * this, so instead we remove the pages from the LRU while processing them.
+ * It is safe to rely on PG_active against the non-LRU pages in here because
+ * nobody will play with that bit on a non-LRU page.
+ *
+ * The downside is that we have to touch page->_refcount against each page.
+ * But we had to alter page->flags anyway.
+ *
+ * Returns the number of pages moved to the given lruvec.
+ */
+
+static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
+ struct list_head *list)
{
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ int nr_pages, nr_moved = 0;
LIST_HEAD(pages_to_free);
+ struct page *page;
+ enum lru_list lru;
- /*
- * Put back any unfreeable pages.
- */
- while (!list_empty(page_list)) {
- struct page *page = lru_to_page(page_list);
- int lru;
-
+ while (!list_empty(list)) {
+ page = lru_to_page(list);
VM_BUG_ON_PAGE(PageLRU(page), page);
- list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
+ list_del(&page->lru);
spin_unlock_irq(&pgdat->lru_lock);
putback_lru_page(page);
spin_lock_irq(&pgdat->lru_lock);
continue;
}
-
lruvec = mem_cgroup_page_lruvec(page, pgdat);
SetPageLRU(page);
lru = page_lru(page);
- add_page_to_lru_list(page, lruvec, lru);
- if (is_active_lru(lru)) {
- int file = is_file_lru(lru);
- int numpages = hpage_nr_pages(page);
- reclaim_stat->recent_rotated[file] += numpages;
- }
+ nr_pages = hpage_nr_pages(page);
+ update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
+ list_move(&page->lru, &lruvec->lists[lru]);
+
if (put_page_testzero(page)) {
__ClearPageLRU(page);
__ClearPageActive(page);
@@ -1850,13 +1867,17 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
spin_lock_irq(&pgdat->lru_lock);
} else
list_add(&page->lru, &pages_to_free);
+ } else {
+ nr_moved += nr_pages;
}
}
/*
* To save our caller's stack, now use input list for pages to free.
*/
- list_splice(&pages_to_free, page_list);
+ list_splice(&pages_to_free, list);
+
+ return nr_moved;
}
/*
@@ -1886,6 +1907,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
unsigned long nr_taken;
struct reclaim_stat stat;
int file = is_file_lru(lru);
+ enum vm_event_item item;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
bool stalled = false;
@@ -1913,17 +1935,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
- if (current_is_kswapd()) {
- if (global_reclaim(sc))
- __count_vm_events(PGSCAN_KSWAPD, nr_scanned);
- count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD,
- nr_scanned);
- } else {
- if (global_reclaim(sc))
- __count_vm_events(PGSCAN_DIRECT, nr_scanned);
- count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT,
- nr_scanned);
- }
+ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+ if (global_reclaim(sc))
+ __count_vm_events(item, nr_scanned);
+ __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
if (nr_taken == 0)
@@ -1934,19 +1949,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_lock_irq(&pgdat->lru_lock);
- if (current_is_kswapd()) {
- if (global_reclaim(sc))
- __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
- count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD,
- nr_reclaimed);
- } else {
- if (global_reclaim(sc))
- __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
- count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT,
- nr_reclaimed);
- }
+ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+ if (global_reclaim(sc))
+ __count_vm_events(item, nr_reclaimed);
+ __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
+ reclaim_stat->recent_rotated[0] = stat.nr_activate[0];
+ reclaim_stat->recent_rotated[1] = stat.nr_activate[1];
- putback_inactive_pages(lruvec, &page_list);
+ move_pages_to_lru(lruvec, &page_list);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
@@ -1983,73 +1993,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
return nr_reclaimed;
}
-/*
- * This moves pages from the active list to the inactive list.
- *
- * We move them the other way if the page is referenced by one or more
- * processes, from rmap.
- *
- * If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold pgdat->lru_lock across the whole operation. But if
- * the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop pgdat->lru_lock around each page. It's impossible to balance
- * this, so instead we remove the pages from the LRU while processing them.
- * It is safe to rely on PG_active against the non-LRU pages in here because
- * nobody will play with that bit on a non-LRU page.
- *
- * The downside is that we have to touch page->_refcount against each page.
- * But we had to alter page->flags anyway.
- *
- * Returns the number of pages moved to the given lru.
- */
-
-static unsigned move_active_pages_to_lru(struct lruvec *lruvec,
- struct list_head *list,
- struct list_head *pages_to_free,
- enum lru_list lru)
-{
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- struct page *page;
- int nr_pages;
- int nr_moved = 0;
-
- while (!list_empty(list)) {
- page = lru_to_page(list);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
-
- VM_BUG_ON_PAGE(PageLRU(page), page);
- SetPageLRU(page);
-
- nr_pages = hpage_nr_pages(page);
- update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
- list_move(&page->lru, &lruvec->lists[lru]);
-
- if (put_page_testzero(page)) {
- __ClearPageLRU(page);
- __ClearPageActive(page);
- del_page_from_lru_list(page, lruvec, lru);
-
- if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&pgdat->lru_lock);
- mem_cgroup_uncharge(page);
- (*get_compound_page_dtor(page))(page);
- spin_lock_irq(&pgdat->lru_lock);
- } else
- list_add(&page->lru, pages_to_free);
- } else {
- nr_moved += nr_pages;
- }
- }
-
- if (!is_active_lru(lru)) {
- __count_vm_events(PGDEACTIVATE, nr_moved);
- count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
- nr_moved);
- }
-
- return nr_moved;
-}
-
static void shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,
struct scan_control *sc,
@@ -2079,7 +2022,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
reclaim_stat->recent_scanned[file] += nr_taken;
__count_vm_events(PGREFILL, nr_scanned);
- count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
+ __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
@@ -2136,13 +2079,19 @@ static void shrink_active_list(unsigned long nr_to_scan,
*/
reclaim_stat->recent_rotated[file] += nr_rotated;
- nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
- nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
+ nr_activate = move_pages_to_lru(lruvec, &l_active);
+ nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
+ /* Keep all free pages in l_active list */
+ list_splice(&l_inactive, &l_active);
+
+ __count_vm_events(PGDEACTIVATE, nr_deactivate);
+ __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
+
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&pgdat->lru_lock);
- mem_cgroup_uncharge_list(&l_hold);
- free_unref_page_list(&l_hold);
+ mem_cgroup_uncharge_list(&l_active);
+ free_unref_page_list(&l_active);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
nr_deactivate, nr_rotated, sc->priority, file);
}
@@ -3212,10 +3161,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
return 1;
- trace_mm_vmscan_direct_reclaim_begin(order,
- sc.may_writepage,
- sc.gfp_mask,
- sc.reclaim_idx);
+ trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
@@ -3246,9 +3192,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
- sc.may_writepage,
- sc.gfp_mask,
- sc.reclaim_idx);
+ sc.gfp_mask);
/*
* NOTE: Although we can get the priority field, using it
@@ -3297,10 +3241,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
- trace_mm_vmscan_memcg_reclaim_begin(0,
- sc.may_writepage,
- sc.gfp_mask,
- sc.reclaim_idx);
+ trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
@@ -4149,6 +4090,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
.reclaim_idx = gfp_zone(gfp_mask),
};
+ trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
+ sc.gfp_mask);
+
cond_resched();
fs_reclaim_acquire(sc.gfp_mask);
/*
@@ -4175,6 +4119,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
current->flags &= ~PF_SWAPWRITE;
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
+
+ trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
+
return sc.nr_reclaimed >= nr_pages;
}
diff --git a/mm/workingset.c b/mm/workingset.c
index 0bedf67502d5..6419baebd306 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -426,10 +426,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
#ifdef CONFIG_MEMCG
if (sc->memcg) {
struct lruvec *lruvec;
+ int i;
- pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
- LRU_ALL);
lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg);
+ for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
+ pages += lruvec_page_state(lruvec, NR_LRU_BASE + i);
pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE);
} else
diff --git a/mm/z3fold.c b/mm/z3fold.c
index aee9b0b8d907..1ffecd6333e5 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -24,16 +24,47 @@
#include <linux/atomic.h>
#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/dcache.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/page-flags.h>
+#include <linux/migrate.h>
+#include <linux/node.h>
+#include <linux/compaction.h>
#include <linux/percpu.h>
+#include <linux/mount.h>
+#include <linux/fs.h>
#include <linux/preempt.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/zpool.h>
+/*
+ * NCHUNKS_ORDER determines the internal allocation granularity, effectively
+ * adjusting internal fragmentation. It also determines the number of
+ * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
+ * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
+ * in the beginning of an allocated page are occupied by z3fold header, so
+ * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
+ * which shows the max number of free chunks in z3fold page, also there will
+ * be 63, or 62, respectively, freelists per pool.
+ */
+#define NCHUNKS_ORDER 6
+
+#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
+#define CHUNK_SIZE (1 << CHUNK_SHIFT)
+#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
+#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
+#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
+#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
+
+#define BUDDY_MASK (0x3)
+#define BUDDY_SHIFT 2
+#define SLOTS_ALIGN (0x40)
+
/*****************
* Structures
*****************/
@@ -47,8 +78,18 @@ enum buddy {
FIRST,
MIDDLE,
LAST,
- BUDDIES_MAX
+ BUDDIES_MAX = LAST
+};
+
+struct z3fold_buddy_slots {
+ /*
+ * we are using BUDDY_MASK in handle_to_buddy etc. so there should
+ * be enough slots to hold all possible variants
+ */
+ unsigned long slot[BUDDY_MASK + 1];
+ unsigned long pool; /* back link + flags */
};
+#define HANDLE_FLAG_MASK (0x03)
/*
* struct z3fold_header - z3fold page metadata occupying first chunks of each
@@ -58,49 +99,29 @@ enum buddy {
* @page_lock: per-page lock
* @refcount: reference count for the z3fold page
* @work: work_struct for page layout optimization
- * @pool: pointer to the pool which this page belongs to
+ * @slots: pointer to the structure holding buddy slots
* @cpu: CPU which this page "belongs" to
* @first_chunks: the size of the first buddy in chunks, 0 if free
* @middle_chunks: the size of the middle buddy in chunks, 0 if free
* @last_chunks: the size of the last buddy in chunks, 0 if free
* @first_num: the starting number (for the first handle)
+ * @mapped_count: the number of objects currently mapped
*/
struct z3fold_header {
struct list_head buddy;
spinlock_t page_lock;
struct kref refcount;
struct work_struct work;
- struct z3fold_pool *pool;
+ struct z3fold_buddy_slots *slots;
short cpu;
unsigned short first_chunks;
unsigned short middle_chunks;
unsigned short last_chunks;
unsigned short start_middle;
unsigned short first_num:2;
+ unsigned short mapped_count:2;
};
-/*
- * NCHUNKS_ORDER determines the internal allocation granularity, effectively
- * adjusting internal fragmentation. It also determines the number of
- * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
- * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
- * in the beginning of an allocated page are occupied by z3fold header, so
- * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
- * which shows the max number of free chunks in z3fold page, also there will
- * be 63, or 62, respectively, freelists per pool.
- */
-#define NCHUNKS_ORDER 6
-
-#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
-#define CHUNK_SIZE (1 << CHUNK_SHIFT)
-#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
-#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
-#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
-#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
-
-#define BUDDY_MASK (0x3)
-#define BUDDY_SHIFT 2
-
/**
* struct z3fold_pool - stores metadata for each z3fold pool
* @name: pool name
@@ -113,11 +134,13 @@ struct z3fold_header {
* added buddy.
* @stale: list of pages marked for freeing
* @pages_nr: number of z3fold pages in the pool.
+ * @c_handle: cache for z3fold_buddy_slots allocation
* @ops: pointer to a structure of user defined operations specified at
* pool creation time.
* @compact_wq: workqueue for page layout background optimization
* @release_wq: workqueue for safe page release
* @work: work_struct for safe page release
+ * @inode: inode for z3fold pseudo filesystem
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular z3fold pool.
@@ -130,12 +153,14 @@ struct z3fold_pool {
struct list_head lru;
struct list_head stale;
atomic64_t pages_nr;
+ struct kmem_cache *c_handle;
const struct z3fold_ops *ops;
struct zpool *zpool;
const struct zpool_ops *zpool_ops;
struct workqueue_struct *compact_wq;
struct workqueue_struct *release_wq;
struct work_struct work;
+ struct inode *inode;
};
/*
@@ -164,11 +189,118 @@ static int size_to_chunks(size_t size)
static void compact_page_work(struct work_struct *w);
+static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool)
+{
+ struct z3fold_buddy_slots *slots = kmem_cache_alloc(pool->c_handle,
+ GFP_KERNEL);
+
+ if (slots) {
+ memset(slots->slot, 0, sizeof(slots->slot));
+ slots->pool = (unsigned long)pool;
+ }
+
+ return slots;
+}
+
+static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
+{
+ return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
+}
+
+static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
+{
+ return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
+}
+
+static inline void free_handle(unsigned long handle)
+{
+ struct z3fold_buddy_slots *slots;
+ int i;
+ bool is_free;
+
+ if (handle & (1 << PAGE_HEADLESS))
+ return;
+
+ WARN_ON(*(unsigned long *)handle == 0);
+ *(unsigned long *)handle = 0;
+ slots = handle_to_slots(handle);
+ is_free = true;
+ for (i = 0; i <= BUDDY_MASK; i++) {
+ if (slots->slot[i]) {
+ is_free = false;
+ break;
+ }
+ }
+
+ if (is_free) {
+ struct z3fold_pool *pool = slots_to_pool(slots);
+
+ kmem_cache_free(pool->c_handle, slots);
+ }
+}
+
+static struct dentry *z3fold_do_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ static const struct dentry_operations ops = {
+ .d_dname = simple_dname,
+ };
+
+ return mount_pseudo(fs_type, "z3fold:", NULL, &ops, 0x33);
+}
+
+static struct file_system_type z3fold_fs = {
+ .name = "z3fold",
+ .mount = z3fold_do_mount,
+ .kill_sb = kill_anon_super,
+};
+
+static struct vfsmount *z3fold_mnt;
+static int z3fold_mount(void)
+{
+ int ret = 0;
+
+ z3fold_mnt = kern_mount(&z3fold_fs);
+ if (IS_ERR(z3fold_mnt))
+ ret = PTR_ERR(z3fold_mnt);
+
+ return ret;
+}
+
+static void z3fold_unmount(void)
+{
+ kern_unmount(z3fold_mnt);
+}
+
+static const struct address_space_operations z3fold_aops;
+static int z3fold_register_migration(struct z3fold_pool *pool)
+{
+ pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
+ if (IS_ERR(pool->inode)) {
+ pool->inode = NULL;
+ return 1;
+ }
+
+ pool->inode->i_mapping->private_data = pool;
+ pool->inode->i_mapping->a_ops = &z3fold_aops;
+ return 0;
+}
+
+static void z3fold_unregister_migration(struct z3fold_pool *pool)
+{
+ if (pool->inode)
+ iput(pool->inode);
+ }
+
/* Initializes the z3fold header of a newly allocated z3fold page */
static struct z3fold_header *init_z3fold_page(struct page *page,
struct z3fold_pool *pool)
{
struct z3fold_header *zhdr = page_address(page);
+ struct z3fold_buddy_slots *slots = alloc_slots(pool);
+
+ if (!slots)
+ return NULL;
INIT_LIST_HEAD(&page->lru);
clear_bit(PAGE_HEADLESS, &page->private);
@@ -185,15 +317,21 @@ static struct z3fold_header *init_z3fold_page(struct page *page,
zhdr->first_num = 0;
zhdr->start_middle = 0;
zhdr->cpu = -1;
- zhdr->pool = pool;
+ zhdr->slots = slots;
INIT_LIST_HEAD(&zhdr->buddy);
INIT_WORK(&zhdr->work, compact_page_work);
return zhdr;
}
/* Resets the struct page fields and frees the page */
-static void free_z3fold_page(struct page *page)
+static void free_z3fold_page(struct page *page, bool headless)
{
+ if (!headless) {
+ lock_page(page);
+ __ClearPageMovable(page);
+ unlock_page(page);
+ }
+ ClearPagePrivate(page);
__free_page(page);
}
@@ -215,33 +353,57 @@ static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
spin_unlock(&zhdr->page_lock);
}
+/* Helper function to build the index */
+static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
+{
+ return (bud + zhdr->first_num) & BUDDY_MASK;
+}
+
/*
* Encodes the handle of a particular buddy within a z3fold page
* Pool lock should be held as this function accesses first_num
*/
static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
{
- unsigned long handle;
+ struct z3fold_buddy_slots *slots;
+ unsigned long h = (unsigned long)zhdr;
+ int idx = 0;
- handle = (unsigned long)zhdr;
- if (bud != HEADLESS) {
- handle |= (bud + zhdr->first_num) & BUDDY_MASK;
- if (bud == LAST)
- handle |= (zhdr->last_chunks << BUDDY_SHIFT);
- }
- return handle;
+ /*
+ * For a headless page, its handle is its pointer with the extra
+ * PAGE_HEADLESS bit set
+ */
+ if (bud == HEADLESS)
+ return h | (1 << PAGE_HEADLESS);
+
+ /* otherwise, return pointer to encoded handle */
+ idx = __idx(zhdr, bud);
+ h += idx;
+ if (bud == LAST)
+ h |= (zhdr->last_chunks << BUDDY_SHIFT);
+
+ slots = zhdr->slots;
+ slots->slot[idx] = h;
+ return (unsigned long)&slots->slot[idx];
}
/* Returns the z3fold page where a given handle is stored */
-static struct z3fold_header *handle_to_z3fold_header(unsigned long handle)
+static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
{
- return (struct z3fold_header *)(handle & PAGE_MASK);
+ unsigned long addr = h;
+
+ if (!(addr & (1 << PAGE_HEADLESS)))
+ addr = *(unsigned long *)h;
+
+ return (struct z3fold_header *)(addr & PAGE_MASK);
}
/* only for LAST bud, returns zero otherwise */
static unsigned short handle_to_chunks(unsigned long handle)
{
- return (handle & ~PAGE_MASK) >> BUDDY_SHIFT;
+ unsigned long addr = *(unsigned long *)handle;
+
+ return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
}
/*
@@ -251,21 +413,31 @@ static unsigned short handle_to_chunks(unsigned long handle)
*/
static enum buddy handle_to_buddy(unsigned long handle)
{
- struct z3fold_header *zhdr = handle_to_z3fold_header(handle);
- return (handle - zhdr->first_num) & BUDDY_MASK;
+ struct z3fold_header *zhdr;
+ unsigned long addr;
+
+ WARN_ON(handle & (1 << PAGE_HEADLESS));
+ addr = *(unsigned long *)handle;
+ zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
+ return (addr - zhdr->first_num) & BUDDY_MASK;
+}
+
+static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
+{
+ return slots_to_pool(zhdr->slots);
}
static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
{
struct page *page = virt_to_page(zhdr);
- struct z3fold_pool *pool = zhdr->pool;
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
WARN_ON(!list_empty(&zhdr->buddy));
set_bit(PAGE_STALE, &page->private);
clear_bit(NEEDS_COMPACTING, &page->private);
spin_lock(&pool->lock);
if (!list_empty(&page->lru))
- list_del(&page->lru);
+ list_del_init(&page->lru);
spin_unlock(&pool->lock);
if (locked)
z3fold_page_unlock(zhdr);
@@ -295,9 +467,10 @@ static void release_z3fold_page_locked_list(struct kref *ref)
{
struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
refcount);
- spin_lock(&zhdr->pool->lock);
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
+ spin_lock(&pool->lock);
list_del_init(&zhdr->buddy);
- spin_unlock(&zhdr->pool->lock);
+ spin_unlock(&pool->lock);
WARN_ON(z3fold_page_trylock(zhdr));
__release_z3fold_page(zhdr, true);
@@ -318,7 +491,7 @@ static void free_pages_work(struct work_struct *w)
continue;
spin_unlock(&pool->stale_lock);
cancel_work_sync(&zhdr->work);
- free_z3fold_page(page);
+ free_z3fold_page(page, false);
cond_resched();
spin_lock(&pool->stale_lock);
}
@@ -349,6 +522,23 @@ static int num_free_chunks(struct z3fold_header *zhdr)
return nfree;
}
+/* Add to the appropriate unbuddied list */
+static inline void add_to_unbuddied(struct z3fold_pool *pool,
+ struct z3fold_header *zhdr)
+{
+ if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
+ zhdr->middle_chunks == 0) {
+ struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
+
+ int freechunks = num_free_chunks(zhdr);
+ spin_lock(&pool->lock);
+ list_add(&zhdr->buddy, &unbuddied[freechunks]);
+ spin_unlock(&pool->lock);
+ zhdr->cpu = smp_processor_id();
+ put_cpu_ptr(pool->unbuddied);
+ }
+}
+
static inline void *mchunk_memmove(struct z3fold_header *zhdr,
unsigned short dst_chunk)
{
@@ -367,6 +557,9 @@ static int z3fold_compact_page(struct z3fold_header *zhdr)
if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
return 0; /* can't move middle chunk, it's used */
+ if (unlikely(PageIsolated(page)))
+ return 0;
+
if (zhdr->middle_chunks == 0)
return 0; /* nothing to compact */
@@ -406,10 +599,8 @@ static int z3fold_compact_page(struct z3fold_header *zhdr)
static void do_compact_page(struct z3fold_header *zhdr, bool locked)
{
- struct z3fold_pool *pool = zhdr->pool;
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
struct page *page;
- struct list_head *unbuddied;
- int fchunks;
page = virt_to_page(zhdr);
if (locked)
@@ -429,19 +620,14 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
return;
}
- z3fold_compact_page(zhdr);
- unbuddied = get_cpu_ptr(pool->unbuddied);
- fchunks = num_free_chunks(zhdr);
- if (fchunks < NCHUNKS &&
- (!zhdr->first_chunks || !zhdr->middle_chunks ||
- !zhdr->last_chunks)) {
- /* the page's not completely free and it's unbuddied */
- spin_lock(&pool->lock);
- list_add(&zhdr->buddy, &unbuddied[fchunks]);
- spin_unlock(&pool->lock);
- zhdr->cpu = smp_processor_id();
+ if (unlikely(PageIsolated(page) ||
+ test_bit(PAGE_STALE, &page->private))) {
+ z3fold_page_unlock(zhdr);
+ return;
}
- put_cpu_ptr(pool->unbuddied);
+
+ z3fold_compact_page(zhdr);
+ add_to_unbuddied(pool, zhdr);
z3fold_page_unlock(zhdr);
}
@@ -453,6 +639,103 @@ static void compact_page_work(struct work_struct *w)
do_compact_page(zhdr, false);
}
+/* returns _locked_ z3fold page header or NULL */
+static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
+ size_t size, bool can_sleep)
+{
+ struct z3fold_header *zhdr = NULL;
+ struct page *page;
+ struct list_head *unbuddied;
+ int chunks = size_to_chunks(size), i;
+
+lookup:
+ /* First, try to find an unbuddied z3fold page. */
+ unbuddied = get_cpu_ptr(pool->unbuddied);
+ for_each_unbuddied_list(i, chunks) {
+ struct list_head *l = &unbuddied[i];
+
+ zhdr = list_first_entry_or_null(READ_ONCE(l),
+ struct z3fold_header, buddy);
+
+ if (!zhdr)
+ continue;
+
+ /* Re-check under lock. */
+ spin_lock(&pool->lock);
+ l = &unbuddied[i];
+ if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
+ struct z3fold_header, buddy)) ||
+ !z3fold_page_trylock(zhdr)) {
+ spin_unlock(&pool->lock);
+ zhdr = NULL;
+ put_cpu_ptr(pool->unbuddied);
+ if (can_sleep)
+ cond_resched();
+ goto lookup;
+ }
+ list_del_init(&zhdr->buddy);
+ zhdr->cpu = -1;
+ spin_unlock(&pool->lock);
+
+ page = virt_to_page(zhdr);
+ if (test_bit(NEEDS_COMPACTING, &page->private)) {
+ z3fold_page_unlock(zhdr);
+ zhdr = NULL;
+ put_cpu_ptr(pool->unbuddied);
+ if (can_sleep)
+ cond_resched();
+ goto lookup;
+ }
+
+ /*
+ * this page could not be removed from its unbuddied
+ * list while pool lock was held, and then we've taken
+ * page lock so kref_put could not be called before
+ * we got here, so it's safe to just call kref_get()
+ */
+ kref_get(&zhdr->refcount);
+ break;
+ }
+ put_cpu_ptr(pool->unbuddied);
+
+ if (!zhdr) {
+ int cpu;
+
+ /* look for _exact_ match on other cpus' lists */
+ for_each_online_cpu(cpu) {
+ struct list_head *l;
+
+ unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
+ spin_lock(&pool->lock);
+ l = &unbuddied[chunks];
+
+ zhdr = list_first_entry_or_null(READ_ONCE(l),
+ struct z3fold_header, buddy);
+
+ if (!zhdr || !z3fold_page_trylock(zhdr)) {
+ spin_unlock(&pool->lock);
+ zhdr = NULL;
+ continue;
+ }
+ list_del_init(&zhdr->buddy);
+ zhdr->cpu = -1;
+ spin_unlock(&pool->lock);
+
+ page = virt_to_page(zhdr);
+ if (test_bit(NEEDS_COMPACTING, &page->private)) {
+ z3fold_page_unlock(zhdr);
+ zhdr = NULL;
+ if (can_sleep)
+ cond_resched();
+ continue;
+ }
+ kref_get(&zhdr->refcount);
+ break;
+ }
+ }
+
+ return zhdr;
+}
/*
* API Functions
@@ -476,6 +759,11 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
pool = kzalloc(sizeof(struct z3fold_pool), gfp);
if (!pool)
goto out;
+ pool->c_handle = kmem_cache_create("z3fold_handle",
+ sizeof(struct z3fold_buddy_slots),
+ SLOTS_ALIGN, 0, NULL);
+ if (!pool->c_handle)
+ goto out_c;
spin_lock_init(&pool->lock);
spin_lock_init(&pool->stale_lock);
pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
@@ -497,15 +785,21 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
pool->release_wq = create_singlethread_workqueue(pool->name);
if (!pool->release_wq)
goto out_wq;
+ if (z3fold_register_migration(pool))
+ goto out_rwq;
INIT_WORK(&pool->work, free_pages_work);
pool->ops = ops;
return pool;
+out_rwq:
+ destroy_workqueue(pool->release_wq);
out_wq:
destroy_workqueue(pool->compact_wq);
out_unbuddied:
free_percpu(pool->unbuddied);
out_pool:
+ kmem_cache_destroy(pool->c_handle);
+out_c:
kfree(pool);
out:
return NULL;
@@ -519,6 +813,8 @@ out:
*/
static void z3fold_destroy_pool(struct z3fold_pool *pool)
{
+ kmem_cache_destroy(pool->c_handle);
+ z3fold_unregister_migration(pool);
destroy_workqueue(pool->release_wq);
destroy_workqueue(pool->compact_wq);
kfree(pool);
@@ -546,7 +842,7 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool)
static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
unsigned long *handle)
{
- int chunks = 0, i, freechunks;
+ int chunks = size_to_chunks(size);
struct z3fold_header *zhdr = NULL;
struct page *page = NULL;
enum buddy bud;
@@ -561,56 +857,8 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
bud = HEADLESS;
else {
- struct list_head *unbuddied;
- chunks = size_to_chunks(size);
-
-lookup:
- /* First, try to find an unbuddied z3fold page. */
- unbuddied = get_cpu_ptr(pool->unbuddied);
- for_each_unbuddied_list(i, chunks) {
- struct list_head *l = &unbuddied[i];
-
- zhdr = list_first_entry_or_null(READ_ONCE(l),
- struct z3fold_header, buddy);
-
- if (!zhdr)
- continue;
-
- /* Re-check under lock. */
- spin_lock(&pool->lock);
- l = &unbuddied[i];
- if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
- struct z3fold_header, buddy)) ||
- !z3fold_page_trylock(zhdr)) {
- spin_unlock(&pool->lock);
- put_cpu_ptr(pool->unbuddied);
- goto lookup;
- }
- list_del_init(&zhdr->buddy);
- zhdr->cpu = -1;
- spin_unlock(&pool->lock);
-
- page = virt_to_page(zhdr);
- if (test_bit(NEEDS_COMPACTING, &page->private)) {
- z3fold_page_unlock(zhdr);
- zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
- if (can_sleep)
- cond_resched();
- goto lookup;
- }
-
- /*
- * this page could not be removed from its unbuddied
- * list while pool lock was held, and then we've taken
- * page lock so kref_put could not be called before
- * we got here, so it's safe to just call kref_get()
- */
- kref_get(&zhdr->refcount);
- break;
- }
- put_cpu_ptr(pool->unbuddied);
-
+retry:
+ zhdr = __z3fold_alloc(pool, size, can_sleep);
if (zhdr) {
if (zhdr->first_chunks == 0) {
if (zhdr->middle_chunks != 0 &&
@@ -630,8 +878,9 @@ lookup:
z3fold_page_unlock(zhdr);
pr_err("No free chunks in unbuddied\n");
WARN_ON(1);
- goto lookup;
+ goto retry;
}
+ page = virt_to_page(zhdr);
goto found;
}
bud = FIRST;
@@ -662,13 +911,18 @@ lookup:
if (!page)
return -ENOMEM;
- atomic64_inc(&pool->pages_nr);
zhdr = init_z3fold_page(page, pool);
+ if (!zhdr) {
+ __free_page(page);
+ return -ENOMEM;
+ }
+ atomic64_inc(&pool->pages_nr);
if (bud == HEADLESS) {
set_bit(PAGE_HEADLESS, &page->private);
goto headless;
}
+ __SetPageMovable(page, pool->inode->i_mapping);
z3fold_page_lock(zhdr);
found:
@@ -680,19 +934,7 @@ found:
zhdr->middle_chunks = chunks;
zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
}
-
- if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
- zhdr->middle_chunks == 0) {
- struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
-
- /* Add to unbuddied list */
- freechunks = num_free_chunks(zhdr);
- spin_lock(&pool->lock);
- list_add(&zhdr->buddy, &unbuddied[freechunks]);
- spin_unlock(&pool->lock);
- zhdr->cpu = smp_processor_id();
- put_cpu_ptr(pool->unbuddied);
- }
+ add_to_unbuddied(pool, zhdr);
headless:
spin_lock(&pool->lock);
@@ -739,7 +981,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
spin_lock(&pool->lock);
list_del(&page->lru);
spin_unlock(&pool->lock);
- free_z3fold_page(page);
+ free_z3fold_page(page, true);
atomic64_dec(&pool->pages_nr);
}
return;
@@ -766,6 +1008,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
return;
}
+ free_handle(handle);
if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
atomic64_dec(&pool->pages_nr);
return;
@@ -774,7 +1017,8 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
z3fold_page_unlock(zhdr);
return;
}
- if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
+ if (unlikely(PageIsolated(page)) ||
+ test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
z3fold_page_unlock(zhdr);
return;
}
@@ -855,10 +1099,12 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
if (test_and_set_bit(PAGE_CLAIMED, &page->private))
continue;
- zhdr = page_address(page);
+ if (unlikely(PageIsolated(page)))
+ continue;
if (test_bit(PAGE_HEADLESS, &page->private))
break;
+ zhdr = page_address(page);
if (!z3fold_page_trylock(zhdr)) {
zhdr = NULL;
continue; /* can't evict at this point */
@@ -919,7 +1165,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
next:
if (test_bit(PAGE_HEADLESS, &page->private)) {
if (ret == 0) {
- free_z3fold_page(page);
+ free_z3fold_page(page, true);
atomic64_dec(&pool->pages_nr);
return 0;
}
@@ -996,6 +1242,8 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
break;
}
+ if (addr)
+ zhdr->mapped_count++;
z3fold_page_unlock(zhdr);
out:
return addr;
@@ -1022,6 +1270,7 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
buddy = handle_to_buddy(handle);
if (buddy == MIDDLE)
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+ zhdr->mapped_count--;
z3fold_page_unlock(zhdr);
}
@@ -1036,6 +1285,128 @@ static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
return atomic64_read(&pool->pages_nr);
}
+static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
+{
+ struct z3fold_header *zhdr;
+ struct z3fold_pool *pool;
+
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(PageIsolated(page), page);
+
+ if (test_bit(PAGE_HEADLESS, &page->private))
+ return false;
+
+ zhdr = page_address(page);
+ z3fold_page_lock(zhdr);
+ if (test_bit(NEEDS_COMPACTING, &page->private) ||
+ test_bit(PAGE_STALE, &page->private))
+ goto out;
+
+ pool = zhdr_to_pool(zhdr);
+
+ if (zhdr->mapped_count == 0) {
+ kref_get(&zhdr->refcount);
+ if (!list_empty(&zhdr->buddy))
+ list_del_init(&zhdr->buddy);
+ spin_lock(&pool->lock);
+ if (!list_empty(&page->lru))
+ list_del(&page->lru);
+ spin_unlock(&pool->lock);
+ z3fold_page_unlock(zhdr);
+ return true;
+ }
+out:
+ z3fold_page_unlock(zhdr);
+ return false;
+}
+
+static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage,
+ struct page *page, enum migrate_mode mode)
+{
+ struct z3fold_header *zhdr, *new_zhdr;
+ struct z3fold_pool *pool;
+ struct address_space *new_mapping;
+
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+ zhdr = page_address(page);
+ pool = zhdr_to_pool(zhdr);
+
+ if (!trylock_page(page))
+ return -EAGAIN;
+
+ if (!z3fold_page_trylock(zhdr)) {
+ unlock_page(page);
+ return -EAGAIN;
+ }
+ if (zhdr->mapped_count != 0) {
+ z3fold_page_unlock(zhdr);
+ unlock_page(page);
+ return -EBUSY;
+ }
+ new_zhdr = page_address(newpage);
+ memcpy(new_zhdr, zhdr, PAGE_SIZE);
+ newpage->private = page->private;
+ page->private = 0;
+ z3fold_page_unlock(zhdr);
+ spin_lock_init(&new_zhdr->page_lock);
+ new_mapping = page_mapping(page);
+ __ClearPageMovable(page);
+ ClearPagePrivate(page);
+
+ get_page(newpage);
+ z3fold_page_lock(new_zhdr);
+ if (new_zhdr->first_chunks)
+ encode_handle(new_zhdr, FIRST);
+ if (new_zhdr->last_chunks)
+ encode_handle(new_zhdr, LAST);
+ if (new_zhdr->middle_chunks)
+ encode_handle(new_zhdr, MIDDLE);
+ set_bit(NEEDS_COMPACTING, &newpage->private);
+ new_zhdr->cpu = smp_processor_id();
+ spin_lock(&pool->lock);
+ list_add(&newpage->lru, &pool->lru);
+ spin_unlock(&pool->lock);
+ __SetPageMovable(newpage, new_mapping);
+ z3fold_page_unlock(new_zhdr);
+
+ queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
+
+ page_mapcount_reset(page);
+ unlock_page(page);
+ put_page(page);
+ return 0;
+}
+
+static void z3fold_page_putback(struct page *page)
+{
+ struct z3fold_header *zhdr;
+ struct z3fold_pool *pool;
+
+ zhdr = page_address(page);
+ pool = zhdr_to_pool(zhdr);
+
+ z3fold_page_lock(zhdr);
+ if (!list_empty(&zhdr->buddy))
+ list_del_init(&zhdr->buddy);
+ INIT_LIST_HEAD(&page->lru);
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
+ atomic64_dec(&pool->pages_nr);
+ return;
+ }
+ spin_lock(&pool->lock);
+ list_add(&page->lru, &pool->lru);
+ spin_unlock(&pool->lock);
+ z3fold_page_unlock(zhdr);
+}
+
+static const struct address_space_operations z3fold_aops = {
+ .isolate_page = z3fold_page_isolate,
+ .migratepage = z3fold_page_migrate,
+ .putback_page = z3fold_page_putback,
+};
+
/*****************
* zpool
****************/
@@ -1133,8 +1504,14 @@ MODULE_ALIAS("zpool-z3fold");
static int __init init_z3fold(void)
{
+ int ret;
+
/* Make sure the z3fold header is not larger than the page size */
BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
+ ret = z3fold_mount();
+ if (ret)
+ return ret;
+
zpool_register_driver(&z3fold_zpool_driver);
return 0;
@@ -1142,6 +1519,7 @@ static int __init init_z3fold(void)
static void __exit exit_z3fold(void)
{
+ z3fold_unmount();
zpool_unregister_driver(&z3fold_zpool_driver);
}
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index d3736f5bffec..74cafc0142ea 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -27,7 +27,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data,
while (got < num_pages) {
rc = get_user_pages_fast(
(unsigned long)data + ((unsigned long)got * PAGE_SIZE),
- num_pages - got, write_page, pages + got);
+ num_pages - got, write_page ? FOLL_WRITE : 0, pages + got);
if (rc < 0)
break;
BUG_ON(rc == 0);
diff --git a/net/rds/info.c b/net/rds/info.c
index e367a97a18c8..03f6fd56d237 100644
--- a/net/rds/info.c
+++ b/net/rds/info.c
@@ -193,7 +193,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
ret = -ENOMEM;
goto out;
}
- ret = get_user_pages_fast(start, nr_pages, 1, pages);
+ ret = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
if (ret != nr_pages) {
if (ret > 0)
nr_pages = ret;
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 182ab8430594..b340ed4fc43a 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -158,7 +158,8 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
{
int ret;
- ret = get_user_pages_fast(user_addr, nr_pages, write, pages);
+ ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0,
+ pages);
if (ret >= 0 && ret < nr_pages) {
while (ret--)
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 989e52386c35..2b18223e7eb8 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -253,8 +253,8 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem)
return -ENOMEM;
down_read(&current->mm->mmap_sem);
- npgs = get_user_pages_longterm(umem->address, umem->npgs,
- gup_flags, &umem->pgs[0], NULL);
+ npgs = get_user_pages(umem->address, umem->npgs,
+ gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
up_read(&current->mm->mmap_sem);
if (npgs != umem->npgs) {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a704d1f9bd96..5fb0f1656a96 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -391,7 +391,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
spin_unlock(&kvm->mmu_lock);
ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start,
- range->end, range->blockable);
+ range->end,
+ mmu_notifier_range_blockable(range));
srcu_read_unlock(&kvm->srcu, idx);