diff options
author | Linus Torvalds | 2017-08-31 17:30:01 -0700 |
---|---|---|
committer | Linus Torvalds | 2017-08-31 17:30:01 -0700 |
commit | ea25c43179462e342d4a0e66c3f6a5f53514da05 (patch) | |
tree | ba4ab1aef8f166de65da4ad9e9162ff277f318a4 | |
parent | c227390c91a355300f47f9bef0aefbdfaaca1500 (diff) | |
parent | 5f32b265400de723ab0db23101a75ac073bdd980 (diff) |
Merge branch 'mmu_notifier_fixes'
Merge mmu_notifier fixes from Jérôme Glisse:
"The invalidate_page callback suffered from 2 pitfalls. First it used
to happen after page table lock was release and thus a new page might
have been setup for the virtual address before the call to
invalidate_page().
This is in a weird way fixed by commit c7ab0d2fdc84 ("mm: convert
try_to_unmap_one() to use page_vma_mapped_walk()") which moved the
callback under the page table lock. Which also broke several existing
user of the mmu_notifier API that assumed they could sleep inside this
callback.
The second pitfall was invalidate_page being the only callback not
taking a range of address in respect to invalidation but was giving an
address and a page. Lot of the callback implementer assumed this could
never be THP and thus failed to invalidate the appropriate range for
THP pages.
By killing this callback we unify the mmu_notifier callback API to
always take a virtual address range as input.
There is now two clear API (I am not mentioning the youngess API which
is seldomly used):
- invalidate_range_start()/end() callback (which allow you to sleep)
- invalidate_range() where you can not sleep but happen right after
page table update under page table lock
Note that a lot of existing user feels broken in respect to
range_start/ range_end. Many user only have range_start() callback but
there is nothing preventing them to undo what was invalidated in their
range_start() callback after it returns but before any CPU page table
update take place.
The code pattern use in kvm or umem odp is an example on how to
properly avoid such race. In a nutshell use some kind of sequence
number and active range invalidation counter to block anything that
might undo what the range_start() callback did.
If you do not care about keeping fully in sync with CPU page table (ie
you can live with CPU page table pointing to new different page for a
given virtual address) then you can take a reference on the pages
inside the range_start callback and drop it in range_end or when your
driver is done with those pages.
Last alternative is to use invalidate_range() if you can do
invalidation without sleeping as invalidate_range() callback happens
under the CPU page table spinlock right after the page table is
updated.
The first two patches convert existing mmu_notifier_invalidate_page()
calls to mmu_notifier_invalidate_range() and bracket those call with
call to mmu_notifier_invalidate_range_start()/end().
The next ten patches remove existing invalidate_page() callback as it
can no longer happen.
Finally the last page remove the invalidate_page() callback completely
so it can RIP.
Changes since v1:
- remove more dead code in kvm (no testing impact)
- more accurate end address computation (patch 2) in page_mkclean_one
and try_to_unmap_one
- added tested-by/reviewed-by gotten so far"
* emailed patches from Jérôme Glisse <jglisse@redhat.com>:
mm/mmu_notifier: kill invalidate_page
KVM: update to new mmu_notifier semantic v2
xen/gntdev: update to new mmu_notifier semantic
sgi-gru: update to new mmu_notifier semantic
misc/mic/scif: update to new mmu_notifier semantic
iommu/intel: update to new mmu_notifier semantic
iommu/amd: update to new mmu_notifier semantic
IB/hfi1: update to new mmu_notifier semantic
IB/umem: update to new mmu_notifier semantic
drm/amdgpu: update to new mmu_notifier semantic
powerpc/powernv: update to new mmu_notifier semantic
mm/rmap: update to new mmu_notifier semantic v2
dax: update to new mmu_notifier semantic
-rw-r--r-- | arch/arm/include/asm/kvm_host.h | 6 | ||||
-rw-r--r-- | arch/arm64/include/asm/kvm_host.h | 6 | ||||
-rw-r--r-- | arch/mips/include/asm/kvm_host.h | 5 | ||||
-rw-r--r-- | arch/powerpc/include/asm/kvm_host.h | 5 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/npu-dma.c | 10 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 11 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 31 | ||||
-rw-r--r-- | drivers/infiniband/core/umem_odp.c | 19 | ||||
-rw-r--r-- | drivers/infiniband/hw/hfi1/mmu_rb.c | 9 | ||||
-rw-r--r-- | drivers/iommu/amd_iommu_v2.c | 8 | ||||
-rw-r--r-- | drivers/iommu/intel-svm.c | 9 | ||||
-rw-r--r-- | drivers/misc/mic/scif/scif_dma.c | 11 | ||||
-rw-r--r-- | drivers/misc/sgi-gru/grutlbpurge.c | 12 | ||||
-rw-r--r-- | drivers/xen/gntdev.c | 8 | ||||
-rw-r--r-- | fs/dax.c | 19 | ||||
-rw-r--r-- | include/linux/mm.h | 1 | ||||
-rw-r--r-- | include/linux/mmu_notifier.h | 25 | ||||
-rw-r--r-- | mm/memory.c | 26 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 14 | ||||
-rw-r--r-- | mm/rmap.c | 35 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 42 |
22 files changed, 65 insertions, 249 deletions
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 127e2dd2e21c..4a879f6ff13b 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -225,12 +225,6 @@ int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -/* We do not have shadow page tables, hence the empty hooks */ -static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address) -{ -} - struct kvm_vcpu *kvm_arm_get_running_vcpu(void); struct kvm_vcpu __percpu **kvm_get_running_vcpus(void); void kvm_arm_halt_guest(struct kvm *kvm); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index d68630007b14..e923b58606e2 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -326,12 +326,6 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -/* We do not have shadow page tables, hence the empty hooks */ -static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address) -{ -} - struct kvm_vcpu *kvm_arm_get_running_vcpu(void); struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); void kvm_arm_halt_guest(struct kvm *kvm); diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 2998479fd4e8..a9af1d2dcd69 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -938,11 +938,6 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); -static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address) -{ -} - /* Emulation */ int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 8b3f1238d07f..e372ed871c51 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -67,11 +67,6 @@ extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); -static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address) -{ -} - #define HPTEG_CACHE_NUM (1 << 15) #define HPTEG_HASH_BITS_PTE 13 #define HPTEG_HASH_BITS_PTE_LONG 12 diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index b5d960d6db3d..4c7b8591f737 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -614,15 +614,6 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, mmio_invalidate(npu_context, 1, address, true); } -static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address) -{ - struct npu_context *npu_context = mn_to_npu_context(mn); - - mmio_invalidate(npu_context, 1, address, true); -} - static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) @@ -640,7 +631,6 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { .release = pnv_npu2_mn_release, .change_pte = pnv_npu2_mn_change_pte, - .invalidate_page = pnv_npu2_mn_invalidate_page, .invalidate_range = pnv_npu2_mn_invalidate_range, }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f4d120a3e22e..92c9032502d8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1375,8 +1375,6 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); -void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address); void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05a5e57c6f39..272320eb328c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6734,17 +6734,6 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); -void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, - unsigned long address) -{ - /* - * The physical address of apic access page is stored in the VMCS. - * Update it when it becomes invalid. - */ - if (address == gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT)) - kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); -} - /* * Returns 1 to let vcpu_run() continue the guest execution loop without * exiting to the userspace. Otherwise, the value will be returned to the diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index 6558a3ed57a7..e1cde6b80027 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -147,36 +147,6 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, } /** - * amdgpu_mn_invalidate_page - callback to notify about mm change - * - * @mn: our notifier - * @mn: the mm this callback is about - * @address: address of invalidate page - * - * Invalidation of a single page. Blocks for all BOs mapping it - * and unmap them by move them into system domain again. - */ -static void amdgpu_mn_invalidate_page(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address) -{ - struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn); - struct interval_tree_node *it; - - mutex_lock(&rmn->lock); - - it = interval_tree_iter_first(&rmn->objects, address, address); - if (it) { - struct amdgpu_mn_node *node; - - node = container_of(it, struct amdgpu_mn_node, it); - amdgpu_mn_invalidate_node(node, address, address); - } - - mutex_unlock(&rmn->lock); -} - -/** * amdgpu_mn_invalidate_range_start - callback to notify about mm change * * @mn: our notifier @@ -215,7 +185,6 @@ static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn, static const struct mmu_notifier_ops amdgpu_mn_ops = { .release = amdgpu_mn_release, - .invalidate_page = amdgpu_mn_invalidate_page, .invalidate_range_start = amdgpu_mn_invalidate_range_start, }; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 8c4ec564e495..55e8f5ed8b3c 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -166,24 +166,6 @@ static int invalidate_page_trampoline(struct ib_umem *item, u64 start, return 0; } -static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address) -{ - struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); - - if (!context->invalidate_range) - return; - - ib_ucontext_notifier_start_account(context); - down_read(&context->umem_rwsem); - rbt_ib_umem_for_each_in_range(&context->umem_tree, address, - address + PAGE_SIZE, - invalidate_page_trampoline, NULL); - up_read(&context->umem_rwsem); - ib_ucontext_notifier_end_account(context); -} - static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, u64 end, void *cookie) { @@ -237,7 +219,6 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, static const struct mmu_notifier_ops ib_umem_notifiers = { .release = ib_umem_notifier_release, - .invalidate_page = ib_umem_notifier_invalidate_page, .invalidate_range_start = ib_umem_notifier_invalidate_range_start, .invalidate_range_end = ib_umem_notifier_invalidate_range_end, }; diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index ccbf52c8ff6f..e4b56a0dd6d0 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -67,8 +67,6 @@ struct mmu_rb_handler { static unsigned long mmu_node_start(struct mmu_rb_node *); static unsigned long mmu_node_last(struct mmu_rb_node *); -static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *, - unsigned long); static inline void mmu_notifier_range_start(struct mmu_notifier *, struct mm_struct *, unsigned long, unsigned long); @@ -82,7 +80,6 @@ static void do_remove(struct mmu_rb_handler *handler, static void handle_remove(struct work_struct *work); static const struct mmu_notifier_ops mn_opts = { - .invalidate_page = mmu_notifier_page, .invalidate_range_start = mmu_notifier_range_start, }; @@ -285,12 +282,6 @@ void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, handler->ops->remove(handler->ops_arg, node); } -static inline void mmu_notifier_page(struct mmu_notifier *mn, - struct mm_struct *mm, unsigned long addr) -{ - mmu_notifier_mem_invalidate(mn, mm, addr, addr + PAGE_SIZE); -} - static inline void mmu_notifier_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index 6629c472eafd..dccf5b76eff2 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -391,13 +391,6 @@ static int mn_clear_flush_young(struct mmu_notifier *mn, return 0; } -static void mn_invalidate_page(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address) -{ - __mn_flush_page(mn, address); -} - static void mn_invalidate_range(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end) @@ -436,7 +429,6 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) static const struct mmu_notifier_ops iommu_mn = { .release = mn_release, .clear_flush_young = mn_clear_flush_young, - .invalidate_page = mn_invalidate_page, .invalidate_range = mn_invalidate_range, }; diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index f167c0d84ebf..f620dccec8ee 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -223,14 +223,6 @@ static void intel_change_pte(struct mmu_notifier *mn, struct mm_struct *mm, intel_flush_svm_range(svm, address, 1, 1, 0); } -static void intel_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long address) -{ - struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); - - intel_flush_svm_range(svm, address, 1, 1, 0); -} - /* Pages have been freed at this point */ static void intel_invalidate_range(struct mmu_notifier *mn, struct mm_struct *mm, @@ -285,7 +277,6 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) static const struct mmu_notifier_ops intel_mmuops = { .release = intel_mm_release, .change_pte = intel_change_pte, - .invalidate_page = intel_invalidate_page, .invalidate_range = intel_invalidate_range, }; diff --git a/drivers/misc/mic/scif/scif_dma.c b/drivers/misc/mic/scif/scif_dma.c index 64d5760d069a..63d6246d6dff 100644 --- a/drivers/misc/mic/scif/scif_dma.c +++ b/drivers/misc/mic/scif/scif_dma.c @@ -200,16 +200,6 @@ static void scif_mmu_notifier_release(struct mmu_notifier *mn, schedule_work(&scif_info.misc_work); } -static void scif_mmu_notifier_invalidate_page(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address) -{ - struct scif_mmu_notif *mmn; - - mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier); - scif_rma_destroy_tcw(mmn, address, PAGE_SIZE); -} - static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, @@ -235,7 +225,6 @@ static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, static const struct mmu_notifier_ops scif_mmu_notifier_ops = { .release = scif_mmu_notifier_release, .clear_flush_young = NULL, - .invalidate_page = scif_mmu_notifier_invalidate_page, .invalidate_range_start = scif_mmu_notifier_invalidate_range_start, .invalidate_range_end = scif_mmu_notifier_invalidate_range_end}; diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c index e936d43895d2..9918eda0e05f 100644 --- a/drivers/misc/sgi-gru/grutlbpurge.c +++ b/drivers/misc/sgi-gru/grutlbpurge.c @@ -247,17 +247,6 @@ static void gru_invalidate_range_end(struct mmu_notifier *mn, gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx\n", gms, start, end); } -static void gru_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long address) -{ - struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct, - ms_notifier); - - STAT(mmu_invalidate_page); - gru_flush_tlb_range(gms, address, PAGE_SIZE); - gru_dbg(grudev, "gms %p, address 0x%lx\n", gms, address); -} - static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct, @@ -269,7 +258,6 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm) static const struct mmu_notifier_ops gru_mmuops = { - .invalidate_page = gru_invalidate_page, .invalidate_range_start = gru_invalidate_range_start, .invalidate_range_end = gru_invalidate_range_end, .release = gru_release, diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index f3bf8f4e2d6c..82360594fa8e 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -484,13 +484,6 @@ static void mn_invl_range_start(struct mmu_notifier *mn, mutex_unlock(&priv->lock); } -static void mn_invl_page(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address) -{ - mn_invl_range_start(mn, mm, address, address + PAGE_SIZE); -} - static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) { @@ -522,7 +515,6 @@ static void mn_release(struct mmu_notifier *mn, static const struct mmu_notifier_ops gntdev_mmu_ops = { .release = mn_release, - .invalidate_page = mn_invl_page, .invalidate_range_start = mn_invl_range_start, }; @@ -646,11 +646,10 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pte_t pte, *ptep = NULL; pmd_t *pmdp = NULL; spinlock_t *ptl; - bool changed; i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { - unsigned long address; + unsigned long address, start, end; cond_resched(); @@ -658,8 +657,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, continue; address = pgoff_address(index, vma); - changed = false; - if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl)) + + /* + * Note because we provide start/end to follow_pte_pmd it will + * call mmu_notifier_invalidate_range_start() on our behalf + * before taking any lock. + */ + if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl)) continue; if (pmdp) { @@ -676,7 +680,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); set_pmd_at(vma->vm_mm, address, pmdp, pmd); - changed = true; + mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pmd: spin_unlock(ptl); #endif @@ -691,13 +695,12 @@ unlock_pmd: pte = pte_wrprotect(pte); pte = pte_mkclean(pte); set_pte_at(vma->vm_mm, address, ptep, pte); - changed = true; + mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pte: pte_unmap_unlock(ptep, ptl); } - if (changed) - mmu_notifier_invalidate_page(vma->vm_mm, address); + mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); } i_mmap_unlock_read(mapping); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 46b9ac5e8569..c1f6c95f3496 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1260,6 +1260,7 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, + unsigned long *start, unsigned long *end, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index c91b3bcd158f..7b2e31b1745a 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -95,17 +95,6 @@ struct mmu_notifier_ops { pte_t pte); /* - * Before this is invoked any secondary MMU is still ok to - * read/write to the page previously pointed to by the Linux - * pte because the page hasn't been freed yet and it won't be - * freed until this returns. If required set_page_dirty has to - * be called internally to this method. - */ - void (*invalidate_page)(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address); - - /* * invalidate_range_start() and invalidate_range_end() must be * paired and are called only when the mmap_sem and/or the * locks protecting the reverse maps are held. If the subsystem @@ -220,8 +209,6 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); -extern void __mmu_notifier_invalidate_page(struct mm_struct *mm, - unsigned long address); extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end); extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, @@ -268,13 +255,6 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm, __mmu_notifier_change_pte(mm, address, pte); } -static inline void mmu_notifier_invalidate_page(struct mm_struct *mm, - unsigned long address) -{ - if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_page(mm, address); -} - static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end) { @@ -442,11 +422,6 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm, { } -static inline void mmu_notifier_invalidate_page(struct mm_struct *mm, - unsigned long address) -{ -} - static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end) { diff --git a/mm/memory.c b/mm/memory.c index fe2fba27ded2..56e48e4593cb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4008,7 +4008,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) #endif /* __PAGETABLE_PMD_FOLDED */ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) + unsigned long *start, unsigned long *end, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { pgd_t *pgd; p4d_t *p4d; @@ -4035,17 +4036,29 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, if (!pmdpp) goto out; + if (start && end) { + *start = address & PMD_MASK; + *end = *start + PMD_SIZE; + mmu_notifier_invalidate_range_start(mm, *start, *end); + } *ptlp = pmd_lock(mm, pmd); if (pmd_huge(*pmd)) { *pmdpp = pmd; return 0; } spin_unlock(*ptlp); + if (start && end) + mmu_notifier_invalidate_range_end(mm, *start, *end); } if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; + if (start && end) { + *start = address & PAGE_MASK; + *end = *start + PAGE_SIZE; + mmu_notifier_invalidate_range_start(mm, *start, *end); + } ptep = pte_offset_map_lock(mm, pmd, address, ptlp); if (!pte_present(*ptep)) goto unlock; @@ -4053,6 +4066,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, return 0; unlock: pte_unmap_unlock(ptep, *ptlp); + if (start && end) + mmu_notifier_invalidate_range_end(mm, *start, *end); out: return -EINVAL; } @@ -4064,20 +4079,21 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address, /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, ptepp, NULL, - ptlp))); + !(res = __follow_pte_pmd(mm, address, NULL, NULL, + ptepp, NULL, ptlp))); return res; } int follow_pte_pmd(struct mm_struct *mm, unsigned long address, + unsigned long *start, unsigned long *end, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { int res; /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, ptepp, pmdpp, - ptlp))); + !(res = __follow_pte_pmd(mm, address, start, end, + ptepp, pmdpp, ptlp))); return res; } EXPORT_SYMBOL(follow_pte_pmd); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 54ca54562928..314285284e6e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -174,20 +174,6 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, srcu_read_unlock(&srcu, id); } -void __mmu_notifier_invalidate_page(struct mm_struct *mm, - unsigned long address) -{ - struct mmu_notifier *mn; - int id; - - id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->invalidate_page) - mn->ops->invalidate_page(mn, mm, address); - } - srcu_read_unlock(&srcu, id); -} - void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end) { diff --git a/mm/rmap.c b/mm/rmap.c index c8993c63eb25..c570f82e6827 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -887,11 +887,21 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, .address = address, .flags = PVMW_SYNC, }; + unsigned long start = address, end; int *cleaned = arg; + /* + * We have to assume the worse case ie pmd for invalidation. Note that + * the page can not be free from this function. + */ + end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); + mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); + while (page_vma_mapped_walk(&pvmw)) { + unsigned long cstart, cend; int ret = 0; - address = pvmw.address; + + cstart = address = pvmw.address; if (pvmw.pte) { pte_t entry; pte_t *pte = pvmw.pte; @@ -904,6 +914,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(vma->vm_mm, address, pte, entry); + cend = cstart + PAGE_SIZE; ret = 1; } else { #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE @@ -918,6 +929,8 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry); + cstart &= PMD_MASK; + cend = cstart + PMD_SIZE; ret = 1; #else /* unexpected pmd-mapped page? */ @@ -926,11 +939,13 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, } if (ret) { - mmu_notifier_invalidate_page(vma->vm_mm, address); + mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend); (*cleaned)++; } } + mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); + return true; } @@ -1324,6 +1339,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, pte_t pteval; struct page *subpage; bool ret = true; + unsigned long start = address, end; enum ttu_flags flags = (enum ttu_flags)arg; /* munlock has nothing to gain from examining un-locked vmas */ @@ -1335,6 +1351,14 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, flags & TTU_MIGRATION, page); } + /* + * We have to assume the worse case ie pmd for invalidation. Note that + * the page can not be free in this function as call of try_to_unmap() + * must hold a reference on the page. + */ + end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page))); + mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); + while (page_vma_mapped_walk(&pvmw)) { /* * If the page is mlock()d, we cannot swap it out. @@ -1445,6 +1469,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { WARN_ON_ONCE(1); ret = false; + /* We have to invalidate as we cleared the pte */ page_vma_mapped_walk_done(&pvmw); break; } @@ -1490,8 +1515,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, discard: page_remove_rmap(subpage, PageHuge(page)); put_page(page); - mmu_notifier_invalidate_page(mm, address); + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); } + + mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); + return ret; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 15252d723b54..4d81f6ded88e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -322,47 +322,6 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) return container_of(mn, struct kvm, mmu_notifier); } -static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address) -{ - struct kvm *kvm = mmu_notifier_to_kvm(mn); - int need_tlb_flush, idx; - - /* - * When ->invalidate_page runs, the linux pte has been zapped - * already but the page is still allocated until - * ->invalidate_page returns. So if we increase the sequence - * here the kvm page fault will notice if the spte can't be - * established because the page is going to be freed. If - * instead the kvm page fault establishes the spte before - * ->invalidate_page runs, kvm_unmap_hva will release it - * before returning. - * - * The sequence increase only need to be seen at spin_unlock - * time, and not at spin_lock time. - * - * Increasing the sequence after the spin_unlock would be - * unsafe because the kvm page fault could then establish the - * pte after kvm_unmap_hva returned, without noticing the page - * is going to be freed. - */ - idx = srcu_read_lock(&kvm->srcu); - spin_lock(&kvm->mmu_lock); - - kvm->mmu_notifier_seq++; - need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; - /* we've to flush the tlb before the pages can be freed */ - if (need_tlb_flush) - kvm_flush_remote_tlbs(kvm); - - spin_unlock(&kvm->mmu_lock); - - kvm_arch_mmu_notifier_invalidate_page(kvm, address); - - srcu_read_unlock(&kvm->srcu, idx); -} - static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address, @@ -510,7 +469,6 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { - .invalidate_page = kvm_mmu_notifier_invalidate_page, .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, |