From 234b239bea395316d7f78018c672f4a88b3cdf0d Mon Sep 17 00:00:00 2001 From: Andres Lagar-Cavilla Date: Wed, 17 Sep 2014 10:51:48 -0700 Subject: kvm: Faults which trigger IO release the mmap_sem When KVM handles a tdp fault it uses FOLL_NOWAIT. If the guest memory has been swapped out or is behind a filemap, this will trigger async readahead and return immediately. The rationale is that KVM will kick back the guest with an "async page fault" and allow for some other guest process to take over. If async PFs are enabled the fault is retried asap from an async workqueue. If not, it's retried immediately in the same code path. In either case the retry will not relinquish the mmap semaphore and will block on the IO. This is a bad thing, as other mmap semaphore users now stall as a function of swap or filemap latency. This patch ensures both the regular and async PF path re-enter the fault allowing for the mmap semaphore to be relinquished in the case of IO wait. Reviewed-by: Radim Krčmář Signed-off-by: Andres Lagar-Cavilla Acked-by: Andrew Morton Signed-off-by: Paolo Bonzini --- mm/gup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 91d044b1600d..af7ea3e0826b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -281,6 +281,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_ALLOW_RETRY; if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; + if (*flags & FOLL_TRIED) { + VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY); + fault_flags |= FAULT_FLAG_TRIED; + } ret = handle_mm_fault(mm, vma, address, fault_flags); if (ret & VM_FAULT_ERROR) { -- cgit v1.2.3 From 57128468080a8b6ea452223036d3e417f748af55 Mon Sep 17 00:00:00 2001 From: Andres Lagar-Cavilla Date: Mon, 22 Sep 2014 14:54:42 -0700 Subject: kvm: Fix page ageing bugs 1. We were calling clear_flush_young_notify in unmap_one, but we are within an mmu notifier invalidate range scope. The spte exists no more (due to range_start) and the accessed bit info has already been propagated (due to kvm_pfn_set_accessed). Simply call clear_flush_young. 2. We clear_flush_young on a primary MMU PMD, but this may be mapped as a collection of PTEs by the secondary MMU (e.g. during log-dirty). This required expanding the interface of the clear_flush_young mmu notifier, so a lot of code has been trivially touched. 3. In the absence of shadow_accessed_mask (e.g. EPT A bit), we emulate the access bit by blowing the spte. This requires proper synchronizing with MMU notifier consumers, like every other removal of spte's does. Signed-off-by: Andres Lagar-Cavilla Acked-by: Rik van Riel Signed-off-by: Paolo Bonzini --- arch/arm/include/asm/kvm_host.h | 3 ++- arch/arm64/include/asm/kvm_host.h | 3 ++- arch/powerpc/include/asm/kvm_host.h | 2 +- arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/kvm/book3s.c | 4 ++-- arch/powerpc/kvm/book3s.h | 3 ++- arch/powerpc/kvm/book3s_64_mmu_hv.c | 4 ++-- arch/powerpc/kvm/book3s_pr.c | 3 ++- arch/powerpc/kvm/e500_mmu_host.c | 2 +- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 38 ++++++++++++++++++++++--------------- drivers/iommu/amd_iommu_v2.c | 6 ++++-- include/linux/mmu_notifier.h | 24 ++++++++++++++++------- mm/mmu_notifier.c | 5 +++-- mm/rmap.c | 6 +++++- virt/kvm/kvm_main.c | 5 +++-- 16 files changed, 71 insertions(+), 41 deletions(-) (limited to 'mm') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 032a8538318a..8c3f7eb62b54 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -170,7 +170,8 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); /* We do not have shadow page tables, hence the empty hooks */ -static inline int kvm_age_hva(struct kvm *kvm, unsigned long hva) +static inline int kvm_age_hva(struct kvm *kvm, unsigned long start, + unsigned long end) { return 0; } diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index be9970a59497..a3c671b3acc9 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -180,7 +180,8 @@ int kvm_unmap_hva_range(struct kvm *kvm, void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); /* We do not have shadow page tables, hence the empty hooks */ -static inline int kvm_age_hva(struct kvm *kvm, unsigned long hva) +static inline int kvm_age_hva(struct kvm *kvm, unsigned long start, + unsigned long end) { return 0; } diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 604000882352..d329bc5543a2 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -56,7 +56,7 @@ extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); extern int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); -extern int kvm_age_hva(struct kvm *kvm, unsigned long hva); +extern int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index fb86a2299d8a..d4a92d7cea6a 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -243,7 +243,7 @@ struct kvmppc_ops { int (*unmap_hva)(struct kvm *kvm, unsigned long hva); int (*unmap_hva_range)(struct kvm *kvm, unsigned long start, unsigned long end); - int (*age_hva)(struct kvm *kvm, unsigned long hva); + int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end); int (*test_age_hva)(struct kvm *kvm, unsigned long hva); void (*set_spte_hva)(struct kvm *kvm, unsigned long hva, pte_t pte); void (*mmu_destroy)(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index dd03f6b299ba..c16cfbfeb781 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -851,9 +851,9 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end); } -int kvm_age_hva(struct kvm *kvm, unsigned long hva) +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - return kvm->arch.kvm_ops->age_hva(kvm, hva); + return kvm->arch.kvm_ops->age_hva(kvm, start, end); } int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h index 4bf956cf94d6..d2b3ec088b8c 100644 --- a/arch/powerpc/kvm/book3s.h +++ b/arch/powerpc/kvm/book3s.h @@ -17,7 +17,8 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, extern int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva); extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end); -extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long hva); +extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, + unsigned long end); extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva); extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 72c20bb16d26..81460c5359c0 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1002,11 +1002,11 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, return ret; } -int kvm_age_hva_hv(struct kvm *kvm, unsigned long hva) +int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end) { if (!kvm->arch.using_mmu_notifiers) return 0; - return kvm_handle_hva(kvm, hva, kvm_age_rmapp); + return kvm_handle_hva_range(kvm, start, end, kvm_age_rmapp); } static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index faffb27badd9..852fcd8951c4 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -295,7 +295,8 @@ static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start, return 0; } -static int kvm_age_hva_pr(struct kvm *kvm, unsigned long hva) +static int kvm_age_hva_pr(struct kvm *kvm, unsigned long start, + unsigned long end) { /* XXX could be more clever ;) */ return 0; diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 08f14bb57897..b1f3f630315e 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -732,7 +732,7 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) return 0; } -int kvm_age_hva(struct kvm *kvm, unsigned long hva) +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { /* XXX could be more clever ;) */ return 0; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index eeeb573fcf6f..763d273cab1d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1035,7 +1035,7 @@ asmlinkage void kvm_spurious_fault(void); #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_age_hva(struct kvm *kvm, unsigned long hva); +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 47d534066325..3201e93ebd07 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1417,18 +1417,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, struct rmap_iterator uninitialized_var(iter); int young = 0; - /* - * In case of absence of EPT Access and Dirty Bits supports, - * emulate the accessed bit for EPT, by checking if this page has - * an EPT mapping, and clearing it if it does. On the next access, - * a new EPT mapping will be established. - * This has some overhead, but not as much as the cost of swapping - * out actively used pages or breaking up actively used hugepages. - */ - if (!shadow_accessed_mask) { - young = kvm_unmap_rmapp(kvm, rmapp, slot, gfn, level, data); - goto out; - } + BUG_ON(!shadow_accessed_mask); for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { @@ -1440,7 +1429,6 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, (unsigned long *)sptep); } } -out: trace_kvm_age_page(gfn, level, slot, young); return young; } @@ -1489,9 +1477,29 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) kvm_flush_remote_tlbs(vcpu->kvm); } -int kvm_age_hva(struct kvm *kvm, unsigned long hva) +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { - return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); + /* + * In case of absence of EPT Access and Dirty Bits supports, + * emulate the accessed bit for EPT, by checking if this page has + * an EPT mapping, and clearing it if it does. On the next access, + * a new EPT mapping will be established. + * This has some overhead, but not as much as the cost of swapping + * out actively used pages or breaking up actively used hugepages. + */ + if (!shadow_accessed_mask) { + /* + * We are holding the kvm->mmu_lock, and we are blowing up + * shadow PTEs. MMU notifier consumers need to be kept at bay. + * This is correct as long as we don't decouple the mmu_lock + * protected regions (like invalidate_range_start|end does). + */ + kvm->mmu_notifier_seq++; + return kvm_handle_hva_range(kvm, start, end, 0, + kvm_unmap_rmapp); + } + + return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); } int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index 5f578e850fc5..90d734bbf467 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -402,9 +402,11 @@ static void __mn_flush_page(struct mmu_notifier *mn, static int mn_clear_flush_young(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long address) + unsigned long start, + unsigned long end) { - __mn_flush_page(mn, address); + for (; start < end; start += PAGE_SIZE) + __mn_flush_page(mn, start); return 0; } diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 27288692241e..88787bb4b3b9 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -57,10 +57,13 @@ struct mmu_notifier_ops { * pte. This way the VM will provide proper aging to the * accesses to the page through the secondary MMUs and not * only to the ones through the Linux pte. + * Start-end is necessary in case the secondary MMU is mapping the page + * at a smaller granularity than the primary MMU. */ int (*clear_flush_young)(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long address); + unsigned long start, + unsigned long end); /* * test_young is called to check the young/accessed bitflag in @@ -175,7 +178,8 @@ extern void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, - unsigned long address); + unsigned long start, + unsigned long end); extern int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, @@ -194,10 +198,11 @@ static inline void mmu_notifier_release(struct mm_struct *mm) } static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, - unsigned long address) + unsigned long start, + unsigned long end) { if (mm_has_notifiers(mm)) - return __mmu_notifier_clear_flush_young(mm, address); + return __mmu_notifier_clear_flush_young(mm, start, end); return 0; } @@ -255,7 +260,9 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) unsigned long ___address = __address; \ __young = ptep_clear_flush_young(___vma, ___address, __ptep); \ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ - ___address); \ + ___address, \ + ___address + \ + PAGE_SIZE); \ __young; \ }) @@ -266,7 +273,9 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) unsigned long ___address = __address; \ __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ - ___address); \ + ___address, \ + ___address + \ + PMD_SIZE); \ __young; \ }) @@ -301,7 +310,8 @@ static inline void mmu_notifier_release(struct mm_struct *mm) } static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, - unsigned long address) + unsigned long start, + unsigned long end) { return 0; } diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 950813b1eb36..2c8da9825fe3 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -107,7 +107,8 @@ void __mmu_notifier_release(struct mm_struct *mm) * existed or not. */ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, - unsigned long address) + unsigned long start, + unsigned long end) { struct mmu_notifier *mn; int young = 0, id; @@ -115,7 +116,7 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->clear_flush_young) - young |= mn->ops->clear_flush_young(mn, mm, address); + young |= mn->ops->clear_flush_young(mn, mm, start, end); } srcu_read_unlock(&srcu, id); diff --git a/mm/rmap.c b/mm/rmap.c index 3e8491c504f8..bc74e0012809 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1355,7 +1355,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, continue; /* don't unmap */ } - if (ptep_clear_flush_young_notify(vma, address, pte)) + /* + * No need for _notify because we're within an + * mmu_notifier_invalidate_range_ {start|end} scope. + */ + if (ptep_clear_flush_young(vma, address, pte)) continue; /* Nuke the page table entry. */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ff42b11d2b9c..0316314d48f4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -369,7 +369,8 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long address) + unsigned long start, + unsigned long end) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int young, idx; @@ -377,7 +378,7 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - young = kvm_age_hva(kvm, address); + young = kvm_age_hva(kvm, start, end); if (young) kvm_flush_remote_tlbs(kvm); -- cgit v1.2.3