author     Linus Torvalds  2019-11-30 10:33:14 -0800
committer  Linus Torvalds  2019-11-30 10:33:14 -0800
commit     aa32f1169148beb90d71494e2f2a1999ba7b5366
tree       ec8c434bff07bf0beb2df08629089824927f62f9  /drivers/infiniband
parent     d5bb349dbbe27537e90a03b9597deeb07723a86d
parent     93f4e735b6d98ee4b7a1252d81e815a983e359f2
Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull hmm updates from Jason Gunthorpe:
"This is another round of bug fixing and cleanup. This time the focus
is on the driver pattern to use mmu notifiers to monitor a VA range.
This code is lifted out of many drivers and hmm_mirror directly into
the mmu_notifier core and written using the best ideas from all the
driver implementations.
This removes many bugs from the drivers and has a very pleasing
diffstat. More drivers can still be converted, but that is for another
cycle.
- A shared branch with RDMA reworking the RDMA ODP implementation
- New mmu_interval_notifier API. This is focused on the use case of
monitoring a VA and simplifies the process for drivers
- A common seq-count locking scheme built into the
mmu_interval_notifier API usable by drivers that call
get_user_pages() or hmm_range_fault() with the VA range (a minimal
sketch of this pattern follows the shortlog below)
- Conversion of mlx5 ODP, hfi1, radeon, nouveau, AMD GPU, and Xen
GntDev drivers to the new API. This deletes a lot of wonky driver
code.
- Two improvements for hmm_range_fault(), from testing done by Ralph"
* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma:
mm/hmm: remove hmm_range_dma_map and hmm_range_dma_unmap
mm/hmm: make full use of walk_page_range()
xen/gntdev: use mmu_interval_notifier_insert
mm/hmm: remove hmm_mirror and related
drm/amdgpu: Use mmu_interval_notifier instead of hmm_mirror
drm/amdgpu: Use mmu_interval_insert instead of hmm_mirror
drm/amdgpu: Call find_vma under mmap_sem
nouveau: use mmu_interval_notifier instead of hmm_mirror
nouveau: use mmu_notifier directly for invalidate_range_start
drm/radeon: use mmu_interval_notifier_insert
RDMA/hfi1: Use mmu_interval_notifier_insert for user_exp_rcv
RDMA/odp: Use mmu_interval_notifier_insert()
mm/hmm: define the pre-processor related parts of hmm.h even if disabled
mm/hmm: allow hmm_range to be used with a mmu_interval_notifier or hmm_mirror
mm/mmu_notifier: add an interval tree notifier
mm/mmu_notifier: define the header pre-processor parts even if disabled
mm/hmm: allow snapshot of the special zero page
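
To make the mmu_interval_notifier API and its seq-count locking scheme concrete, here is a minimal sketch of the driver pattern the conversions below implement. It is illustrative only: my_interval, my_invalidate, my_populate and the zap/program call-outs are hypothetical placeholders; the mmu_interval_* calls are the API added by this series.

#include <linux/mmu_notifier.h>
#include <linux/spinlock.h>

struct my_interval {
	struct mmu_interval_notifier notifier;
	spinlock_t lock;	/* initialised elsewhere; also taken when
				 * programming device page tables */
};

/* Invalidation side: called when the monitored VA range changes. */
static bool my_invalidate(struct mmu_interval_notifier *mni,
			  const struct mmu_notifier_range *range,
			  unsigned long cur_seq)
{
	struct my_interval *mi =
		container_of(mni, struct my_interval, notifier);

	if (!mmu_notifier_range_blockable(range))
		return false;

	spin_lock(&mi->lock);
	/* Publish the new sequence under the same lock the fault path takes */
	mmu_interval_set_seq(mni, cur_seq);
	/* ... zap device mappings overlapping range->start..range->end ... */
	spin_unlock(&mi->lock);
	return true;
}

static const struct mmu_interval_notifier_ops my_ops = {
	.invalidate = my_invalidate,
};

/*
 * Fault side. The notifier is registered once with
 * mmu_interval_notifier_insert(&mi->notifier, mm, start, length, &my_ops);
 * afterwards every (re)population of device mappings runs this loop.
 */
static void my_populate(struct my_interval *mi)
{
	unsigned long seq;

	do {
		seq = mmu_interval_read_begin(&mi->notifier);
		/* ... get_user_pages()/hmm_range_fault() on the VA range ... */
		spin_lock(&mi->lock);
		if (!mmu_interval_read_retry(&mi->notifier, seq)) {
			/* ... program device page tables from the result ... */
			spin_unlock(&mi->lock);
			return;
		}
		/* Collided with an invalidation; drop the pages and retry */
		spin_unlock(&mi->lock);
	} while (true);
}

The mlx5 and hfi1 hunks in the diff below follow exactly this shape: the invalidate callback publishes the new sequence under the driver lock and tears down device mappings, while the page-fault path loops on read_begin/read_retry before programming hardware.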
Diffstat (limited to 'drivers/infiniband')
-rw-r--r--   drivers/infiniband/core/device.c              1
-rw-r--r--   drivers/infiniband/core/umem_odp.c           303
-rw-r--r--   drivers/infiniband/hw/hfi1/file_ops.c          2
-rw-r--r--   drivers/infiniband/hw/hfi1/hfi.h               2
-rw-r--r--   drivers/infiniband/hw/hfi1/user_exp_rcv.c    146
-rw-r--r--   drivers/infiniband/hw/hfi1/user_exp_rcv.h      3
-rw-r--r--   drivers/infiniband/hw/mlx5/mlx5_ib.h           7
-rw-r--r--   drivers/infiniband/hw/mlx5/mr.c                3
-rw-r--r--   drivers/infiniband/hw/mlx5/odp.c              50
9 files changed, 128 insertions, 389 deletions
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 9ebb09a75049..84dd74fe13b8 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2634,7 +2634,6 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
 	SET_DEVICE_OP(dev_ops, get_vf_guid);
 	SET_DEVICE_OP(dev_ops, get_vf_stats);
 	SET_DEVICE_OP(dev_ops, init_port);
-	SET_DEVICE_OP(dev_ops, invalidate_range);
 	SET_DEVICE_OP(dev_ops, iw_accept);
 	SET_DEVICE_OP(dev_ops, iw_add_ref);
 	SET_DEVICE_OP(dev_ops, iw_connect);
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index d7d5fadf0899..e42d44e501fd 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -48,197 +48,33 @@
 #include "uverbs.h"
 
-static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
+static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
+				   const struct mmu_interval_notifier_ops *ops)
 {
-	mutex_lock(&umem_odp->umem_mutex);
-	if (umem_odp->notifiers_count++ == 0)
-		/*
-		 * Initialize the completion object for waiting on
-		 * notifiers. Since notifier_count is zero, no one should be
-		 * waiting right now.
-		 */
-		reinit_completion(&umem_odp->notifier_completion);
-	mutex_unlock(&umem_odp->umem_mutex);
-}
-
-static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
-{
-	mutex_lock(&umem_odp->umem_mutex);
-	/*
-	 * This sequence increase will notify the QP page fault that the page
-	 * that is going to be mapped in the spte could have been freed.
-	 */
-	++umem_odp->notifiers_seq;
-	if (--umem_odp->notifiers_count == 0)
-		complete_all(&umem_odp->notifier_completion);
-	mutex_unlock(&umem_odp->umem_mutex);
-}
-
-static void ib_umem_notifier_release(struct mmu_notifier *mn,
-				     struct mm_struct *mm)
-{
-	struct ib_ucontext_per_mm *per_mm =
-		container_of(mn, struct ib_ucontext_per_mm, mn);
-	struct rb_node *node;
-
-	down_read(&per_mm->umem_rwsem);
-	if (!per_mm->mn.users)
-		goto out;
-
-	for (node = rb_first_cached(&per_mm->umem_tree); node;
-	     node = rb_next(node)) {
-		struct ib_umem_odp *umem_odp =
-			rb_entry(node, struct ib_umem_odp, interval_tree.rb);
-
-		/*
-		 * Increase the number of notifiers running, to prevent any
-		 * further fault handling on this MR.
-		 */
-		ib_umem_notifier_start_account(umem_odp);
-		complete_all(&umem_odp->notifier_completion);
-		umem_odp->umem.ibdev->ops.invalidate_range(
-			umem_odp, ib_umem_start(umem_odp),
-			ib_umem_end(umem_odp));
-	}
-
-out:
-	up_read(&per_mm->umem_rwsem);
-}
-
-static int invalidate_range_start_trampoline(struct ib_umem_odp *item,
-					     u64 start, u64 end, void *cookie)
-{
-	ib_umem_notifier_start_account(item);
-	item->umem.ibdev->ops.invalidate_range(item, start, end);
-	return 0;
-}
-
-static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
-				const struct mmu_notifier_range *range)
-{
-	struct ib_ucontext_per_mm *per_mm =
-		container_of(mn, struct ib_ucontext_per_mm, mn);
-	int rc;
-
-	if (mmu_notifier_range_blockable(range))
-		down_read(&per_mm->umem_rwsem);
-	else if (!down_read_trylock(&per_mm->umem_rwsem))
-		return -EAGAIN;
-
-	if (!per_mm->mn.users) {
-		up_read(&per_mm->umem_rwsem);
-		/*
-		 * At this point users is permanently zero and visible to this
-		 * CPU without a lock, that fact is relied on to skip the unlock
-		 * in range_end.
-		 */
-		return 0;
-	}
-
-	rc = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
-					   range->end,
-					   invalidate_range_start_trampoline,
-					   mmu_notifier_range_blockable(range),
-					   NULL);
-	if (rc)
-		up_read(&per_mm->umem_rwsem);
-	return rc;
-}
-
-static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start,
-					   u64 end, void *cookie)
-{
-	ib_umem_notifier_end_account(item);
-	return 0;
-}
-
-static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
-				const struct mmu_notifier_range *range)
-{
-	struct ib_ucontext_per_mm *per_mm =
-		container_of(mn, struct ib_ucontext_per_mm, mn);
-
-	if (unlikely(!per_mm->mn.users))
-		return;
-
-	rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
-				      range->end,
-				      invalidate_range_end_trampoline, true, NULL);
-	up_read(&per_mm->umem_rwsem);
-}
-
-static struct mmu_notifier *ib_umem_alloc_notifier(struct mm_struct *mm)
-{
-	struct ib_ucontext_per_mm *per_mm;
-
-	per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL);
-	if (!per_mm)
-		return ERR_PTR(-ENOMEM);
-
-	per_mm->umem_tree = RB_ROOT_CACHED;
-	init_rwsem(&per_mm->umem_rwsem);
-
-	WARN_ON(mm != current->mm);
-	rcu_read_lock();
-	per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
-	rcu_read_unlock();
-	return &per_mm->mn;
-}
-
-static void ib_umem_free_notifier(struct mmu_notifier *mn)
-{
-	struct ib_ucontext_per_mm *per_mm =
-		container_of(mn, struct ib_ucontext_per_mm, mn);
-
-	WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root));
-
-	put_pid(per_mm->tgid);
-	kfree(per_mm);
-}
-
-static const struct mmu_notifier_ops ib_umem_notifiers = {
-	.release = ib_umem_notifier_release,
-	.invalidate_range_start = ib_umem_notifier_invalidate_range_start,
-	.invalidate_range_end = ib_umem_notifier_invalidate_range_end,
-	.alloc_notifier = ib_umem_alloc_notifier,
-	.free_notifier = ib_umem_free_notifier,
-};
-
-static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp)
-{
-	struct ib_ucontext_per_mm *per_mm;
-	struct mmu_notifier *mn;
 	int ret;
 
 	umem_odp->umem.is_odp = 1;
+	mutex_init(&umem_odp->umem_mutex);
+
 	if (!umem_odp->is_implicit_odp) {
 		size_t page_size = 1UL << umem_odp->page_shift;
+		unsigned long start;
+		unsigned long end;
 		size_t pages;
 
-		umem_odp->interval_tree.start =
-			ALIGN_DOWN(umem_odp->umem.address, page_size);
+		start = ALIGN_DOWN(umem_odp->umem.address, page_size);
 		if (check_add_overflow(umem_odp->umem.address,
 				       (unsigned long)umem_odp->umem.length,
-				       &umem_odp->interval_tree.last))
+				       &end))
 			return -EOVERFLOW;
-		umem_odp->interval_tree.last =
-			ALIGN(umem_odp->interval_tree.last, page_size);
-		if (unlikely(umem_odp->interval_tree.last < page_size))
+		end = ALIGN(end, page_size);
+		if (unlikely(end < page_size))
 			return -EOVERFLOW;
 
-		pages = (umem_odp->interval_tree.last -
-			 umem_odp->interval_tree.start) >>
-			umem_odp->page_shift;
+		pages = (end - start) >> umem_odp->page_shift;
 		if (!pages)
 			return -EINVAL;
 
-		/*
-		 * Note that the representation of the intervals in the
-		 * interval tree considers the ending point as contained in
-		 * the interval.
-		 */
-		umem_odp->interval_tree.last--;
-
 		umem_odp->page_list = kvcalloc(
 			pages, sizeof(*umem_odp->page_list), GFP_KERNEL);
 		if (!umem_odp->page_list)
@@ -250,26 +86,13 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp)
 			ret = -ENOMEM;
 			goto out_page_list;
 		}
-	}
 
-	mn = mmu_notifier_get(&ib_umem_notifiers, umem_odp->umem.owning_mm);
-	if (IS_ERR(mn)) {
-		ret = PTR_ERR(mn);
-		goto out_dma_list;
+		ret = mmu_interval_notifier_insert(&umem_odp->notifier,
+						   umem_odp->umem.owning_mm,
+						   start, end - start, ops);
+		if (ret)
+			goto out_dma_list;
 	}
-	umem_odp->per_mm = per_mm =
-		container_of(mn, struct ib_ucontext_per_mm, mn);
-
-	mutex_init(&umem_odp->umem_mutex);
-	init_completion(&umem_odp->notifier_completion);
-
-	if (!umem_odp->is_implicit_odp) {
-		down_write(&per_mm->umem_rwsem);
-		interval_tree_insert(&umem_odp->interval_tree,
-				     &per_mm->umem_tree);
-		up_write(&per_mm->umem_rwsem);
-	}
-	mmgrab(umem_odp->umem.owning_mm);
 
 	return 0;
@@ -305,8 +128,6 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
 	if (!context)
 		return ERR_PTR(-EIO);
 
-	if (WARN_ON_ONCE(!context->device->ops.invalidate_range))
-		return ERR_PTR(-EINVAL);
 
 	umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
 	if (!umem_odp)
@@ -318,8 +139,10 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
 	umem_odp->is_implicit_odp = 1;
 	umem_odp->page_shift = PAGE_SHIFT;
 
-	ret = ib_init_umem_odp(umem_odp);
+	umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
+	ret = ib_init_umem_odp(umem_odp, NULL);
 	if (ret) {
+		put_pid(umem_odp->tgid);
 		kfree(umem_odp);
 		return ERR_PTR(ret);
 	}
@@ -336,8 +159,10 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
  * @addr: The starting userspace VA
  * @size: The length of the userspace VA
  */
-struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root,
-					    unsigned long addr, size_t size)
+struct ib_umem_odp *
+ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
+			size_t size,
+			const struct mmu_interval_notifier_ops *ops)
 {
 	/*
 	 * Caller must ensure that root cannot be freed during the call to
@@ -360,9 +185,12 @@ struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root,
 	umem->writable   = root->umem.writable;
 	umem->owning_mm  = root->umem.owning_mm;
 	odp_data->page_shift = PAGE_SHIFT;
+	odp_data->notifier.ops = ops;
 
-	ret = ib_init_umem_odp(odp_data);
+	odp_data->tgid = get_pid(root->tgid);
+	ret = ib_init_umem_odp(odp_data, ops);
 	if (ret) {
+		put_pid(odp_data->tgid);
 		kfree(odp_data);
 		return ERR_PTR(ret);
 	}
@@ -383,7 +211,8 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_child);
  * conjunction with MMU notifiers.
  */
 struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
-				    size_t size, int access)
+				    size_t size, int access,
+				    const struct mmu_interval_notifier_ops *ops)
 {
 	struct ib_umem_odp *umem_odp;
 	struct ib_ucontext *context;
@@ -398,8 +227,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
 	if (!context)
 		return ERR_PTR(-EIO);
 
-	if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)) ||
-	    WARN_ON_ONCE(!context->device->ops.invalidate_range))
+	if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
 		return ERR_PTR(-EINVAL);
 
 	umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
@@ -411,6 +239,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
 	umem_odp->umem.address = addr;
 	umem_odp->umem.writable = ib_access_writable(access);
 	umem_odp->umem.owning_mm = mm = current->mm;
+	umem_odp->notifier.ops = ops;
 
 	umem_odp->page_shift = PAGE_SHIFT;
 	if (access & IB_ACCESS_HUGETLB) {
@@ -429,11 +258,14 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
 		up_read(&mm->mmap_sem);
 	}
 
-	ret = ib_init_umem_odp(umem_odp);
+	umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
+	ret = ib_init_umem_odp(umem_odp, ops);
 	if (ret)
-		goto err_free;
+		goto err_put_pid;
 	return umem_odp;
 
+err_put_pid:
+	put_pid(umem_odp->tgid);
 err_free:
 	kfree(umem_odp);
 	return ERR_PTR(ret);
@@ -442,8 +274,6 @@ EXPORT_SYMBOL(ib_umem_odp_get);
 
 void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
 {
-	struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
-
 	/*
 	 * Ensure that no more pages are mapped in the umem.
 	 *
@@ -455,28 +285,11 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
 		ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
 					    ib_umem_end(umem_odp));
 		mutex_unlock(&umem_odp->umem_mutex);
+		mmu_interval_notifier_remove(&umem_odp->notifier);
 		kvfree(umem_odp->dma_list);
 		kvfree(umem_odp->page_list);
+		put_pid(umem_odp->tgid);
 	}
-
-	down_write(&per_mm->umem_rwsem);
-	if (!umem_odp->is_implicit_odp) {
-		interval_tree_remove(&umem_odp->interval_tree,
-				     &per_mm->umem_tree);
-		complete_all(&umem_odp->notifier_completion);
-	}
-	/*
-	 * NOTE! mmu_notifier_unregister() can happen between a start/end
-	 * callback, resulting in a missing end, and thus an unbalanced
-	 * lock. This doesn't really matter to us since we are about to kfree
-	 * the memory that holds the lock, however LOCKDEP doesn't like this.
-	 * Thus we call the mmu_notifier_put under the rwsem and test the
-	 * internal users count to reliably see if we are past this point.
-	 */
-	mmu_notifier_put(&per_mm->mn);
-	up_write(&per_mm->umem_rwsem);
-
-	mmdrop(umem_odp->umem.owning_mm);
 	kfree(umem_odp);
 }
 EXPORT_SYMBOL(ib_umem_odp_release);
@@ -501,7 +314,7 @@ EXPORT_SYMBOL(ib_umem_odp_release);
  */
 static int ib_umem_odp_map_dma_single_page(
 		struct ib_umem_odp *umem_odp,
-		int page_index,
+		unsigned int page_index,
 		struct page *page,
 		u64 access_mask,
 		unsigned long current_seq)
@@ -510,12 +323,7 @@ static int ib_umem_odp_map_dma_single_page(
 	dma_addr_t dma_addr;
 	int ret = 0;
 
-	/*
-	 * Note: we avoid writing if seq is different from the initial seq, to
-	 * handle case of a racing notifier. This check also allows us to bail
-	 * early if we have a notifier running in parallel with us.
-	 */
-	if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) {
+	if (mmu_interval_check_retry(&umem_odp->notifier, current_seq)) {
 		ret = -EAGAIN;
 		goto out;
 	}
@@ -618,7 +426,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt,
 	 * existing beyond the lifetime of the originating process.. Presumably
 	 * mmget_not_zero will fail in this case.
 	 */
-	owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID);
+	owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID);
 	if (!owning_process || !mmget_not_zero(owning_mm)) {
 		ret = -EINVAL;
 		goto out_put_task;
@@ -762,32 +570,3 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 	}
 }
 EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
-
-/* @last is not a part of the interval. See comment for function
- * node_last.
- */
-int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
-				  u64 start, u64 last,
-				  umem_call_back cb,
-				  bool blockable,
-				  void *cookie)
-{
-	int ret_val = 0;
-	struct interval_tree_node *node, *next;
-	struct ib_umem_odp *umem;
-
-	if (unlikely(start == last))
-		return ret_val;
-
-	for (node = interval_tree_iter_first(root, start, last - 1);
-			node; node = next) {
-		/* TODO move the blockable decision up to the callback */
-		if (!blockable)
-			return -EAGAIN;
-		next = interval_tree_iter_next(node, start, last - 1);
-		umem = container_of(node, struct ib_umem_odp, interval_tree);
-		ret_val = cb(umem, start, last, cookie) || ret_val;
-	}
-
-	return ret_val;
-}
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index f9a7e9d29c8b..7c5e3fb22413 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -1138,7 +1138,7 @@ static int get_ctxt_info(struct hfi1_filedata *fd, unsigned long arg, u32 len)
 			HFI1_CAP_UGET_MASK(uctxt->flags, MASK) |
 			HFI1_CAP_KGET_MASK(uctxt->flags, K2U);
 	/* adjust flag if this fd is not able to cache */
-	if (!fd->handler)
+	if (!fd->use_mn)
 		cinfo.runtime_flags |= HFI1_CAP_TID_UNMAP; /* no caching */
 
 	cinfo.num_active = hfi1_count_active_units();
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index fa45350a9a1d..fc10d65fc3e1 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -1444,7 +1444,7 @@ struct hfi1_filedata {
 	/* for cpu affinity; -1 if none */
 	int rec_cpu_num;
 	u32 tid_n_pinned;
-	struct mmu_rb_handler *handler;
+	bool use_mn;
 	struct tid_rb_node **entry_to_rb;
 	spinlock_t tid_lock; /* protect tid_[limit,used] counters */
 	u32 tid_limit;
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
index 3592a9ec155e..f05742ac0949 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
@@ -59,11 +59,11 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
 			      struct tid_user_buf *tbuf,
 			      u32 rcventry, struct tid_group *grp,
 			      u16 pageidx, unsigned int npages);
-static int tid_rb_insert(void *arg, struct mmu_rb_node *node);
 static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
 				    struct tid_rb_node *tnode);
-static void tid_rb_remove(void *arg, struct mmu_rb_node *node);
-static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode);
+static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
+			      const struct mmu_notifier_range *range,
+			      unsigned long cur_seq);
 static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
 			    struct tid_group *grp,
 			    unsigned int start, u16 count,
@@ -73,10 +73,8 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
 			      struct tid_group **grp);
 static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
 
-static struct mmu_rb_ops tid_rb_ops = {
-	.insert = tid_rb_insert,
-	.remove = tid_rb_remove,
-	.invalidate = tid_rb_invalidate
+static const struct mmu_interval_notifier_ops tid_mn_ops = {
+	.invalidate = tid_rb_invalidate,
 };
 
 /*
@@ -87,7 +85,6 @@ static struct mmu_rb_ops tid_rb_ops = {
 int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
 			   struct hfi1_ctxtdata *uctxt)
 {
-	struct hfi1_devdata *dd = uctxt->dd;
 	int ret = 0;
 
 	spin_lock_init(&fd->tid_lock);
@@ -109,20 +106,7 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
 			fd->entry_to_rb = NULL;
 			return -ENOMEM;
 		}
-
-		/*
-		 * Register MMU notifier callbacks. If the registration
-		 * fails, continue without TID caching for this context.
-		 */
-		ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops,
-					   dd->pport->hfi1_wq,
-					   &fd->handler);
-		if (ret) {
-			dd_dev_info(dd,
-				    "Failed MMU notifier registration %d\n",
-				    ret);
-			ret = 0;
-		}
+		fd->use_mn = true;
 	}
 
 	/*
@@ -139,7 +123,7 @@ int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
 	 * init.
 	 */
 	spin_lock(&fd->tid_lock);
-	if (uctxt->subctxt_cnt && fd->handler) {
+	if (uctxt->subctxt_cnt && fd->use_mn) {
 		u16 remainder;
 
 		fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
@@ -158,18 +142,10 @@ void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
 {
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 
-	/*
-	 * The notifier would have been removed when the process'es mm
-	 * was freed.
-	 */
-	if (fd->handler) {
-		hfi1_mmu_rb_unregister(fd->handler);
-	} else {
-		if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
-			unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
-		if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
-			unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
-	}
+	if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
+		unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
+	if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
+		unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
 
 	kfree(fd->invalid_tids);
 	fd->invalid_tids = NULL;
@@ -201,7 +177,7 @@ static void unpin_rcv_pages(struct hfi1_filedata *fd,
 
 	if (mapped) {
 		pci_unmap_single(dd->pcidev, node->dma_addr,
-				 node->mmu.len, PCI_DMA_FROMDEVICE);
+				 node->npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
 		pages = &node->pages[idx];
 	} else {
 		pages = &tidbuf->pages[idx];
@@ -777,8 +753,7 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
 		return -EFAULT;
 	}
 
-	node->mmu.addr = tbuf->vaddr + (pageidx * PAGE_SIZE);
-	node->mmu.len = npages * PAGE_SIZE;
+	node->fdata = fd;
 	node->phys = page_to_phys(pages[0]);
 	node->npages = npages;
 	node->rcventry = rcventry;
@@ -787,23 +762,35 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
 	node->freed = false;
 	memcpy(node->pages, pages, sizeof(struct page *) * npages);
 
-	if (!fd->handler)
-		ret = tid_rb_insert(fd, &node->mmu);
-	else
-		ret = hfi1_mmu_rb_insert(fd->handler, &node->mmu);
-
-	if (ret) {
-		hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
-			  node->rcventry, node->mmu.addr, node->phys, ret);
-		pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
-				 PCI_DMA_FROMDEVICE);
-		kfree(node);
-		return -EFAULT;
+	if (fd->use_mn) {
+		ret = mmu_interval_notifier_insert(
+			&node->notifier, fd->mm,
+			tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
+			&tid_mn_ops);
+		if (ret)
+			goto out_unmap;
+		/*
+		 * FIXME: This is in the wrong order, the notifier should be
+		 * established before the pages are pinned by pin_rcv_pages.
+		 */
+		mmu_interval_read_begin(&node->notifier);
 	}
+	fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;
 
 	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
 	trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
-			       node->mmu.addr, node->phys, phys);
+			       node->notifier.interval_tree.start, node->phys,
+			       phys);
 	return 0;
+
+out_unmap:
+	hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
+		  node->rcventry, node->notifier.interval_tree.start,
+		  node->phys, ret);
+	pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
+			 PCI_DMA_FROMDEVICE);
+	kfree(node);
+	return -EFAULT;
 }
 
 static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
@@ -833,10 +820,9 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
 	if (grp)
 		*grp = node->grp;
 
-	if (!fd->handler)
-		cacheless_tid_rb_remove(fd, node);
-	else
-		hfi1_mmu_rb_remove(fd->handler, &node->mmu);
+	if (fd->use_mn)
+		mmu_interval_notifier_remove(&node->notifier);
+	cacheless_tid_rb_remove(fd, node);
 
 	return 0;
 }
@@ -847,7 +833,8 @@ static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
 	struct hfi1_devdata *dd = uctxt->dd;
 
 	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
-				 node->npages, node->mmu.addr, node->phys,
+				 node->npages,
+				 node->notifier.interval_tree.start, node->phys,
 				 node->dma_addr);
 
 	/*
@@ -894,30 +881,29 @@ static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
 				if (!node || node->rcventry != rcventry)
 					continue;
 
+				if (fd->use_mn)
+					mmu_interval_notifier_remove(
+						&node->notifier);
 				cacheless_tid_rb_remove(fd, node);
 			}
 		}
 	}
 }
 
-/*
- * Always return 0 from this function. A non-zero return indicates that the
- * remove operation will be called and that memory should be unpinned.
- * However, the driver cannot unpin out from under PSM. Instead, retain the
- * memory (by returning 0) and inform PSM that the memory is going away. PSM
- * will call back later when it has removed the memory from its list.
- */
-static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
+static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
+			      const struct mmu_notifier_range *range,
+			      unsigned long cur_seq)
 {
-	struct hfi1_filedata *fdata = arg;
-	struct hfi1_ctxtdata *uctxt = fdata->uctxt;
 	struct tid_rb_node *node =
-		container_of(mnode, struct tid_rb_node, mmu);
+		container_of(mni, struct tid_rb_node, notifier);
+	struct hfi1_filedata *fdata = node->fdata;
+	struct hfi1_ctxtdata *uctxt = fdata->uctxt;
 
 	if (node->freed)
-		return 0;
+		return true;
 
-	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->mmu.addr,
+	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
+				 node->notifier.interval_tree.start,
 				 node->rcventry, node->npages, node->dma_addr);
 	node->freed = true;
 
@@ -946,18 +932,7 @@ static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
 		fdata->invalid_tid_idx++;
 	}
 	spin_unlock(&fdata->invalid_lock);
-	return 0;
-}
-
-static int tid_rb_insert(void *arg, struct mmu_rb_node *node)
-{
-	struct hfi1_filedata *fdata = arg;
-	struct tid_rb_node *tnode =
-		container_of(node, struct tid_rb_node, mmu);
-	u32 base = fdata->uctxt->expected_base;
-
-	fdata->entry_to_rb[tnode->rcventry - base] = tnode;
-	return 0;
+	return true;
 }
 
 static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
@@ -968,12 +943,3 @@ static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
 	fdata->entry_to_rb[tnode->rcventry - base] = NULL;
 	clear_tid_node(fdata, tnode);
 }
-
-static void tid_rb_remove(void *arg, struct mmu_rb_node *node)
-{
-	struct hfi1_filedata *fdata = arg;
-	struct tid_rb_node *tnode =
-		container_of(node, struct tid_rb_node, mmu);
-
-	cacheless_tid_rb_remove(fdata, tnode);
-}
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
index 43b105de1d54..6257eee083a1 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
@@ -65,7 +65,8 @@ struct tid_user_buf {
 };
 
 struct tid_rb_node {
-	struct mmu_rb_node mmu;
+	struct mmu_interval_notifier notifier;
+	struct hfi1_filedata *fdata;
 	unsigned long phys;
 	struct tid_group *grp;
 	u32 rcventry;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 03af54b53bf7..5986953ec2fa 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1258,8 +1258,6 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
 void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
-void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
-			      unsigned long end);
 void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
 void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
 			   size_t nentries, struct mlx5_ib_mr *mr, int flags);
@@ -1289,11 +1287,10 @@ mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
 {
 	return -EOPNOTSUPP;
 }
-static inline void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp,
-					    unsigned long start,
-					    unsigned long end){};
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
+extern const struct mmu_interval_notifier_ops mlx5_mn_ops;
+
 /* Needed for rep profile */
 void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
 		      const struct mlx5_ib_profile *profile,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 60d39b9ec41c..ea8bfc3e2d8d 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -749,7 +749,8 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
 	if (access_flags & IB_ACCESS_ON_DEMAND) {
 		struct ib_umem_odp *odp;
 
-		odp = ib_umem_odp_get(udata, start, length, access_flags);
+		odp = ib_umem_odp_get(udata, start, length, access_flags,
+				      &mlx5_mn_ops);
 		if (IS_ERR(odp)) {
 			mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
 				    PTR_ERR(odp));
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 45ee40c2f36e..f924250f80c2 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -241,18 +241,27 @@ out_unlock:
 	xa_unlock(&imr->implicit_children);
 }
 
-void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
-			      unsigned long end)
+static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
+				     const struct mmu_notifier_range *range,
+				     unsigned long cur_seq)
 {
+	struct ib_umem_odp *umem_odp =
+		container_of(mni, struct ib_umem_odp, notifier);
 	struct mlx5_ib_mr *mr;
 	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
 				    sizeof(struct mlx5_mtt)) - 1;
 	u64 idx = 0, blk_start_idx = 0;
 	u64 invalidations = 0;
+	unsigned long start;
+	unsigned long end;
 	int in_block = 0;
 	u64 addr;
 
+	if (!mmu_notifier_range_blockable(range))
+		return false;
+
 	mutex_lock(&umem_odp->umem_mutex);
+	mmu_interval_set_seq(mni, cur_seq);
 	/*
 	 * If npages is zero then umem_odp->private may not be setup yet. This
 	 * does not complete until after the first page is mapped for DMA.
@@ -261,8 +270,8 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 		goto out;
 
 	mr = umem_odp->private;
-	start = max_t(u64, ib_umem_start(umem_odp), start);
-	end = min_t(u64, ib_umem_end(umem_odp), end);
+	start = max_t(u64, ib_umem_start(umem_odp), range->start);
+	end = min_t(u64, ib_umem_end(umem_odp), range->end);
 
 	/*
 	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
@@ -319,8 +328,13 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 		destroy_unused_implicit_child_mr(mr);
 out:
 	mutex_unlock(&umem_odp->umem_mutex);
+	return true;
 }
 
+const struct mmu_interval_notifier_ops mlx5_mn_ops = {
+	.invalidate = mlx5_ib_invalidate_range,
+};
+
 void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 {
 	struct ib_odp_caps *caps = &dev->odp_caps;
@@ -419,7 +433,7 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 
 	odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
 				      idx * MLX5_IMR_MTT_SIZE,
-				      MLX5_IMR_MTT_SIZE);
+				      MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
 	if (IS_ERR(odp))
 		return ERR_CAST(odp);
 
@@ -606,8 +620,9 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
 			     u64 user_va, size_t bcnt, u32 *bytes_mapped,
 			     u32 flags)
 {
-	int current_seq, page_shift, ret, np;
+	int page_shift, ret, np;
 	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
+	unsigned long current_seq;
 	u64 access_mask;
 	u64 start_idx, page_mask;
 
@@ -619,12 +634,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
 	if (odp->umem.writable && !downgrade)
 		access_mask |= ODP_WRITE_ALLOWED_BIT;
 
-	current_seq = READ_ONCE(odp->notifiers_seq);
-	/*
-	 * Ensure the sequence number is valid for some time before we call
-	 * gup.
-	 */
-	smp_rmb();
+	current_seq = mmu_interval_read_begin(&odp->notifier);
 
 	np = ib_umem_odp_map_dma_pages(odp, user_va, bcnt, access_mask,
 				       current_seq);
@@ -632,7 +642,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
 		return np;
 
 	mutex_lock(&odp->umem_mutex);
-	if (!ib_umem_mmu_notifier_retry(odp, current_seq)) {
+	if (!mmu_interval_read_retry(&odp->notifier, current_seq)) {
 		/*
 		 * No need to check whether the MTTs really belong to
 		 * this MR, since ib_umem_odp_map_dma_pages already
@@ -662,19 +672,6 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
 	return np << (page_shift - PAGE_SHIFT);
 
 out:
-	if (ret == -EAGAIN) {
-		unsigned long timeout = msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
-
-		if (!wait_for_completion_timeout(&odp->notifier_completion,
-						 timeout)) {
-			mlx5_ib_warn(
-				mr->dev,
-				"timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
-				current_seq, odp->notifiers_seq,
-				odp->notifiers_count);
-		}
-	}
-
 	return ret;
 }
 
@@ -1622,7 +1619,6 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
 
 static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
 	.advise_mr = mlx5_ib_advise_mr,
-	.invalidate_range = mlx5_ib_invalidate_range,
 };
 
 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
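
For RDMA drivers, the visible API change from the umem_odp rework above is the mmu_interval_notifier_ops argument threaded through ib_umem_odp_get() and ib_umem_odp_alloc_child(). A hedged sketch of a consumer, assuming hypothetical my_umem_invalidate/my_register_odp_mr names (the signatures and the notifier/umem_mutex field names are the ones introduced by this series):

#include <linux/mmu_notifier.h>
#include <rdma/ib_umem_odp.h>

static bool my_umem_invalidate(struct mmu_interval_notifier *mni,
			       const struct mmu_notifier_range *range,
			       unsigned long cur_seq)
{
	struct ib_umem_odp *odp =
		container_of(mni, struct ib_umem_odp, notifier);

	if (!mmu_notifier_range_blockable(range))
		return false;
	mutex_lock(&odp->umem_mutex);
	mmu_interval_set_seq(mni, cur_seq);
	/* ... zap HW translations overlapping range->start..range->end ... */
	mutex_unlock(&odp->umem_mutex);
	return true;
}

static const struct mmu_interval_notifier_ops my_mn_ops = {
	.invalidate = my_umem_invalidate,
};

static int my_register_odp_mr(struct ib_udata *udata, unsigned long start,
			      size_t length, int access_flags)
{
	struct ib_umem_odp *odp;

	/* The notifier ops are now supplied at registration time. */
	odp = ib_umem_odp_get(udata, start, length,
			      IB_ACCESS_ON_DEMAND | access_flags, &my_mn_ops);
	if (IS_ERR(odp))
		return PTR_ERR(odp);

	/*
	 * ... fault pages via ib_umem_odp_map_dma_pages(), bracketed by
	 * mmu_interval_read_begin()/mmu_interval_read_retry() as in
	 * pagefault_real_mr() above ...
	 */

	ib_umem_odp_release(odp);	/* removes the notifier, frees the umem */
	return 0;
}

Compared with the deleted per-mm interval tree and notifier-count/completion machinery, the driver no longer supplies a device-wide invalidate_range op; invalidation is delivered per-umem through the ops it registered.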