diff options
author | David S. Miller | 2017-01-02 15:51:21 -0500 |
---|---|---|
committer | David S. Miller | 2017-01-02 15:51:21 -0500 |
commit | 525dfa2cdce4f5ab76251b5e57ebabf4f2dfc40c (patch) | |
tree | 2e7e1a80aa9aaaf328e1b1c64821bc76e27a487d /drivers/infiniband | |
parent | 85eb018fec29eae60d20f6d04af854308ffb3a05 (diff) | |
parent | aa8e08d2f523501c40b0e70f1c4ecacb97195931 (diff) |
Merge branch 'mlx5-odp'
Saeed Mahameed says:
====================
Mellanox mlx5 core and ODP updates 2017-01-01
The following eleven patches mainly come from Artemy Kovalyov
who expanded mlx5 on-demand-paging (ODP) support. In addition
there are three cleanup patches which don't change any functionality,
but are needed to align codebase prior accepting other patches.
Memory region (MR) in IB can be huge and ODP (on-demand paging)
technique allows to use unpinned memory, which can be consumed and
released on demand. This allows to applications do not pin down
the underlying physical pages of the address space, and save from them
need to track the validity of the mappings.
Rather, the HCA requests the latest translations from the OS when pages
are not present, and the OS invalidates translations which are no longer
valid due to either non-present pages or mapping changes.
In existing ODP implementation applications is needed to register
memory buffers for communication, though registered memory regions
need not have valid mappings at registration time.
This patch set performs the following steps to expand
current ODP implementation:
1. It refactors UMR to support large regions, by introducing generic
function to perform HCA translation table modifications. This
function supports both atomic and process contexts and is not limited
by number of modified entries.
This function allows to enable reallocated memory regions of
arbitrary size, so adding MR cache buckets to support up to 16GB MRs.
2. It changes page fault event format and refactor page faults logic
together with addition of atomic support.
3. It prepares mlx5 core code to support implicit registration with
simplified and relaxed semantics.
Implicit ODP semantics allows to applications provide special memory
key that represents their complete address space. Thus all IO accesses
referencing to this key (with proper access rights associated with the key)
wouldn't need not register any virtual address range.
Thanks,
Artemy, Ilya and Leon
v1->v2:
- Don't use 'inline' in .c files
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/infiniband')
-rw-r--r-- | drivers/infiniband/hw/mlx5/main.c | 50 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mem.c | 32 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mlx5_ib.h | 89 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mr.c | 518 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/odp.c | 424 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/qp.c | 154 |
6 files changed, 518 insertions, 749 deletions
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index d566f6738833..86c61e73780e 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -672,17 +672,6 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, 1 << MLX5_CAP_GEN(dev->mdev, log_max_rq); } - if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes, - uhw->outlen)) { - resp.mlx5_ib_support_multi_pkt_send_wqes = - MLX5_CAP_ETH(mdev, multi_pkt_send_wqe); - resp.response_length += - sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes); - } - - if (field_avail(typeof(resp), reserved, uhw->outlen)) - resp.response_length += sizeof(resp.reserved); - if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) { resp.cqe_comp_caps.max_num = MLX5_CAP_GEN(dev->mdev, cqe_compression) ? @@ -706,6 +695,17 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, resp.response_length += sizeof(resp.packet_pacing_caps); } + if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes, + uhw->outlen)) { + resp.mlx5_ib_support_multi_pkt_send_wqes = + MLX5_CAP_ETH(mdev, multi_pkt_send_wqe); + resp.response_length += + sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes); + } + + if (field_avail(typeof(resp), reserved, uhw->outlen)) + resp.response_length += sizeof(resp.reserved); + if (uhw->outlen) { err = ib_copy_to_udata(uhw, &resp, resp.response_length); @@ -1112,11 +1112,18 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; #endif + context->upd_xlt_page = __get_free_page(GFP_KERNEL); + if (!context->upd_xlt_page) { + err = -ENOMEM; + goto out_uars; + } + mutex_init(&context->upd_xlt_page_mutex); + if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) { err = mlx5_core_alloc_transport_domain(dev->mdev, &context->tdn); if (err) - goto out_uars; + goto out_page; } INIT_LIST_HEAD(&context->vma_private_list); @@ -1168,6 +1175,9 @@ out_td: if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn); +out_page: + free_page(context->upd_xlt_page); + out_uars: for (i--; i >= 0; i--) mlx5_cmd_free_uar(dev->mdev, uars[i].index); @@ -1195,6 +1205,8 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn); + free_page(context->upd_xlt_page); + for (i = 0; i < uuari->num_uars; i++) { if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index)) mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index); @@ -3307,6 +3319,9 @@ static struct mlx5_interface mlx5_ib_interface = { .add = mlx5_ib_add, .remove = mlx5_ib_remove, .event = mlx5_ib_event, +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + .pfault = mlx5_ib_pfault, +#endif .protocol = MLX5_INTERFACE_PROTOCOL_IB, }; @@ -3317,25 +3332,14 @@ static int __init mlx5_ib_init(void) if (deprecated_prof_sel != 2) pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); - err = mlx5_ib_odp_init(); - if (err) - return err; - err = mlx5_register_interface(&mlx5_ib_interface); - if (err) - goto clean_odp; - - return err; -clean_odp: - mlx5_ib_odp_cleanup(); return err; } static void __exit mlx5_ib_cleanup(void) { mlx5_unregister_interface(&mlx5_ib_interface); - mlx5_ib_odp_cleanup(); } module_init(mlx5_ib_init); diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 6851357c16f4..778d8a18925f 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -159,7 +159,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, unsigned long umem_page_shift = ilog2(umem->page_size); int shift = page_shift - umem_page_shift; int mask = (1 << shift) - 1; - int i, k; + int i, k, idx; u64 cur = 0; u64 base; int len; @@ -185,18 +185,36 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { len = sg_dma_len(sg) >> umem_page_shift; base = sg_dma_address(sg); - for (k = 0; k < len; k++) { + + /* Skip elements below offset */ + if (i + len < offset << shift) { + i += len; + continue; + } + + /* Skip pages below offset */ + if (i < offset << shift) { + k = (offset << shift) - i; + i = offset << shift; + } else { + k = 0; + } + + for (; k < len; k++) { if (!(i & mask)) { cur = base + (k << umem_page_shift); cur |= access_flags; + idx = (i >> shift) - offset; - pas[i >> shift] = cpu_to_be64(cur); + pas[idx] = cpu_to_be64(cur); mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", - i >> shift, be64_to_cpu(pas[i >> shift])); - } else - mlx5_ib_dbg(dev, "=====> 0x%llx\n", - base + (k << umem_page_shift)); + i >> shift, be64_to_cpu(pas[idx])); + } i++; + + /* Stop after num_pages reached */ + if (i >> shift >= offset + num_pages) + return; } } } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 6c6057eb60ea..a51c8051aeb2 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -125,6 +125,10 @@ struct mlx5_ib_ucontext { /* Transport Domain number */ u32 tdn; struct list_head vma_private_list; + + unsigned long upd_xlt_page; + /* protect ODP/KSM */ + struct mutex upd_xlt_page_mutex; }; static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) @@ -174,13 +178,12 @@ struct mlx5_ib_flow_db { * enum ib_send_flags and enum ib_qp_type for low-level driver */ -#define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START -#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1) -#define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2) - -#define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (IB_SEND_RESERVED_START << 3) -#define MLX5_IB_SEND_UMR_UPDATE_PD (IB_SEND_RESERVED_START << 4) -#define MLX5_IB_SEND_UMR_UPDATE_ACCESS IB_SEND_RESERVED_END +#define MLX5_IB_SEND_UMR_ENABLE_MR (IB_SEND_RESERVED_START << 0) +#define MLX5_IB_SEND_UMR_DISABLE_MR (IB_SEND_RESERVED_START << 1) +#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 2) +#define MLX5_IB_SEND_UMR_UPDATE_XLT (IB_SEND_RESERVED_START << 3) +#define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (IB_SEND_RESERVED_START << 4) +#define MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS IB_SEND_RESERVED_END #define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 /* @@ -190,6 +193,16 @@ struct mlx5_ib_flow_db { #define MLX5_IB_QPT_HW_GSI IB_QPT_RESERVED2 #define MLX5_IB_WR_UMR IB_WR_RESERVED1 +#define MLX5_IB_UMR_OCTOWORD 16 +#define MLX5_IB_UMR_XLT_ALIGNMENT 64 + +#define MLX5_IB_UPD_XLT_ZAP BIT(0) +#define MLX5_IB_UPD_XLT_ENABLE BIT(1) +#define MLX5_IB_UPD_XLT_ATOMIC BIT(2) +#define MLX5_IB_UPD_XLT_ADDR BIT(3) +#define MLX5_IB_UPD_XLT_PD BIT(4) +#define MLX5_IB_UPD_XLT_ACCESS BIT(5) + /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. * * These flags are intended for internal use by the mlx5_ib driver, and they @@ -264,29 +277,6 @@ struct mlx5_ib_rwq_ind_table { u32 rqtn; }; -/* - * Connect-IB can trigger up to four concurrent pagefaults - * per-QP. - */ -enum mlx5_ib_pagefault_context { - MLX5_IB_PAGEFAULT_RESPONDER_READ, - MLX5_IB_PAGEFAULT_REQUESTOR_READ, - MLX5_IB_PAGEFAULT_RESPONDER_WRITE, - MLX5_IB_PAGEFAULT_REQUESTOR_WRITE, - MLX5_IB_PAGEFAULT_CONTEXTS -}; - -static inline enum mlx5_ib_pagefault_context - mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault) -{ - return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE); -} - -struct mlx5_ib_pfault { - struct work_struct work; - struct mlx5_pagefault mpfault; -}; - struct mlx5_ib_ubuffer { struct ib_umem *umem; int buf_size; @@ -372,20 +362,6 @@ struct mlx5_ib_qp { /* Store signature errors */ bool signature_en; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - /* - * A flag that is true for QP's that are in a state that doesn't - * allow page faults, and shouldn't schedule any more faults. - */ - int disable_page_faults; - /* - * The disable_page_faults_lock protects a QP's disable_page_faults - * field, allowing for a thread to atomically check whether the QP - * allows page faults, and if so schedule a page fault. - */ - spinlock_t disable_page_faults_lock; - struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS]; -#endif struct list_head qps_list; struct list_head cq_recv_list; struct list_head cq_send_list; @@ -414,13 +390,11 @@ enum mlx5_ib_qp_flags { struct mlx5_umr_wr { struct ib_send_wr wr; - union { - u64 virt_addr; - u64 offset; - } target; + u64 virt_addr; + u64 offset; struct ib_pd *pd; unsigned int page_shift; - unsigned int npages; + unsigned int xlt_size; u64 length; int access_flags; u32 mkey; @@ -634,6 +608,7 @@ struct mlx5_ib_dev { int fill_delay; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct ib_odp_caps odp_caps; + u64 odp_max_size; /* * Sleepable RCU that prevents destruction of MRs while they are still * being used by a page fault handler. @@ -787,8 +762,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, struct ib_udata *udata); int mlx5_ib_dealloc_mw(struct ib_mw *mw); -int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, - int npages, int zap); +int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, + int page_shift, int flags); int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_pd *pd, struct ib_udata *udata); @@ -857,18 +832,13 @@ struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -extern struct workqueue_struct *mlx5_ib_page_fault_wq; - void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev); -void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, - struct mlx5_ib_pfault *pfault); -void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp); +void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, + struct mlx5_pagefault *pfault); int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev); int __init mlx5_ib_odp_init(void); void mlx5_ib_odp_cleanup(void); -void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp); -void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp); void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, unsigned long end); #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ @@ -877,13 +847,10 @@ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) return; } -static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {} static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; } static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {} static inline int mlx5_ib_odp_init(void) { return 0; } static inline void mlx5_ib_odp_cleanup(void) {} -static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {} -static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 8f608debe141..8cf2a67f9fb0 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -46,14 +46,9 @@ enum { }; #define MLX5_UMR_ALIGN 2048 -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -static __be64 mlx5_ib_update_mtt_emergency_buffer[ - MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)] - __aligned(MLX5_UMR_ALIGN); -static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex); -#endif static int clean_mr(struct mlx5_ib_mr *mr); +static int use_umr(struct mlx5_ib_dev *dev, int order); static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { @@ -134,6 +129,7 @@ static void reg_mr_callback(int status, void *context) return; } + mr->mmkey.type = MLX5_MKEY_MR; spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags); key = dev->mdev->priv.mkey_key++; spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags); @@ -629,7 +625,8 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) ent->dev = dev; if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) && - (mlx5_core_is_pf(dev->mdev))) + mlx5_core_is_pf(dev->mdev) && + use_umr(dev, ent->order)) limit = dev->mdev->profile->mr_cache[i].limit; else limit = 0; @@ -732,6 +729,7 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) goto err_in; kfree(in); + mr->mmkey.type = MLX5_MKEY_MR; mr->ibmr.lkey = mr->mmkey.key; mr->ibmr.rkey = mr->mmkey.key; mr->umem = NULL; @@ -757,94 +755,13 @@ static int get_octo_len(u64 addr, u64 len, int page_size) return (npages + 1) / 2; } -static int use_umr(int order) +static int use_umr(struct mlx5_ib_dev *dev, int order) { + if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) + return order < MAX_MR_CACHE_ENTRIES + 2; return order <= MLX5_MAX_UMR_SHIFT; } -static int dma_map_mr_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, - int npages, int page_shift, int *size, - __be64 **mr_pas, dma_addr_t *dma) -{ - __be64 *pas; - struct device *ddev = dev->ib_dev.dma_device; - - /* - * UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes. - * To avoid copying garbage after the pas array, we allocate - * a little more. - */ - *size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT); - *mr_pas = kmalloc(*size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); - if (!(*mr_pas)) - return -ENOMEM; - - pas = PTR_ALIGN(*mr_pas, MLX5_UMR_ALIGN); - mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT); - /* Clear padding after the actual pages. */ - memset(pas + npages, 0, *size - npages * sizeof(u64)); - - *dma = dma_map_single(ddev, pas, *size, DMA_TO_DEVICE); - if (dma_mapping_error(ddev, *dma)) { - kfree(*mr_pas); - return -ENOMEM; - } - - return 0; -} - -static void prep_umr_wqe_common(struct ib_pd *pd, struct ib_send_wr *wr, - struct ib_sge *sg, u64 dma, int n, u32 key, - int page_shift) -{ - struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct mlx5_umr_wr *umrwr = umr_wr(wr); - - sg->addr = dma; - sg->length = ALIGN(sizeof(u64) * n, 64); - sg->lkey = dev->umrc.pd->local_dma_lkey; - - wr->next = NULL; - wr->sg_list = sg; - if (n) - wr->num_sge = 1; - else - wr->num_sge = 0; - - wr->opcode = MLX5_IB_WR_UMR; - - umrwr->npages = n; - umrwr->page_shift = page_shift; - umrwr->mkey = key; -} - -static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, - struct ib_sge *sg, u64 dma, int n, u32 key, - int page_shift, u64 virt_addr, u64 len, - int access_flags) -{ - struct mlx5_umr_wr *umrwr = umr_wr(wr); - - prep_umr_wqe_common(pd, wr, sg, dma, n, key, page_shift); - - wr->send_flags = 0; - - umrwr->target.virt_addr = virt_addr; - umrwr->length = len; - umrwr->access_flags = access_flags; - umrwr->pd = pd; -} - -static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, - struct ib_send_wr *wr, u32 key) -{ - struct mlx5_umr_wr *umrwr = umr_wr(wr); - - wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE; - wr->opcode = MLX5_IB_WR_UMR; - umrwr->mkey = key; -} - static int mr_umem_get(struct ib_pd *pd, u64 start, u64 length, int access_flags, struct ib_umem **umem, int *npages, int *page_shift, int *ncont, @@ -891,21 +808,39 @@ static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) init_completion(&context->done); } +static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev, + struct mlx5_umr_wr *umrwr) +{ + struct umr_common *umrc = &dev->umrc; + struct ib_send_wr *bad; + int err; + struct mlx5_ib_umr_context umr_context; + + mlx5_ib_init_umr_context(&umr_context); + umrwr->wr.wr_cqe = &umr_context.cqe; + + down(&umrc->sem); + err = ib_post_send(umrc->qp, &umrwr->wr, &bad); + if (err) { + mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err); + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed (%u)\n", + umr_context.status); + err = -EFAULT; + } + } + up(&umrc->sem); + return err; +} + static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, u64 virt_addr, u64 len, int npages, int page_shift, int order, int access_flags) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct device *ddev = dev->ib_dev.dma_device; - struct umr_common *umrc = &dev->umrc; - struct mlx5_ib_umr_context umr_context; - struct mlx5_umr_wr umrwr = {}; - struct ib_send_wr *bad; struct mlx5_ib_mr *mr; - struct ib_sge sg; - int size; - __be64 *mr_pas; - dma_addr_t dma; int err = 0; int i; @@ -924,173 +859,174 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, if (!mr) return ERR_PTR(-EAGAIN); - err = dma_map_mr_pas(dev, umem, npages, page_shift, &size, &mr_pas, - &dma); - if (err) - goto free_mr; - - mlx5_ib_init_umr_context(&umr_context); - - umrwr.wr.wr_cqe = &umr_context.cqe; - prep_umr_reg_wqe(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key, - page_shift, virt_addr, len, access_flags); - - down(&umrc->sem); - err = ib_post_send(umrc->qp, &umrwr.wr, &bad); - if (err) { - mlx5_ib_warn(dev, "post send failed, err %d\n", err); - goto unmap_dma; - } else { - wait_for_completion(&umr_context.done); - if (umr_context.status != IB_WC_SUCCESS) { - mlx5_ib_warn(dev, "reg umr failed\n"); - err = -EFAULT; - } - } - + mr->ibmr.pd = pd; + mr->umem = umem; + mr->access_flags = access_flags; + mr->desc_size = sizeof(struct mlx5_mtt); mr->mmkey.iova = virt_addr; mr->mmkey.size = len; mr->mmkey.pd = to_mpd(pd)->pdn; - mr->live = 1; + err = mlx5_ib_update_xlt(mr, 0, npages, page_shift, + MLX5_IB_UPD_XLT_ENABLE); -unmap_dma: - up(&umrc->sem); - dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); - - kfree(mr_pas); - -free_mr: if (err) { free_cached_mr(dev, mr); return ERR_PTR(err); } + mr->live = 1; + return mr; } -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, - int zap) +static inline int populate_xlt(struct mlx5_ib_mr *mr, int idx, int npages, + void *xlt, int page_shift, size_t size, + int flags) { struct mlx5_ib_dev *dev = mr->dev; - struct device *ddev = dev->ib_dev.dma_device; - struct umr_common *umrc = &dev->umrc; - struct mlx5_ib_umr_context umr_context; struct ib_umem *umem = mr->umem; + + npages = min_t(size_t, npages, ib_umem_num_pages(umem) - idx); + + if (!(flags & MLX5_IB_UPD_XLT_ZAP)) { + __mlx5_ib_populate_pas(dev, umem, page_shift, + idx, npages, xlt, + MLX5_IB_MTT_PRESENT); + /* Clear padding after the pages + * brought from the umem. + */ + memset(xlt + (npages * sizeof(struct mlx5_mtt)), 0, + size - npages * sizeof(struct mlx5_mtt)); + } + + return npages; +} + +#define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \ + MLX5_UMR_MTT_ALIGNMENT) +#define MLX5_SPARE_UMR_CHUNK 0x10000 + +int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, + int page_shift, int flags) +{ + struct mlx5_ib_dev *dev = mr->dev; + struct device *ddev = dev->ib_dev.dma_device; + struct mlx5_ib_ucontext *uctx = NULL; int size; - __be64 *pas; + void *xlt; dma_addr_t dma; - struct ib_send_wr *bad; struct mlx5_umr_wr wr; struct ib_sge sg; int err = 0; - const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64); - const int page_index_mask = page_index_alignment - 1; + int desc_size = sizeof(struct mlx5_mtt); + const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size; + const int page_mask = page_align - 1; size_t pages_mapped = 0; size_t pages_to_map = 0; size_t pages_iter = 0; - int use_emergency_buf = 0; + gfp_t gfp; /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, - * so we need to align the offset and length accordingly */ - if (start_page_index & page_index_mask) { - npages += start_page_index & page_index_mask; - start_page_index &= ~page_index_mask; + * so we need to align the offset and length accordingly + */ + if (idx & page_mask) { + npages += idx & page_mask; + idx &= ~page_mask; } - pages_to_map = ALIGN(npages, page_index_alignment); + gfp = flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC : GFP_KERNEL; + gfp |= __GFP_ZERO | __GFP_NOWARN; - if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES) - return -EINVAL; + pages_to_map = ALIGN(npages, page_align); + size = desc_size * pages_to_map; + size = min_t(int, size, MLX5_MAX_UMR_CHUNK); + + xlt = (void *)__get_free_pages(gfp, get_order(size)); + if (!xlt && size > MLX5_SPARE_UMR_CHUNK) { + mlx5_ib_dbg(dev, "Failed to allocate %d bytes of order %d. fallback to spare UMR allocation od %d bytes\n", + size, get_order(size), MLX5_SPARE_UMR_CHUNK); - size = sizeof(u64) * pages_to_map; - size = min_t(int, PAGE_SIZE, size); - /* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim - * code, when we are called from an invalidation. The pas buffer must - * be 2k-aligned for Connect-IB. */ - pas = (__be64 *)get_zeroed_page(GFP_ATOMIC); - if (!pas) { - mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n"); - pas = mlx5_ib_update_mtt_emergency_buffer; - size = MLX5_UMR_MTT_MIN_CHUNK_SIZE; - use_emergency_buf = 1; - mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex); - memset(pas, 0, size); + size = MLX5_SPARE_UMR_CHUNK; + xlt = (void *)__get_free_pages(gfp, get_order(size)); } - pages_iter = size / sizeof(u64); - dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); + + if (!xlt) { + uctx = to_mucontext(mr->ibmr.uobject->context); + mlx5_ib_warn(dev, "Using XLT emergency buffer\n"); + size = PAGE_SIZE; + xlt = (void *)uctx->upd_xlt_page; + mutex_lock(&uctx->upd_xlt_page_mutex); + memset(xlt, 0, size); + } + pages_iter = size / desc_size; + dma = dma_map_single(ddev, xlt, size, DMA_TO_DEVICE); if (dma_mapping_error(ddev, dma)) { - mlx5_ib_err(dev, "unable to map DMA during MTT update.\n"); + mlx5_ib_err(dev, "unable to map DMA during XLT update.\n"); err = -ENOMEM; - goto free_pas; + goto free_xlt; } + sg.addr = dma; + sg.lkey = dev->umrc.pd->local_dma_lkey; + + memset(&wr, 0, sizeof(wr)); + wr.wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT; + if (!(flags & MLX5_IB_UPD_XLT_ENABLE)) + wr.wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE; + wr.wr.sg_list = &sg; + wr.wr.num_sge = 1; + wr.wr.opcode = MLX5_IB_WR_UMR; + + wr.pd = mr->ibmr.pd; + wr.mkey = mr->mmkey.key; + wr.length = mr->mmkey.size; + wr.virt_addr = mr->mmkey.iova; + wr.access_flags = mr->access_flags; + wr.page_shift = page_shift; + for (pages_mapped = 0; pages_mapped < pages_to_map && !err; - pages_mapped += pages_iter, start_page_index += pages_iter) { + pages_mapped += pages_iter, idx += pages_iter) { dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE); - - npages = min_t(size_t, - pages_iter, - ib_umem_num_pages(umem) - start_page_index); - - if (!zap) { - __mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT, - start_page_index, npages, pas, - MLX5_IB_MTT_PRESENT); - /* Clear padding after the pages brought from the - * umem. */ - memset(pas + npages, 0, size - npages * sizeof(u64)); - } + npages = populate_xlt(mr, idx, pages_iter, xlt, + page_shift, size, flags); dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE); - mlx5_ib_init_umr_context(&umr_context); - - memset(&wr, 0, sizeof(wr)); - wr.wr.wr_cqe = &umr_context.cqe; - - sg.addr = dma; - sg.length = ALIGN(npages * sizeof(u64), - MLX5_UMR_MTT_ALIGNMENT); - sg.lkey = dev->umrc.pd->local_dma_lkey; + sg.length = ALIGN(npages * desc_size, + MLX5_UMR_MTT_ALIGNMENT); + + if (pages_mapped + pages_iter >= pages_to_map) { + if (flags & MLX5_IB_UPD_XLT_ENABLE) + wr.wr.send_flags |= + MLX5_IB_SEND_UMR_ENABLE_MR | + MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS | + MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; + if (flags & MLX5_IB_UPD_XLT_PD || + flags & MLX5_IB_UPD_XLT_ACCESS) + wr.wr.send_flags |= + MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; + if (flags & MLX5_IB_UPD_XLT_ADDR) + wr.wr.send_flags |= + MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; + } - wr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE | - MLX5_IB_SEND_UMR_UPDATE_MTT; - wr.wr.sg_list = &sg; - wr.wr.num_sge = 1; - wr.wr.opcode = MLX5_IB_WR_UMR; - wr.npages = sg.length / sizeof(u64); - wr.page_shift = PAGE_SHIFT; - wr.mkey = mr->mmkey.key; - wr.target.offset = start_page_index; + wr.offset = idx * desc_size; + wr.xlt_size = sg.length; - down(&umrc->sem); - err = ib_post_send(umrc->qp, &wr.wr, &bad); - if (err) { - mlx5_ib_err(dev, "UMR post send failed, err %d\n", err); - } else { - wait_for_completion(&umr_context.done); - if (umr_context.status != IB_WC_SUCCESS) { - mlx5_ib_err(dev, "UMR completion failed, code %d\n", - umr_context.status); - err = -EFAULT; - } - } - up(&umrc->sem); + err = mlx5_ib_post_send_wait(dev, &wr); } dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); -free_pas: - if (!use_emergency_buf) - free_page((unsigned long)pas); +free_xlt: + if (uctx) + mutex_unlock(&uctx->upd_xlt_page_mutex); else - mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex); + free_pages((unsigned long)xlt, get_order(size)); return err; } -#endif /* * If ibmr is NULL it will be allocated by reg_create. @@ -1122,8 +1058,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, goto err_1; } pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); - mlx5_ib_populate_pas(dev, umem, page_shift, pas, - pg_cap ? MLX5_IB_MTT_PRESENT : 0); + if (!(access_flags & IB_ACCESS_ON_DEMAND)) + mlx5_ib_populate_pas(dev, umem, page_shift, pas, + pg_cap ? MLX5_IB_MTT_PRESENT : 0); /* The pg_access bit allows setting the access flags * in the page list submitted with the command. */ @@ -1153,6 +1090,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, mlx5_ib_warn(dev, "create mkey failed\n"); goto err_2; } + mr->mmkey.type = MLX5_MKEY_MR; mr->umem = umem; mr->dev = dev; mr->live = 1; @@ -1204,14 +1142,15 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (err < 0) return ERR_PTR(err); - if (use_umr(order)) { + if (use_umr(dev, order)) { mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift, order, access_flags); if (PTR_ERR(mr) == -EAGAIN) { mlx5_ib_dbg(dev, "cache empty for order %d", order); mr = NULL; } - } else if (access_flags & IB_ACCESS_ON_DEMAND) { + } else if (access_flags & IB_ACCESS_ON_DEMAND && + !MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) { err = -EINVAL; pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB"); goto error; @@ -1248,106 +1187,39 @@ error: static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { struct mlx5_core_dev *mdev = dev->mdev; - struct umr_common *umrc = &dev->umrc; - struct mlx5_ib_umr_context umr_context; struct mlx5_umr_wr umrwr = {}; - struct ib_send_wr *bad; - int err; if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) return 0; - mlx5_ib_init_umr_context(&umr_context); - - umrwr.wr.wr_cqe = &umr_context.cqe; - prep_umr_unreg_wqe(dev, &umrwr.wr, mr->mmkey.key); + umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR | + MLX5_IB_SEND_UMR_FAIL_IF_FREE; + umrwr.wr.opcode = MLX5_IB_WR_UMR; + umrwr.mkey = mr->mmkey.key; - down(&umrc->sem); - err = ib_post_send(umrc->qp, &umrwr.wr, &bad); - if (err) { - up(&umrc->sem); - mlx5_ib_dbg(dev, "err %d\n", err); - goto error; - } else { - wait_for_completion(&umr_context.done); - up(&umrc->sem); - } - if (umr_context.status != IB_WC_SUCCESS) { - mlx5_ib_warn(dev, "unreg umr failed\n"); - err = -EFAULT; - goto error; - } - return 0; - -error: - return err; + return mlx5_ib_post_send_wait(dev, &umrwr); } -static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr, - u64 length, int npages, int page_shift, int order, +static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, int access_flags, int flags) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct device *ddev = dev->ib_dev.dma_device; - struct mlx5_ib_umr_context umr_context; - struct ib_send_wr *bad; struct mlx5_umr_wr umrwr = {}; - struct ib_sge sg; - struct umr_common *umrc = &dev->umrc; - dma_addr_t dma = 0; - __be64 *mr_pas = NULL; - int size; int err; - mlx5_ib_init_umr_context(&umr_context); - - umrwr.wr.wr_cqe = &umr_context.cqe; umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE; - if (flags & IB_MR_REREG_TRANS) { - err = dma_map_mr_pas(dev, mr->umem, npages, page_shift, &size, - &mr_pas, &dma); - if (err) - return err; + umrwr.wr.opcode = MLX5_IB_WR_UMR; + umrwr.mkey = mr->mmkey.key; - umrwr.target.virt_addr = virt_addr; - umrwr.length = length; - umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; - } - - prep_umr_wqe_common(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key, - page_shift); - - if (flags & IB_MR_REREG_PD) { + if (flags & IB_MR_REREG_PD || flags & IB_MR_REREG_ACCESS) { umrwr.pd = pd; - umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD; - } - - if (flags & IB_MR_REREG_ACCESS) { umrwr.access_flags = access_flags; - umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_ACCESS; + umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; } - /* post send request to UMR QP */ - down(&umrc->sem); - err = ib_post_send(umrc->qp, &umrwr.wr, &bad); - - if (err) { - mlx5_ib_warn(dev, "post send failed, err %d\n", err); - } else { - wait_for_completion(&umr_context.done); - if (umr_context.status != IB_WC_SUCCESS) { - mlx5_ib_warn(dev, "reg umr failed (%u)\n", - umr_context.status); - err = -EFAULT; - } - } + err = mlx5_ib_post_send_wait(dev, &umrwr); - up(&umrc->sem); - if (flags & IB_MR_REREG_TRANS) { - dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); - kfree(mr_pas); - } return err; } @@ -1364,6 +1236,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, u64 addr = (flags & IB_MR_REREG_TRANS) ? virt_addr : mr->umem->address; u64 len = (flags & IB_MR_REREG_TRANS) ? length : mr->umem->length; int page_shift = 0; + int upd_flags = 0; int npages = 0; int ncont = 0; int order = 0; @@ -1372,6 +1245,8 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", start, virt_addr, length, access_flags); + atomic_sub(mr->npages, &dev->mdev->priv.reg_pages); + if (flags != IB_MR_REREG_PD) { /* * Replace umem. This needs to be done whether or not UMR is @@ -1382,7 +1257,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, err = mr_umem_get(pd, addr, len, access_flags, &mr->umem, &npages, &page_shift, &ncont, &order); if (err < 0) { - mr->umem = NULL; + clean_mr(mr); return err; } } @@ -1414,32 +1289,37 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, /* * Send a UMR WQE */ - err = rereg_umr(pd, mr, addr, len, npages, page_shift, - order, access_flags, flags); + mr->ibmr.pd = pd; + mr->access_flags = access_flags; + mr->mmkey.iova = addr; + mr->mmkey.size = len; + mr->mmkey.pd = to_mpd(pd)->pdn; + + if (flags & IB_MR_REREG_TRANS) { + upd_flags = MLX5_IB_UPD_XLT_ADDR; + if (flags & IB_MR_REREG_PD) + upd_flags |= MLX5_IB_UPD_XLT_PD; + if (flags & IB_MR_REREG_ACCESS) + upd_flags |= MLX5_IB_UPD_XLT_ACCESS; + err = mlx5_ib_update_xlt(mr, 0, npages, page_shift, + upd_flags); + } else { + err = rereg_umr(pd, mr, access_flags, flags); + } + if (err) { mlx5_ib_warn(dev, "Failed to rereg UMR\n"); + ib_umem_release(mr->umem); + clean_mr(mr); return err; } } - if (flags & IB_MR_REREG_PD) { - ib_mr->pd = pd; - mr->mmkey.pd = to_mpd(pd)->pdn; - } + set_mr_fileds(dev, mr, npages, len, access_flags); - if (flags & IB_MR_REREG_ACCESS) - mr->access_flags = access_flags; - - if (flags & IB_MR_REREG_TRANS) { - atomic_sub(mr->npages, &dev->mdev->priv.reg_pages); - set_mr_fileds(dev, mr, npages, len, access_flags); - mr->mmkey.iova = addr; - mr->mmkey.size = len; - } #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING update_odp_mr(mr); #endif - return 0; } @@ -1603,11 +1483,11 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, mr->access_mode = MLX5_MKC_ACCESS_MODE_MTT; MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); err = mlx5_alloc_priv_descs(pd->device, mr, - ndescs, sizeof(u64)); + ndescs, sizeof(struct mlx5_mtt)); if (err) goto err_free_in; - mr->desc_size = sizeof(u64); + mr->desc_size = sizeof(struct mlx5_mtt); mr->max_descs = ndescs; } else if (mr_type == IB_MR_TYPE_SG_GAPS) { mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS; @@ -1656,6 +1536,7 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, if (err) goto err_destroy_psv; + mr->mmkey.type = MLX5_MKEY_MR; mr->ibmr.lkey = mr->mmkey.key; mr->ibmr.rkey = mr->mmkey.key; mr->umem = NULL; @@ -1736,6 +1617,7 @@ struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, if (err) goto free; + mw->mmkey.type = MLX5_MKEY_MW; mw->ibmw.rkey = mw->mmkey.key; resp.response_length = min(offsetof(typeof(resp), response_length) + diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index cacb631a7b0a..e5bc267aca73 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -41,13 +41,12 @@ * a pagefault. */ #define MMU_NOTIFIER_TIMEOUT 1000 -struct workqueue_struct *mlx5_ib_page_fault_wq; - void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, unsigned long end) { struct mlx5_ib_mr *mr; - const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1; + const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / + sizeof(struct mlx5_mtt)) - 1; u64 idx = 0, blk_start_idx = 0; int in_block = 0; u64 addr; @@ -90,16 +89,21 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, u64 umr_offset = idx & umr_block_mask; if (in_block && umr_offset == 0) { - mlx5_ib_update_mtt(mr, blk_start_idx, - idx - blk_start_idx, 1); + mlx5_ib_update_xlt(mr, blk_start_idx, + idx - blk_start_idx, + PAGE_SHIFT, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ATOMIC); in_block = 0; } } } if (in_block) - mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1, - 1); - + mlx5_ib_update_xlt(mr, blk_start_idx, + idx - blk_start_idx + 1, + PAGE_SHIFT, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ATOMIC); /* * We are now sure that the device will not access the * memory. We can safely unmap it, and mark it as dirty if @@ -120,6 +124,11 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) caps->general_caps = IB_ODP_SUPPORT; + if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) + dev->odp_max_size = U64_MAX; + else + dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT); + if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send)) caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; @@ -135,6 +144,9 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic)) + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; + return; } @@ -143,46 +155,51 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, { u32 base_key = mlx5_base_mkey(key); struct mlx5_core_mkey *mmkey = __mlx5_mr_lookup(dev->mdev, base_key); - struct mlx5_ib_mr *mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + struct mlx5_ib_mr *mr; - if (!mmkey || mmkey->key != key || !mr->live) + if (!mmkey || mmkey->key != key || mmkey->type != MLX5_MKEY_MR) + return NULL; + + mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + + if (!mr->live) return NULL; return container_of(mmkey, struct mlx5_ib_mr, mmkey); } -static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp, - struct mlx5_ib_pfault *pfault, +static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault, int error) { - struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); - u32 qpn = qp->trans_qp.base.mqp.qpn; + int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? + pfault->wqe.wq_num : pfault->token; int ret = mlx5_core_page_fault_resume(dev->mdev, - qpn, - pfault->mpfault.flags, + pfault->token, + wq_num, + pfault->type, error); if (ret) - pr_err("Failed to resolve the page fault on QP 0x%x\n", qpn); + mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n", + wq_num); } /* - * Handle a single data segment in a page-fault WQE. + * Handle a single data segment in a page-fault WQE or RDMA region. * - * Returns number of pages retrieved on success. The caller will continue to + * Returns number of pages retrieved on success. The caller may continue to * the next data segment. * Can return the following error codes: * -EAGAIN to designate a temporary error. The caller will abort handling the * page fault and resolve it. * -EFAULT when there's an error mapping the requested pages. The caller will - * abort the page fault handling and possibly move the QP to an error state. - * On other errors the QP should also be closed with an error. + * abort the page fault handling. */ -static int pagefault_single_data_segment(struct mlx5_ib_qp *qp, - struct mlx5_ib_pfault *pfault, +static int pagefault_single_data_segment(struct mlx5_ib_dev *mib_dev, u32 key, u64 io_virt, size_t bcnt, + u32 *bytes_committed, u32 *bytes_mapped) { - struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device); int srcu_key; unsigned int current_seq; u64 start_idx; @@ -208,12 +225,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp, key); if (bytes_mapped) *bytes_mapped += - (bcnt - pfault->mpfault.bytes_committed); - goto srcu_unlock; - } - if (mr->ibmr.pd != qp->ibqp.pd) { - pr_err("Page-fault with different PDs for QP and MR.\n"); - ret = -EFAULT; + (bcnt - *bytes_committed); goto srcu_unlock; } @@ -229,8 +241,8 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp, * in all iterations (in iteration 2 and above, * bytes_committed == 0). */ - io_virt += pfault->mpfault.bytes_committed; - bcnt -= pfault->mpfault.bytes_committed; + io_virt += *bytes_committed; + bcnt -= *bytes_committed; start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT; @@ -251,7 +263,9 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp, * this MR, since ib_umem_odp_map_dma_pages already * checks this. */ - ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0); + ret = mlx5_ib_update_xlt(mr, start_idx, npages, + PAGE_SHIFT, + MLX5_IB_UPD_XLT_ATOMIC); } else { ret = -EAGAIN; } @@ -287,7 +301,7 @@ srcu_unlock: } } srcu_read_unlock(&mib_dev->mr_srcu, srcu_key); - pfault->mpfault.bytes_committed = 0; + *bytes_committed = 0; return ret ? ret : npages; } @@ -309,8 +323,9 @@ srcu_unlock: * Returns the number of pages loaded if positive, zero for an empty WQE, or a * negative error code. */ -static int pagefault_data_segments(struct mlx5_ib_qp *qp, - struct mlx5_ib_pfault *pfault, void *wqe, +static int pagefault_data_segments(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault, + struct mlx5_ib_qp *qp, void *wqe, void *wqe_end, u32 *bytes_mapped, u32 *total_wqe_bytes, int receive_queue) { @@ -354,22 +369,23 @@ static int pagefault_data_segments(struct mlx5_ib_qp *qp, if (!inline_segment && total_wqe_bytes) { *total_wqe_bytes += bcnt - min_t(size_t, bcnt, - pfault->mpfault.bytes_committed); + pfault->bytes_committed); } /* A zero length data segment designates a length of 2GB. */ if (bcnt == 0) bcnt = 1U << 31; - if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) { - pfault->mpfault.bytes_committed -= + if (inline_segment || bcnt <= pfault->bytes_committed) { + pfault->bytes_committed -= min_t(size_t, bcnt, - pfault->mpfault.bytes_committed); + pfault->bytes_committed); continue; } - ret = pagefault_single_data_segment(qp, pfault, key, io_virt, - bcnt, bytes_mapped); + ret = pagefault_single_data_segment(dev, key, io_virt, bcnt, + &pfault->bytes_committed, + bytes_mapped); if (ret < 0) break; npages += ret; @@ -378,17 +394,29 @@ static int pagefault_data_segments(struct mlx5_ib_qp *qp, return ret < 0 ? ret : npages; } +static const u32 mlx5_ib_odp_opcode_cap[] = { + [MLX5_OPCODE_SEND] = IB_ODP_SUPPORT_SEND, + [MLX5_OPCODE_SEND_IMM] = IB_ODP_SUPPORT_SEND, + [MLX5_OPCODE_SEND_INVAL] = IB_ODP_SUPPORT_SEND, + [MLX5_OPCODE_RDMA_WRITE] = IB_ODP_SUPPORT_WRITE, + [MLX5_OPCODE_RDMA_WRITE_IMM] = IB_ODP_SUPPORT_WRITE, + [MLX5_OPCODE_RDMA_READ] = IB_ODP_SUPPORT_READ, + [MLX5_OPCODE_ATOMIC_CS] = IB_ODP_SUPPORT_ATOMIC, + [MLX5_OPCODE_ATOMIC_FA] = IB_ODP_SUPPORT_ATOMIC, +}; + /* * Parse initiator WQE. Advances the wqe pointer to point at the * scatter-gather list, and set wqe_end to the end of the WQE. */ static int mlx5_ib_mr_initiator_pfault_handler( - struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, - void **wqe, void **wqe_end, int wqe_length) + struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault, + struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length) { - struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); struct mlx5_wqe_ctrl_seg *ctrl = *wqe; - u16 wqe_index = pfault->mpfault.wqe.wqe_index; + u16 wqe_index = pfault->wqe.wqe_index; + u32 transport_caps; + struct mlx5_base_av *av; unsigned ds, opcode; #if defined(DEBUG) u32 ctrl_wqe_index, ctrl_qpn; @@ -434,53 +462,49 @@ static int mlx5_ib_mr_initiator_pfault_handler( opcode = be32_to_cpu(ctrl->opmod_idx_opcode) & MLX5_WQE_CTRL_OPCODE_MASK; + switch (qp->ibqp.qp_type) { case IB_QPT_RC: - switch (opcode) { - case MLX5_OPCODE_SEND: - case MLX5_OPCODE_SEND_IMM: - case MLX5_OPCODE_SEND_INVAL: - if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & - IB_ODP_SUPPORT_SEND)) - goto invalid_transport_or_opcode; - break; - case MLX5_OPCODE_RDMA_WRITE: - case MLX5_OPCODE_RDMA_WRITE_IMM: - if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & - IB_ODP_SUPPORT_WRITE)) - goto invalid_transport_or_opcode; - *wqe += sizeof(struct mlx5_wqe_raddr_seg); - break; - case MLX5_OPCODE_RDMA_READ: - if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & - IB_ODP_SUPPORT_READ)) - goto invalid_transport_or_opcode; - *wqe += sizeof(struct mlx5_wqe_raddr_seg); - break; - default: - goto invalid_transport_or_opcode; - } + transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps; break; case IB_QPT_UD: - switch (opcode) { - case MLX5_OPCODE_SEND: - case MLX5_OPCODE_SEND_IMM: - if (!(dev->odp_caps.per_transport_caps.ud_odp_caps & - IB_ODP_SUPPORT_SEND)) - goto invalid_transport_or_opcode; - *wqe += sizeof(struct mlx5_wqe_datagram_seg); - break; - default: - goto invalid_transport_or_opcode; - } + transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps; break; default: -invalid_transport_or_opcode: - mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n", - qp->ibqp.qp_type, opcode); + mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n", + qp->ibqp.qp_type); return -EFAULT; } + if (unlikely(opcode >= sizeof(mlx5_ib_odp_opcode_cap) / + sizeof(mlx5_ib_odp_opcode_cap[0]) || + !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) { + mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n", + opcode); + return -EFAULT; + } + + if (qp->ibqp.qp_type != IB_QPT_RC) { + av = *wqe; + if (av->dqp_dct & be32_to_cpu(MLX5_WQE_AV_EXT)) + *wqe += sizeof(struct mlx5_av); + else + *wqe += sizeof(struct mlx5_base_av); + } + + switch (opcode) { + case MLX5_OPCODE_RDMA_WRITE: + case MLX5_OPCODE_RDMA_WRITE_IMM: + case MLX5_OPCODE_RDMA_READ: + *wqe += sizeof(struct mlx5_wqe_raddr_seg); + break; + case MLX5_OPCODE_ATOMIC_CS: + case MLX5_OPCODE_ATOMIC_FA: + *wqe += sizeof(struct mlx5_wqe_raddr_seg); + *wqe += sizeof(struct mlx5_wqe_atomic_seg); + break; + } + return 0; } @@ -489,10 +513,9 @@ invalid_transport_or_opcode: * scatter-gather list, and set wqe_end to the end of the WQE. */ static int mlx5_ib_mr_responder_pfault_handler( - struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, - void **wqe, void **wqe_end, int wqe_length) + struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault, + struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length) { - struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); struct mlx5_ib_wq *wq = &qp->rq; int wqe_size = 1 << wq->wqe_shift; @@ -529,70 +552,83 @@ invalid_transport_or_opcode: return 0; } -static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, - struct mlx5_ib_pfault *pfault) +static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev, + u32 wq_num) +{ + struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num); + + if (!mqp) { + mlx5_ib_err(dev, "QPN 0x%6x not found\n", wq_num); + return NULL; + } + + return to_mibqp(mqp); +} + +static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault) { - struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); int ret; void *wqe, *wqe_end; u32 bytes_mapped, total_wqe_bytes; char *buffer = NULL; - int resume_with_error = 0; - u16 wqe_index = pfault->mpfault.wqe.wqe_index; - int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR; - u32 qpn = qp->trans_qp.base.mqp.qpn; + int resume_with_error = 1; + u16 wqe_index = pfault->wqe.wqe_index; + int requestor = pfault->type & MLX5_PFAULT_REQUESTOR; + struct mlx5_ib_qp *qp; buffer = (char *)__get_free_page(GFP_KERNEL); if (!buffer) { mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n"); - resume_with_error = 1; goto resolve_page_fault; } + qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num); + if (!qp) + goto resolve_page_fault; + ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer, PAGE_SIZE, &qp->trans_qp.base); if (ret < 0) { - mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n", - -ret, wqe_index, qpn); - resume_with_error = 1; + mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n", + ret, wqe_index, pfault->token); goto resolve_page_fault; } wqe = buffer; if (requestor) - ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe, + ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe, &wqe_end, ret); else - ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe, + ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe, &wqe_end, ret); - if (ret < 0) { - resume_with_error = 1; + if (ret < 0) goto resolve_page_fault; - } if (wqe >= wqe_end) { mlx5_ib_err(dev, "ODP fault on invalid WQE.\n"); - resume_with_error = 1; goto resolve_page_fault; } - ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped, - &total_wqe_bytes, !requestor); + ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end, + &bytes_mapped, &total_wqe_bytes, + !requestor); if (ret == -EAGAIN) { + resume_with_error = 0; goto resolve_page_fault; } else if (ret < 0 || total_wqe_bytes > bytes_mapped) { - mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n", - -ret); - resume_with_error = 1; + if (ret != -ENOENT) + mlx5_ib_err(dev, "Error getting user pages for page fault. Error: %d\n", + ret); goto resolve_page_fault; } + resume_with_error = 0; resolve_page_fault: - mlx5_ib_page_fault_resume(qp, pfault, resume_with_error); - mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n", - qpn, resume_with_error, - pfault->mpfault.flags); - + mlx5_ib_page_fault_resume(dev, pfault, resume_with_error); + mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n", + pfault->token, resume_with_error, + pfault->type); free_page((unsigned long)buffer); } @@ -602,15 +638,14 @@ static int pages_in_range(u64 address, u32 length) (address & PAGE_MASK)) >> PAGE_SHIFT; } -static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp, - struct mlx5_ib_pfault *pfault) +static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault) { - struct mlx5_pagefault *mpfault = &pfault->mpfault; u64 address; u32 length; - u32 prefetch_len = mpfault->bytes_committed; + u32 prefetch_len = pfault->bytes_committed; int prefetch_activated = 0; - u32 rkey = mpfault->rdma.r_key; + u32 rkey = pfault->rdma.r_key; int ret; /* The RDMA responder handler handles the page fault in two parts. @@ -619,38 +654,40 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp, * prefetches more pages. The second operation cannot use the pfault * context and therefore uses the dummy_pfault context allocated on * the stack */ - struct mlx5_ib_pfault dummy_pfault = {}; - - dummy_pfault.mpfault.bytes_committed = 0; + pfault->rdma.rdma_va += pfault->bytes_committed; + pfault->rdma.rdma_op_len -= min(pfault->bytes_committed, + pfault->rdma.rdma_op_len); + pfault->bytes_committed = 0; - mpfault->rdma.rdma_va += mpfault->bytes_committed; - mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed, - mpfault->rdma.rdma_op_len); - mpfault->bytes_committed = 0; - - address = mpfault->rdma.rdma_va; - length = mpfault->rdma.rdma_op_len; + address = pfault->rdma.rdma_va; + length = pfault->rdma.rdma_op_len; /* For some operations, the hardware cannot tell the exact message * length, and in those cases it reports zero. Use prefetch * logic. */ if (length == 0) { prefetch_activated = 1; - length = mpfault->rdma.packet_size; + length = pfault->rdma.packet_size; prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len); } - ret = pagefault_single_data_segment(qp, pfault, rkey, address, length, - NULL); + ret = pagefault_single_data_segment(dev, rkey, address, length, + &pfault->bytes_committed, NULL); if (ret == -EAGAIN) { /* We're racing with an invalidation, don't prefetch */ prefetch_activated = 0; } else if (ret < 0 || pages_in_range(address, length) > ret) { - mlx5_ib_page_fault_resume(qp, pfault, 1); + mlx5_ib_page_fault_resume(dev, pfault, 1); + if (ret != -ENOENT) + mlx5_ib_warn(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n", + ret, pfault->token, pfault->type); return; } - mlx5_ib_page_fault_resume(qp, pfault, 0); + mlx5_ib_page_fault_resume(dev, pfault, 0); + mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n", + pfault->token, pfault->type, + prefetch_activated); /* At this point, there might be a new pagefault already arriving in * the eq, switch to the dummy pagefault for the rest of the @@ -658,112 +695,39 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp, * work-queue is being fenced. */ if (prefetch_activated) { - ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey, - address, + u32 bytes_committed = 0; + + ret = pagefault_single_data_segment(dev, rkey, address, prefetch_len, - NULL); + &bytes_committed, NULL); if (ret < 0) { - pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n", - ret, prefetch_activated, - qp->ibqp.qp_num, address, prefetch_len); + mlx5_ib_warn(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", + ret, pfault->token, address, + prefetch_len); } } } -void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, - struct mlx5_ib_pfault *pfault) +void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, + struct mlx5_pagefault *pfault) { - u8 event_subtype = pfault->mpfault.event_subtype; + struct mlx5_ib_dev *dev = context; + u8 event_subtype = pfault->event_subtype; switch (event_subtype) { case MLX5_PFAULT_SUBTYPE_WQE: - mlx5_ib_mr_wqe_pfault_handler(qp, pfault); + mlx5_ib_mr_wqe_pfault_handler(dev, pfault); break; case MLX5_PFAULT_SUBTYPE_RDMA: - mlx5_ib_mr_rdma_pfault_handler(qp, pfault); + mlx5_ib_mr_rdma_pfault_handler(dev, pfault); break; default: - pr_warn("Invalid page fault event subtype: 0x%x\n", - event_subtype); - mlx5_ib_page_fault_resume(qp, pfault, 1); - break; + mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n", + event_subtype); + mlx5_ib_page_fault_resume(dev, pfault, 1); } } -static void mlx5_ib_qp_pfault_action(struct work_struct *work) -{ - struct mlx5_ib_pfault *pfault = container_of(work, - struct mlx5_ib_pfault, - work); - enum mlx5_ib_pagefault_context context = - mlx5_ib_get_pagefault_context(&pfault->mpfault); - struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp, - pagefaults[context]); - mlx5_ib_mr_pfault_handler(qp, pfault); -} - -void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) -{ - unsigned long flags; - - spin_lock_irqsave(&qp->disable_page_faults_lock, flags); - qp->disable_page_faults = 1; - spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); - - /* - * Note that at this point, we are guarenteed that no more - * work queue elements will be posted to the work queue with - * the QP we are closing. - */ - flush_workqueue(mlx5_ib_page_fault_wq); -} - -void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) -{ - unsigned long flags; - - spin_lock_irqsave(&qp->disable_page_faults_lock, flags); - qp->disable_page_faults = 0; - spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); -} - -static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp, - struct mlx5_pagefault *pfault) -{ - /* - * Note that we will only get one fault event per QP per context - * (responder/initiator, read/write), until we resolve the page fault - * with the mlx5_ib_page_fault_resume command. Since this function is - * called from within the work element, there is no risk of missing - * events. - */ - struct mlx5_ib_qp *mibqp = to_mibqp(qp); - enum mlx5_ib_pagefault_context context = - mlx5_ib_get_pagefault_context(pfault); - struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context]; - - qp_pfault->mpfault = *pfault; - - /* No need to stop interrupts here since we are in an interrupt */ - spin_lock(&mibqp->disable_page_faults_lock); - if (!mibqp->disable_page_faults) - queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work); - spin_unlock(&mibqp->disable_page_faults_lock); -} - -void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) -{ - int i; - - qp->disable_page_faults = 1; - spin_lock_init(&qp->disable_page_faults_lock); - - qp->trans_qp.base.mqp.pfault_handler = mlx5_ib_pfault_handler; - - for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i) - INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action); -} - int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { int ret; @@ -780,17 +744,3 @@ void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) cleanup_srcu_struct(&ibdev->mr_srcu); } -int __init mlx5_ib_odp_init(void) -{ - mlx5_ib_page_fault_wq = alloc_ordered_workqueue("mlx5_ib_page_faults", - WQ_MEM_RECLAIM); - if (!mlx5_ib_page_fault_wq) - return -ENOMEM; - - return 0; -} - -void mlx5_ib_odp_cleanup(void) -{ - destroy_workqueue(mlx5_ib_page_fault_wq); -} diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index a1b3125f0a6e..53f4dd32f956 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1526,9 +1526,6 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, &qp->raw_packet_qp.rq.base : &qp->trans_qp.base; - if (init_attr->qp_type != IB_QPT_RAW_PACKET) - mlx5_ib_odp_create_qp(qp); - mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); @@ -1923,7 +1920,6 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) if (qp->state != IB_QPS_RESET) { if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) { - mlx5_ib_qp_disable_pagefaults(qp); err = mlx5_core_qp_modify(dev->mdev, MLX5_CMD_OP_2RST_QP, 0, NULL, &base->mqp); @@ -2823,16 +2819,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (mlx5_st < 0) goto out; - /* If moving to a reset or error state, we must disable page faults on - * this QP and flush all current page faults. Otherwise a stale page - * fault may attempt to work on this QP after it is reset and moved - * again to RTS, and may cause the driver and the device to get out of - * sync. */ - if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && - (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR) && - (qp->ibqp.qp_type != IB_QPT_RAW_PACKET)) - mlx5_ib_qp_disable_pagefaults(qp); - if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE || !optab[mlx5_cur][mlx5_new]) goto out; @@ -2864,10 +2850,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (err) goto out; - if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT && - (qp->ibqp.qp_type != IB_QPT_RAW_PACKET)) - mlx5_ib_qp_enable_pagefaults(qp); - qp->state = new_state; if (attr_mask & IB_QP_ACCESS_FLAGS) @@ -3080,9 +3062,10 @@ static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg) dseg->addr = cpu_to_be64(sg->addr); } -static __be16 get_klm_octo(int npages) +static u64 get_xlt_octo(u64 bytes) { - return cpu_to_be16(ALIGN(npages, 8) / 2); + return ALIGN(bytes, MLX5_IB_UMR_XLT_ALIGNMENT) / + MLX5_IB_UMR_OCTOWORD; } static __be64 frwr_mkey_mask(void) @@ -3127,18 +3110,14 @@ static __be64 sig_mkey_mask(void) } static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, - struct mlx5_ib_mr *mr) + struct mlx5_ib_mr *mr) { - int ndescs = mr->ndescs; + int size = mr->ndescs * mr->desc_size; memset(umr, 0, sizeof(*umr)); - if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS) - /* KLMs take twice the size of MTTs */ - ndescs *= 2; - umr->flags = MLX5_UMR_CHECK_NOT_FREE; - umr->klm_octowords = get_klm_octo(ndescs); + umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); umr->mkey_mask = frwr_mkey_mask(); } @@ -3149,37 +3128,17 @@ static void set_linv_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr) umr->flags = MLX5_UMR_INLINE; } -static __be64 get_umr_reg_mr_mask(int atomic) +static __be64 get_umr_enable_mr_mask(void) { u64 result; - result = MLX5_MKEY_MASK_LEN | - MLX5_MKEY_MASK_PAGE_SIZE | - MLX5_MKEY_MASK_START_ADDR | - MLX5_MKEY_MASK_PD | - MLX5_MKEY_MASK_LR | - MLX5_MKEY_MASK_LW | - MLX5_MKEY_MASK_KEY | - MLX5_MKEY_MASK_RR | - MLX5_MKEY_MASK_RW | + result = MLX5_MKEY_MASK_KEY | MLX5_MKEY_MASK_FREE; - if (atomic) - result |= MLX5_MKEY_MASK_A; - - return cpu_to_be64(result); -} - -static __be64 get_umr_unreg_mr_mask(void) -{ - u64 result; - - result = MLX5_MKEY_MASK_FREE; - return cpu_to_be64(result); } -static __be64 get_umr_update_mtt_mask(void) +static __be64 get_umr_disable_mr_mask(void) { u64 result; @@ -3194,23 +3153,22 @@ static __be64 get_umr_update_translation_mask(void) result = MLX5_MKEY_MASK_LEN | MLX5_MKEY_MASK_PAGE_SIZE | - MLX5_MKEY_MASK_START_ADDR | - MLX5_MKEY_MASK_KEY | - MLX5_MKEY_MASK_FREE; + MLX5_MKEY_MASK_START_ADDR; return cpu_to_be64(result); } -static __be64 get_umr_update_access_mask(void) +static __be64 get_umr_update_access_mask(int atomic) { u64 result; - result = MLX5_MKEY_MASK_LW | + result = MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | MLX5_MKEY_MASK_RR | - MLX5_MKEY_MASK_RW | - MLX5_MKEY_MASK_A | - MLX5_MKEY_MASK_KEY | - MLX5_MKEY_MASK_FREE; + MLX5_MKEY_MASK_RW; + + if (atomic) + result |= MLX5_MKEY_MASK_A; return cpu_to_be64(result); } @@ -3219,9 +3177,7 @@ static __be64 get_umr_update_pd_mask(void) { u64 result; - result = MLX5_MKEY_MASK_PD | - MLX5_MKEY_MASK_KEY | - MLX5_MKEY_MASK_FREE; + result = MLX5_MKEY_MASK_PD; return cpu_to_be64(result); } @@ -3238,24 +3194,24 @@ static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, else umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */ - if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) { - umr->klm_octowords = get_klm_octo(umrwr->npages); - if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT) { - umr->mkey_mask = get_umr_update_mtt_mask(); - umr->bsf_octowords = get_klm_octo(umrwr->target.offset); - umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; - } - if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION) - umr->mkey_mask |= get_umr_update_translation_mask(); - if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_ACCESS) - umr->mkey_mask |= get_umr_update_access_mask(); - if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD) - umr->mkey_mask |= get_umr_update_pd_mask(); - if (!umr->mkey_mask) - umr->mkey_mask = get_umr_reg_mr_mask(atomic); - } else { - umr->mkey_mask = get_umr_unreg_mr_mask(); + umr->xlt_octowords = cpu_to_be16(get_xlt_octo(umrwr->xlt_size)); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_XLT) { + u64 offset = get_xlt_octo(umrwr->offset); + + umr->xlt_offset = cpu_to_be16(offset & 0xffff); + umr->xlt_offset_47_16 = cpu_to_be32(offset >> 16); + umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; + } + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION) + umr->mkey_mask |= get_umr_update_translation_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS) { + umr->mkey_mask |= get_umr_update_access_mask(atomic); + umr->mkey_mask |= get_umr_update_pd_mask(); } + if (wr->send_flags & MLX5_IB_SEND_UMR_ENABLE_MR) + umr->mkey_mask |= get_umr_enable_mr_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR) + umr->mkey_mask |= get_umr_disable_mr_mask(); if (!wr->num_sge) umr->flags |= MLX5_UMR_INLINE; @@ -3303,17 +3259,17 @@ static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *w struct mlx5_umr_wr *umrwr = umr_wr(wr); memset(seg, 0, sizeof(*seg)); - if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) { + if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR) seg->status = MLX5_MKEY_STATUS_FREE; - return; - } seg->flags = convert_access(umrwr->access_flags); - if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) { - if (umrwr->pd) - seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); - seg->start_addr = cpu_to_be64(umrwr->target.virt_addr); - } + if (umrwr->pd) + seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION && + !umrwr->length) + seg->flags_pd |= cpu_to_be32(MLX5_MKEY_LEN64); + + seg->start_addr = cpu_to_be64(umrwr->virt_addr); seg->len = cpu_to_be64(umrwr->length); seg->log2_page_size = umrwr->page_shift; seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | @@ -3611,7 +3567,7 @@ static int set_sig_data_segment(struct ib_sig_handover_wr *wr, } static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, - struct ib_sig_handover_wr *wr, u32 nelements, + struct ib_sig_handover_wr *wr, u32 size, u32 length, u32 pdn) { struct ib_mr *sig_mr = wr->sig_mr; @@ -3626,17 +3582,17 @@ static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | MLX5_MKEY_BSF_EN | pdn); seg->len = cpu_to_be64(length); - seg->xlt_oct_size = cpu_to_be32(be16_to_cpu(get_klm_octo(nelements))); + seg->xlt_oct_size = cpu_to_be32(get_xlt_octo(size)); seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); } static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, - u32 nelements) + u32 size) { memset(umr, 0, sizeof(*umr)); umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE; - umr->klm_octowords = get_klm_octo(nelements); + umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE); umr->mkey_mask = sig_mkey_mask(); } @@ -3648,7 +3604,7 @@ static int set_sig_umr_wr(struct ib_send_wr *send_wr, struct mlx5_ib_qp *qp, struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr); struct mlx5_ib_mr *sig_mr = to_mmr(wr->sig_mr); u32 pdn = get_pd(qp)->pdn; - u32 klm_oct_size; + u32 xlt_size; int region_len, ret; if (unlikely(wr->wr.num_sge != 1) || @@ -3670,15 +3626,15 @@ static int set_sig_umr_wr(struct ib_send_wr *send_wr, struct mlx5_ib_qp *qp, * then we use strided block format (3 octowords), * else we use single KLM (1 octoword) **/ - klm_oct_size = wr->prot ? 3 : 1; + xlt_size = wr->prot ? 0x30 : sizeof(struct mlx5_klm); - set_sig_umr_segment(*seg, klm_oct_size); + set_sig_umr_segment(*seg, xlt_size); *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; if (unlikely((*seg == qp->sq.qend))) *seg = mlx5_get_send_wqe(qp, 0); - set_sig_mkey_segment(*seg, wr, klm_oct_size, region_len, pdn); + set_sig_mkey_segment(*seg, wr, xlt_size, region_len, pdn); *seg += sizeof(struct mlx5_mkey_seg); *size += sizeof(struct mlx5_mkey_seg) / 16; if (unlikely((*seg == qp->sq.qend))) @@ -4559,14 +4515,6 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask, qp_init_attr); -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - /* - * Wait for any outstanding page faults, in case the user frees memory - * based upon this query's result. - */ - flush_workqueue(mlx5_ib_page_fault_wq); -#endif - mutex_lock(&qp->mutex); if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { |