From 746b5583c1a48a837f4891adaff5e09d61b204a6 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 23 Oct 2013 09:53:14 +0300 Subject: IB/mlx5: Multithreaded create MR Use asynchronous commands to execute up to eight concurrent create MR commands. This is to fill memory caches faster so we keep consuming from there. Also, increase timeout for shrinking caches to five minutes. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- include/linux/mlx5/driver.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 6b8c496572c8..513619a75695 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -557,9 +557,11 @@ typedef void (*mlx5_cmd_cbk_t)(int status, void *context); struct mlx5_cmd_work_ent { struct mlx5_cmd_msg *in; struct mlx5_cmd_msg *out; + void *uout; + int uout_size; mlx5_cmd_cbk_t callback; void *context; - int idx; + int idx; struct completion done; struct mlx5_cmd *cmd; struct work_struct work; @@ -570,6 +572,7 @@ struct mlx5_cmd_work_ent { u8 token; struct timespec ts1; struct timespec ts2; + u16 op; }; struct mlx5_pas { @@ -653,6 +656,9 @@ void mlx5_cmd_use_polling(struct mlx5_core_dev *dev); int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr); int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); +int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size, + void *out, int out_size, mlx5_cmd_cbk_t callback, + void *context); int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn); int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn); int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari); @@ -676,7 +682,9 @@ int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, u16 lwm, int is_srq); int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, - struct mlx5_create_mkey_mbox_in *in, int inlen); + struct mlx5_create_mkey_mbox_in *in, int inlen, + mlx5_cmd_cbk_t callback, void *context, + struct mlx5_create_mkey_mbox_out *out); int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr); int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, struct mlx5_query_mkey_mbox_out *out, int outlen); @@ -745,6 +753,11 @@ static inline u32 mlx5_idx_to_mkey(u32 mkey_idx) return mkey_idx << 8; } +static inline u8 mlx5_mkey_variant(u32 mkey) +{ + return mkey & 0xff; +} + enum { MLX5_PROF_MASK_QP_SIZE = (u64)1 << 0, MLX5_PROF_MASK_MR_CACHE = (u64)1 << 1, -- cgit v1.2.3 From bf0bf77f6519e5dcd57a77b47e1d151c1e81b7ec Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 23 Oct 2013 09:53:19 +0300 Subject: mlx5: Support communicating arbitrary host page size to firmware Connect-IB firmware requires 4K pages to be communicated with the driver. This patch breaks larger pages to 4K units to enable support for architectures utilizing larger page size, such as PowerPC. This patch also fixes several places that referred to PAGE_SHIFT instead of explicit 12 which is the inherent page shift on Connect-IB. 
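For illustration only, here is a minimal userspace sketch (not part of the patch) of the bookkeeping idea behind the alloc_4k()/free_4k() helpers this patch introduces: one host page of PAGE_SIZE bytes is carved into PAGE_SIZE/4096 firmware-sized units, a bitmask records which units are free, and the address handed to firmware is the page's DMA address plus the unit index times 4096. The 64K page size, the struct name and the made-up base address below are illustrative assumptions, not values taken from the driver.

#include <stdint.h>
#include <stdio.h>

/* Illustrative values only: a 64K host page carved into 4K firmware units. */
#define HOST_PAGE_SIZE  65536UL
#define FW_UNIT_SIZE    4096UL
#define UNITS_PER_PAGE  (HOST_PAGE_SIZE / FW_UNIT_SIZE)

struct fw_page_sketch {
	uint64_t      base;        /* DMA address of the host page */
	unsigned long free_mask;   /* bit i set => 4K unit i is free */
	unsigned int  free_count;
};

/* Hand out one free 4K unit; returns 0 and sets *addr, or -1 if the page is used up. */
static int alloc_unit(struct fw_page_sketch *fp, uint64_t *addr)
{
	unsigned int i;

	for (i = 0; i < UNITS_PER_PAGE; i++) {
		if (fp->free_mask & (1UL << i)) {
			fp->free_mask &= ~(1UL << i);
			fp->free_count--;
			*addr = fp->base + (uint64_t)i * FW_UNIT_SIZE;
			return 0;
		}
	}
	return -1;
}

/* Return a 4K unit to the host page it was carved from. */
static void free_unit(struct fw_page_sketch *fp, uint64_t addr)
{
	unsigned int i = (unsigned int)((addr - fp->base) / FW_UNIT_SIZE);

	fp->free_mask |= 1UL << i;
	fp->free_count++;
}

int main(void)
{
	struct fw_page_sketch fp = {
		.base = 0x100000,  /* made-up DMA address */
		.free_mask = (1UL << UNITS_PER_PAGE) - 1,
		.free_count = UNITS_PER_PAGE,
	};
	uint64_t a;

	while (alloc_unit(&fp, &a) == 0)
		printf("gave firmware 4K unit at 0x%llx\n", (unsigned long long)a);
	free_unit(&fp, fp.base + 2 * FW_UNIT_SIZE);
	printf("units free: %u\n", fp.free_count);
	return 0;
}

In the driver itself the host page is only released back to the system once every 4K unit carved from it has been reclaimed, which is the role of the free_count tracking in free_4k() below.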
Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/cq.c | 2 +- drivers/infiniband/hw/mlx5/qp.c | 4 +- drivers/infiniband/hw/mlx5/srq.c | 4 +- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 2 +- .../net/ethernet/mellanox/mlx5/core/pagealloc.c | 174 ++++++++++++++------- include/linux/mlx5/driver.h | 1 + 6 files changed, 127 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 84591865e39d..eecac7b52bcb 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -620,7 +620,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, } mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas); - (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - PAGE_SHIFT; + (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - 12; *index = dev->mdev.priv.uuari.uars[0].index; return 0; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 8a36fd78c89f..ce14008a5702 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -551,7 +551,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, } mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); (*in)->ctx.log_pg_sz_remote_qpn = - cpu_to_be32((page_shift - PAGE_SHIFT) << 24); + cpu_to_be32((page_shift - 12) << 24); (*in)->ctx.params2 = cpu_to_be32(offset << 6); (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); @@ -648,7 +648,7 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, goto err_buf; } (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); - (*in)->ctx.log_pg_sz_remote_qpn = cpu_to_be32((qp->buf.page_shift - PAGE_SHIFT) << 24); + (*in)->ctx.log_pg_sz_remote_qpn = cpu_to_be32((qp->buf.page_shift - 12) << 24); /* Set "fast registration enabled" for all kernel QPs */ (*in)->ctx.params1 |= cpu_to_be32(1 << 11); (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index f1e5845b42b0..dbc2c7188c5e 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -123,7 +123,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, goto err_in; } - (*in)->ctx.log_pg_sz = page_shift - PAGE_SHIFT; + (*in)->ctx.log_pg_sz = page_shift - 12; (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); return 0; @@ -192,7 +192,7 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, } srq->wq_sig = !!srq_signature; - (*in)->ctx.log_pg_sz = page_shift - PAGE_SHIFT; + (*in)->ctx.log_pg_sz = page_shift - 12; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 2231d93cc7ad..6b4b436840bd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -354,7 +354,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_EQ); in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(eq->nent) << 24 | uar->index); in->ctx.intr = vecidx; - in->ctx.log_page_size = PAGE_SHIFT - 12; + in->ctx.log_page_size = eq->buf.page_shift - 12; in->events_mask = cpu_to_be64(mask); err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index a0d0da35578c..013aa422adee 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -57,10 +57,13 @@ struct mlx5_pages_req { }; struct fw_page { - struct rb_node rb_node; - u64 addr; - struct page *page; - u16 func_id; + struct rb_node rb_node; + u64 addr; + struct page *page; + u16 func_id; + unsigned long bitmask; + struct list_head list; + unsigned free_count; }; struct mlx5_query_pages_inbox { @@ -94,6 +97,11 @@ enum { MAX_RECLAIM_TIME_MSECS = 5000, }; +enum { + MLX5_MAX_RECLAIM_TIME_MILI = 5000, + MLX5_NUM_4K_IN_PAGE = PAGE_SIZE / 4096, +}; + static int insert_page(struct mlx5_core_dev *dev, u64 addr, struct page *page, u16 func_id) { struct rb_root *root = &dev->priv.page_root; @@ -101,6 +109,7 @@ static int insert_page(struct mlx5_core_dev *dev, u64 addr, struct page *page, u struct rb_node *parent = NULL; struct fw_page *nfp; struct fw_page *tfp; + int i; while (*new) { parent = *new; @@ -113,25 +122,29 @@ static int insert_page(struct mlx5_core_dev *dev, u64 addr, struct page *page, u return -EEXIST; } - nfp = kmalloc(sizeof(*nfp), GFP_KERNEL); + nfp = kzalloc(sizeof(*nfp), GFP_KERNEL); if (!nfp) return -ENOMEM; nfp->addr = addr; nfp->page = page; nfp->func_id = func_id; + nfp->free_count = MLX5_NUM_4K_IN_PAGE; + for (i = 0; i < MLX5_NUM_4K_IN_PAGE; i++) + set_bit(i, &nfp->bitmask); rb_link_node(&nfp->rb_node, parent, new); rb_insert_color(&nfp->rb_node, root); + list_add(&nfp->list, &dev->priv.free_list); return 0; } -static struct page *remove_page(struct mlx5_core_dev *dev, u64 addr) +static struct fw_page *find_fw_page(struct mlx5_core_dev *dev, u64 addr) { struct rb_root *root = &dev->priv.page_root; struct rb_node *tmp = root->rb_node; - struct page *result = NULL; + struct fw_page *result = NULL; struct fw_page *tfp; while (tmp) { @@ -141,9 +154,7 @@ static struct page *remove_page(struct mlx5_core_dev *dev, u64 addr) } else if (tfp->addr > addr) { tmp = tmp->rb_right; } else { - rb_erase(&tfp->rb_node, root); - result = tfp->page; - kfree(tfp); + result = tfp; break; } } @@ -176,13 +187,97 @@ static int mlx5_cmd_query_pages(struct mlx5_core_dev *dev, u16 *func_id, return err; } +static int alloc_4k(struct mlx5_core_dev *dev, u64 *addr) +{ + struct fw_page *fp; + unsigned n; + + if (list_empty(&dev->priv.free_list)) { + return -ENOMEM; + mlx5_core_warn(dev, "\n"); + } + + fp = list_entry(dev->priv.free_list.next, struct fw_page, list); + n = find_first_bit(&fp->bitmask, 8 * sizeof(fp->bitmask)); + if (n >= MLX5_NUM_4K_IN_PAGE) { + mlx5_core_warn(dev, "alloc 4k bug\n"); + return -ENOENT; + } + clear_bit(n, &fp->bitmask); + fp->free_count--; + if (!fp->free_count) + list_del(&fp->list); + + *addr = fp->addr + n * 4096; + + return 0; +} + +static void free_4k(struct mlx5_core_dev *dev, u64 addr) +{ + struct fw_page *fwp; + int n; + + fwp = find_fw_page(dev, addr & PAGE_MASK); + if (!fwp) { + mlx5_core_warn(dev, "page not found\n"); + return; + } + + n = (addr & ~PAGE_MASK) % 4096; + fwp->free_count++; + set_bit(n, &fwp->bitmask); + if (fwp->free_count == MLX5_NUM_4K_IN_PAGE) { + rb_erase(&fwp->rb_node, &dev->priv.page_root); + list_del(&fwp->list); + dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL); + __free_page(fwp->page); + kfree(fwp); + } else if (fwp->free_count == 1) { + list_add(&fwp->list, &dev->priv.free_list); + } +} + +static int alloc_system_page(struct mlx5_core_dev *dev, u16 func_id) +{ + struct page *page; + u64 addr; + int err; + + page = alloc_page(GFP_HIGHUSER); + if (!page) { + mlx5_core_warn(dev, "failed to 
allocate page\n"); + return -ENOMEM; + } + addr = dma_map_page(&dev->pdev->dev, page, 0, + PAGE_SIZE, DMA_BIDIRECTIONAL); + if (dma_mapping_error(&dev->pdev->dev, addr)) { + mlx5_core_warn(dev, "failed dma mapping page\n"); + err = -ENOMEM; + goto out_alloc; + } + err = insert_page(dev, addr, page, func_id); + if (err) { + mlx5_core_err(dev, "failed to track allocated page\n"); + goto out_mapping; + } + + return 0; + +out_mapping: + dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL); + +out_alloc: + __free_page(page); + + return err; +} static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, int notify_fail) { struct mlx5_manage_pages_inbox *in; struct mlx5_manage_pages_outbox out; struct mlx5_manage_pages_inbox *nin; - struct page *page; int inlen; u64 addr; int err; @@ -197,27 +292,15 @@ static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, memset(&out, 0, sizeof(out)); for (i = 0; i < npages; i++) { - page = alloc_page(GFP_HIGHUSER); - if (!page) { - err = -ENOMEM; - mlx5_core_warn(dev, "failed to allocate page\n"); - goto out_alloc; - } - addr = dma_map_page(&dev->pdev->dev, page, 0, - PAGE_SIZE, DMA_BIDIRECTIONAL); - if (dma_mapping_error(&dev->pdev->dev, addr)) { - mlx5_core_warn(dev, "failed dma mapping page\n"); - __free_page(page); - err = -ENOMEM; - goto out_alloc; - } - err = insert_page(dev, addr, page, func_id); +retry: + err = alloc_4k(dev, &addr); if (err) { - mlx5_core_err(dev, "failed to track allocated page\n"); - dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL); - __free_page(page); - err = -ENOMEM; - goto out_alloc; + if (err == -ENOMEM) + err = alloc_system_page(dev, func_id); + if (err) + goto out_4k; + + goto retry; } in->pas[i] = cpu_to_be64(addr); } @@ -227,7 +310,6 @@ static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, in->func_id = cpu_to_be16(func_id); in->num_entries = cpu_to_be32(npages); err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); - mlx5_core_dbg(dev, "err %d\n", err); if (err) { mlx5_core_warn(dev, "func_id 0x%x, npages %d, err %d\n", func_id, npages, err); goto out_alloc; @@ -251,7 +333,7 @@ out_alloc: nin = kzalloc(sizeof(*nin), GFP_KERNEL); if (!nin) { mlx5_core_warn(dev, "allocation failed\n"); - goto unmap; + goto out_4k; } memset(&out, 0, sizeof(out)); nin->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES); @@ -261,19 +343,9 @@ out_alloc: kfree(nin); } -unmap: - for (i--; i >= 0; i--) { - addr = be64_to_cpu(in->pas[i]); - page = remove_page(dev, addr); - if (!page) { - mlx5_core_err(dev, "BUG: can't remove page at addr 0x%llx\n", - addr); - continue; - } - dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL); - __free_page(page); - } - +out_4k: + for (i--; i >= 0; i--) + free_4k(dev, be64_to_cpu(in->pas[i])); out_free: mlx5_vfree(in); return err; @@ -284,7 +356,6 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages, { struct mlx5_manage_pages_inbox in; struct mlx5_manage_pages_outbox *out; - struct page *page; int num_claimed; int outlen; u64 addr; @@ -323,13 +394,7 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages, for (i = 0; i < num_claimed; i++) { addr = be64_to_cpu(out->pas[i]); - page = remove_page(dev, addr); - if (!page) { - mlx5_core_warn(dev, "FW reported unknown DMA address 0x%llx\n", addr); - } else { - dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL); - __free_page(page); - } + free_4k(dev, addr); } out_free: @@ -435,6 +500,7 @@ int 
mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev) void mlx5_pagealloc_init(struct mlx5_core_dev *dev) { dev->priv.page_root = RB_ROOT; + INIT_LIST_HEAD(&dev->priv.free_list); } void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 513619a75695..554548cd3dd4 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -483,6 +483,7 @@ struct mlx5_priv { struct rb_root page_root; int fw_pages; int reg_pages; + struct list_head free_list; struct mlx5_core_health health; -- cgit v1.2.3 From 87b8de492da34942fc554f2958a570ce0642296a Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 23 Oct 2013 09:53:20 +0300 Subject: mlx5: Clear reserved area in set_hca_cap() Firmware spec requires reserved fields to be cleared when calling set_hca_cap. Current code queries and copy to the set area, possibly resulting in reserved bits not cleared. This patch copies only writable fields to the set area. Fix also typo - msx => max Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 35 +++++++++++++++++++++++--- include/linux/mlx5/device.h | 9 +++++-- 2 files changed, 39 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index bc0f5fb66e24..40a9f5ed814d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -159,6 +159,36 @@ struct mlx5_reg_host_endianess { u8 rsvd[15]; }; + +#define CAP_MASK(pos, size) ((u64)((1 << (size)) - 1) << (pos)) + +enum { + MLX5_CAP_BITS_RW_MASK = CAP_MASK(MLX5_CAP_OFF_CMDIF_CSUM, 2) | + CAP_MASK(MLX5_CAP_OFF_DCT, 1), +}; + +/* selectively copy writable fields clearing any reserved area + */ +static void copy_rw_fields(struct mlx5_hca_cap *to, struct mlx5_hca_cap *from) +{ + u64 v64; + + to->log_max_qp = from->log_max_qp & 0x1f; + to->log_max_ra_req_dc = from->log_max_ra_req_dc & 0x3f; + to->log_max_ra_res_dc = from->log_max_ra_res_dc & 0x3f; + to->log_max_ra_req_qp = from->log_max_ra_req_qp & 0x3f; + to->log_max_ra_res_qp = from->log_max_ra_res_qp & 0x3f; + to->log_max_atomic_size_qp = from->log_max_atomic_size_qp; + to->log_max_atomic_size_dc = from->log_max_atomic_size_dc; + v64 = be64_to_cpu(from->flags) & MLX5_CAP_BITS_RW_MASK; + to->flags = cpu_to_be64(v64); +} + +enum { + HCA_CAP_OPMOD_GET_MAX = 0, + HCA_CAP_OPMOD_GET_CUR = 1, +}; + static int handle_hca_cap(struct mlx5_core_dev *dev) { struct mlx5_cmd_query_hca_cap_mbox_out *query_out = NULL; @@ -180,7 +210,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev) } query_ctx.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_HCA_CAP); - query_ctx.hdr.opmod = cpu_to_be16(0x1); + query_ctx.hdr.opmod = cpu_to_be16(HCA_CAP_OPMOD_GET_CUR); err = mlx5_cmd_exec(dev, &query_ctx, sizeof(query_ctx), query_out, sizeof(*query_out)); if (err) @@ -192,8 +222,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev) goto query_ex; } - memcpy(&set_ctx->hca_cap, &query_out->hca_cap, - sizeof(set_ctx->hca_cap)); + copy_rw_fields(&set_ctx->hca_cap, &query_out->hca_cap); if (dev->profile->mask & MLX5_PROF_MASK_QP_SIZE) set_ctx->hca_cap.log_max_qp = dev->profile->log_max_qp; diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 5eb4e31af22b..3d789f43e34b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -230,6 +230,11 @@ enum { MLX5_MAX_PAGE_SHIFT = 31 }; +enum { + MLX5_CAP_OFF_DCT = 
41, + MLX5_CAP_OFF_CMDIF_CSUM = 46, +}; + struct mlx5_inbox_hdr { __be16 opcode; u8 rsvd[4]; @@ -319,9 +324,9 @@ struct mlx5_hca_cap { u8 rsvd25[42]; __be16 log_uar_page_sz; u8 rsvd26[28]; - u8 log_msx_atomic_size_qp; + u8 log_max_atomic_size_qp; u8 rsvd27[2]; - u8 log_msx_atomic_size_dc; + u8 log_max_atomic_size_dc; u8 rsvd28[76]; }; -- cgit v1.2.3 From 1b77d2bd753d119eedcbc08fda58934307676554 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 24 Oct 2013 12:01:03 +0300 Subject: mlx5: Use enum to indicate adapter page size The Connect-IB adapter has an inherent page size which equals 4K. Define a new enum that equals the page shift and use it instead of using the value 12 throughout the code. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/cq.c | 2 +- drivers/infiniband/hw/mlx5/qp.c | 5 +++-- drivers/infiniband/hw/mlx5/srq.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 2 +- include/linux/mlx5/device.h | 4 ++++ 5 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index eecac7b52bcb..28344773f640 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -620,7 +620,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, } mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas); - (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - 12; + (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; *index = dev->mdev.priv.uuari.uars[0].index; return 0; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 5bcf57943b84..7c6b4ba49bec 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -551,7 +551,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, } mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); (*in)->ctx.log_pg_sz_remote_qpn = - cpu_to_be32((page_shift - 12) << 24); + cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); (*in)->ctx.params2 = cpu_to_be32(offset << 6); (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); @@ -648,7 +648,8 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, goto err_buf; } (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); - (*in)->ctx.log_pg_sz_remote_qpn = cpu_to_be32((qp->buf.page_shift - 12) << 24); + (*in)->ctx.log_pg_sz_remote_qpn = + cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); /* Set "fast registration enabled" for all kernel QPs */ (*in)->ctx.params1 |= cpu_to_be32(1 << 11); (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index dbc2c7188c5e..210b3eaf188a 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -123,7 +123,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, goto err_in; } - (*in)->ctx.log_pg_sz = page_shift - 12; + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); return 0; @@ -192,7 +192,7 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, } srq->wq_sig = !!srq_signature; - (*in)->ctx.log_pg_sz = page_shift - 12; + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 6b4b436840bd..64a61b286b2c 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -354,7 +354,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_EQ); in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(eq->nent) << 24 | uar->index); in->ctx.intr = vecidx; - in->ctx.log_page_size = eq->buf.page_shift - 12; + in->ctx.log_page_size = eq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; in->events_mask = cpu_to_be64(mask); err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 3d789f43e34b..da78875807fc 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -230,6 +230,10 @@ enum { MLX5_MAX_PAGE_SHIFT = 31 }; +enum { + MLX5_ADAPTER_PAGE_SHIFT = 12 +}; + enum { MLX5_CAP_OFF_DCT = 41, MLX5_CAP_OFF_CMDIF_CSUM = 46, -- cgit v1.2.3 From 9dd69a600a680fab1c9235a644c886d8d6a2da2a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 26 Oct 2013 14:32:30 +0200 Subject: IB/srp: Keep rport as long as the IB transport layer Keep the rport data structure around after srp_remove_host() has finished until cleanup of the IB transport layer has finished completely. This is necessary because later patches use the rport pointer inside the queuecommand callback. Without this patch accessing the rport from inside a queuecommand callback is racy because srp_remove_host() must be invoked before scsi_remove_host() and because the queuecommand callback could get invoked after srp_remove_host() has finished. In other words, without this patch the queuecommand callback can get invoked after the rport data structure has been freed. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 3 +++ drivers/infiniband/ulp/srp/ib_srp.h | 1 + drivers/scsi/scsi_transport_srp.c | 18 ++++++++++++++++++ include/scsi/scsi_transport_srp.h | 2 ++ 4 files changed, 24 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 11ebf3e7646a..6edab7855f5e 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -528,11 +528,13 @@ static void srp_remove_target(struct srp_target_port *target) WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); srp_del_scsi_host_attr(target->scsi_host); + srp_rport_get(target->rport); srp_remove_host(target->scsi_host); scsi_remove_host(target->scsi_host); srp_disconnect_target(target); ib_destroy_cm_id(target->cm_id); srp_free_target_ib(target); + srp_rport_put(target->rport); srp_free_req_data(target); scsi_host_put(target->scsi_host); } @@ -2004,6 +2006,7 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target) } rport->lld_data = target; + target->rport = rport; spin_lock(&host->target_lock); list_add_tail(&target->list, &host->target_list); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 84d821b24ec7..2a1768fcc57d 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -153,6 +153,7 @@ struct srp_target_port { u16 io_class; struct srp_host *srp_host; struct Scsi_Host *scsi_host; + struct srp_rport *rport; char target_name[32]; unsigned int scsi_id; unsigned int sg_tablesize; diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c index f379c7f3034c..f7ba94aa52cb 100644 --- a/drivers/scsi/scsi_transport_srp.c 
+++ b/drivers/scsi/scsi_transport_srp.c @@ -184,6 +184,24 @@ static int srp_host_match(struct attribute_container *cont, struct device *dev) return &i->t.host_attrs.ac == cont; } +/** + * srp_rport_get() - increment rport reference count + */ +void srp_rport_get(struct srp_rport *rport) +{ + get_device(&rport->dev); +} +EXPORT_SYMBOL(srp_rport_get); + +/** + * srp_rport_put() - decrement rport reference count + */ +void srp_rport_put(struct srp_rport *rport) +{ + put_device(&rport->dev); +} +EXPORT_SYMBOL(srp_rport_put); + /** * srp_rport_add - add a SRP remote port to the device hierarchy * @shost: scsi host the remote port is connected to. diff --git a/include/scsi/scsi_transport_srp.h b/include/scsi/scsi_transport_srp.h index ff0f04ac91aa..5a2d2d1081c1 100644 --- a/include/scsi/scsi_transport_srp.h +++ b/include/scsi/scsi_transport_srp.h @@ -38,6 +38,8 @@ extern struct scsi_transport_template * srp_attach_transport(struct srp_function_template *); extern void srp_release_transport(struct scsi_transport_template *); +extern void srp_rport_get(struct srp_rport *rport); +extern void srp_rport_put(struct srp_rport *rport); extern struct srp_rport *srp_rport_add(struct Scsi_Host *, struct srp_rport_identifiers *); extern void srp_rport_del(struct srp_rport *); -- cgit v1.2.3 From 29c17324803c8a3bb5b2b69309e43571164cc4de Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 26 Oct 2013 14:33:30 +0200 Subject: scsi_transport_srp: Add transport layer error handling Add the necessary functions in the SRP transport module to allow an SRP initiator driver to implement transport layer error handling similar to the functionality already provided by the FC transport layer. This includes: - Support for implementing fast_io_fail_tmo, the time that should elapse after having detected a transport layer problem and before failing I/O. - Support for implementing dev_loss_tmo, the time that should elapse after having detected a transport layer problem and before removing a remote port. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- Documentation/ABI/stable/sysfs-transport-srp | 31 ++ drivers/scsi/scsi_transport_srp.c | 430 ++++++++++++++++++++++++++- include/scsi/scsi_transport_srp.h | 74 ++++- 3 files changed, 532 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/Documentation/ABI/stable/sysfs-transport-srp b/Documentation/ABI/stable/sysfs-transport-srp index b36fb0dc13c8..8b6acc775b71 100644 --- a/Documentation/ABI/stable/sysfs-transport-srp +++ b/Documentation/ABI/stable/sysfs-transport-srp @@ -5,6 +5,24 @@ Contact: linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org Description: Instructs an SRP initiator to disconnect from a target and to remove all LUNs imported from that target. +What: /sys/class/srp_remote_ports/port-:/dev_loss_tmo +Date: February 1, 2014 +KernelVersion: 3.13 +Contact: linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org +Description: Number of seconds the SCSI layer will wait after a transport + layer error has been observed before removing a target port. + Zero means immediate removal. Setting this attribute to "off" + will disable the dev_loss timer. + +What: /sys/class/srp_remote_ports/port-:/fast_io_fail_tmo +Date: February 1, 2014 +KernelVersion: 3.13 +Contact: linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org +Description: Number of seconds the SCSI layer will wait after a transport + layer error has been observed before failing I/O. Zero means + failing I/O immediately. 
Setting this attribute to "off" will + disable the fast_io_fail timer. + What: /sys/class/srp_remote_ports/port-:/port_id Date: June 27, 2007 KernelVersion: 2.6.24 @@ -17,3 +35,16 @@ Date: June 27, 2007 KernelVersion: 2.6.24 Contact: linux-scsi@vger.kernel.org Description: Role of the remote port. Either "SRP Initiator" or "SRP Target". + +What: /sys/class/srp_remote_ports/port-:/state +Date: February 1, 2014 +KernelVersion: 3.13 +Contact: linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org +Description: State of the transport layer used for communication with the + remote port. "running" if the transport layer is operational; + "blocked" if a transport layer error has been encountered but + the fast_io_fail_tmo timer has not yet fired; "fail-fast" + after the fast_io_fail_tmo timer has fired and before the + "dev_loss_tmo" timer has fired; "lost" after the + "dev_loss_tmo" timer has fired and before the port is finally + removed. diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c index f7ba94aa52cb..2696e26b3423 100644 --- a/drivers/scsi/scsi_transport_srp.c +++ b/drivers/scsi/scsi_transport_srp.c @@ -24,12 +24,15 @@ #include #include #include +#include #include +#include #include #include #include #include +#include "scsi_priv.h" #include "scsi_transport_srp_internal.h" struct srp_host_attrs { @@ -38,7 +41,7 @@ struct srp_host_attrs { #define to_srp_host_attrs(host) ((struct srp_host_attrs *)(host)->shost_data) #define SRP_HOST_ATTRS 0 -#define SRP_RPORT_ATTRS 3 +#define SRP_RPORT_ATTRS 6 struct srp_internal { struct scsi_transport_template t; @@ -54,6 +57,34 @@ struct srp_internal { #define dev_to_rport(d) container_of(d, struct srp_rport, dev) #define transport_class_to_srp_rport(dev) dev_to_rport((dev)->parent) +static inline struct Scsi_Host *rport_to_shost(struct srp_rport *r) +{ + return dev_to_shost(r->dev.parent); +} + +/** + * srp_tmo_valid() - check timeout combination validity + * + * The combination of the timeout parameters must be such that SCSI commands + * are finished in a reasonable time. Hence do not allow the fast I/O fail + * timeout to exceed SCSI_DEVICE_BLOCK_MAX_TIMEOUT. Furthermore, these + * parameters must be such that multipath can detect failed paths timely. + * Hence do not allow both parameters to be disabled simultaneously. + */ +int srp_tmo_valid(int fast_io_fail_tmo, int dev_loss_tmo) +{ + if (fast_io_fail_tmo < 0 && dev_loss_tmo < 0) + return -EINVAL; + if (fast_io_fail_tmo > SCSI_DEVICE_BLOCK_MAX_TIMEOUT) + return -EINVAL; + if (dev_loss_tmo >= LONG_MAX / HZ) + return -EINVAL; + if (fast_io_fail_tmo >= 0 && dev_loss_tmo >= 0 && + fast_io_fail_tmo >= dev_loss_tmo) + return -EINVAL; + return 0; +} +EXPORT_SYMBOL_GPL(srp_tmo_valid); static int srp_host_setup(struct transport_container *tc, struct device *dev, struct device *cdev) @@ -134,10 +165,383 @@ static ssize_t store_srp_rport_delete(struct device *dev, static DEVICE_ATTR(delete, S_IWUSR, NULL, store_srp_rport_delete); +static ssize_t show_srp_rport_state(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + static const char *const state_name[] = { + [SRP_RPORT_RUNNING] = "running", + [SRP_RPORT_BLOCKED] = "blocked", + [SRP_RPORT_FAIL_FAST] = "fail-fast", + [SRP_RPORT_LOST] = "lost", + }; + struct srp_rport *rport = transport_class_to_srp_rport(dev); + enum srp_rport_state state = rport->state; + + return sprintf(buf, "%s\n", + (unsigned)state < ARRAY_SIZE(state_name) ? 
+ state_name[state] : "???"); +} + +static DEVICE_ATTR(state, S_IRUGO, show_srp_rport_state, NULL); + +static ssize_t srp_show_tmo(char *buf, int tmo) +{ + return tmo >= 0 ? sprintf(buf, "%d\n", tmo) : sprintf(buf, "off\n"); +} + +static int srp_parse_tmo(int *tmo, const char *buf) +{ + int res = 0; + + if (strncmp(buf, "off", 3) != 0) + res = kstrtoint(buf, 0, tmo); + else + *tmo = -1; + + return res; +} + +static ssize_t show_srp_rport_fast_io_fail_tmo(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return srp_show_tmo(buf, rport->fast_io_fail_tmo); +} + +static ssize_t store_srp_rport_fast_io_fail_tmo(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + int res; + int fast_io_fail_tmo; + + res = srp_parse_tmo(&fast_io_fail_tmo, buf); + if (res) + goto out; + res = srp_tmo_valid(fast_io_fail_tmo, rport->dev_loss_tmo); + if (res) + goto out; + rport->fast_io_fail_tmo = fast_io_fail_tmo; + res = count; + +out: + return res; +} + +static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR, + show_srp_rport_fast_io_fail_tmo, + store_srp_rport_fast_io_fail_tmo); + +static ssize_t show_srp_rport_dev_loss_tmo(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return srp_show_tmo(buf, rport->dev_loss_tmo); +} + +static ssize_t store_srp_rport_dev_loss_tmo(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + int res; + int dev_loss_tmo; + + res = srp_parse_tmo(&dev_loss_tmo, buf); + if (res) + goto out; + res = srp_tmo_valid(rport->fast_io_fail_tmo, dev_loss_tmo); + if (res) + goto out; + rport->dev_loss_tmo = dev_loss_tmo; + res = count; + +out: + return res; +} + +static DEVICE_ATTR(dev_loss_tmo, S_IRUGO | S_IWUSR, + show_srp_rport_dev_loss_tmo, + store_srp_rport_dev_loss_tmo); + +static int srp_rport_set_state(struct srp_rport *rport, + enum srp_rport_state new_state) +{ + enum srp_rport_state old_state = rport->state; + + lockdep_assert_held(&rport->mutex); + + switch (new_state) { + case SRP_RPORT_RUNNING: + switch (old_state) { + case SRP_RPORT_LOST: + goto invalid; + default: + break; + } + break; + case SRP_RPORT_BLOCKED: + switch (old_state) { + case SRP_RPORT_RUNNING: + break; + default: + goto invalid; + } + break; + case SRP_RPORT_FAIL_FAST: + switch (old_state) { + case SRP_RPORT_LOST: + goto invalid; + default: + break; + } + break; + case SRP_RPORT_LOST: + break; + } + rport->state = new_state; + return 0; + +invalid: + return -EINVAL; +} + +static void __rport_fail_io_fast(struct srp_rport *rport) +{ + struct Scsi_Host *shost = rport_to_shost(rport); + struct srp_internal *i; + + lockdep_assert_held(&rport->mutex); + + if (srp_rport_set_state(rport, SRP_RPORT_FAIL_FAST)) + return; + scsi_target_unblock(rport->dev.parent, SDEV_TRANSPORT_OFFLINE); + + /* Involve the LLD if possible to terminate all I/O on the rport. 
*/ + i = to_srp_internal(shost->transportt); + if (i->f->terminate_rport_io) + i->f->terminate_rport_io(rport); +} + +/** + * rport_fast_io_fail_timedout() - fast I/O failure timeout handler + */ +static void rport_fast_io_fail_timedout(struct work_struct *work) +{ + struct srp_rport *rport = container_of(to_delayed_work(work), + struct srp_rport, fast_io_fail_work); + struct Scsi_Host *shost = rport_to_shost(rport); + + pr_info("fast_io_fail_tmo expired for SRP %s / %s.\n", + dev_name(&rport->dev), dev_name(&shost->shost_gendev)); + + mutex_lock(&rport->mutex); + if (rport->state == SRP_RPORT_BLOCKED) + __rport_fail_io_fast(rport); + mutex_unlock(&rport->mutex); +} + +/** + * rport_dev_loss_timedout() - device loss timeout handler + */ +static void rport_dev_loss_timedout(struct work_struct *work) +{ + struct srp_rport *rport = container_of(to_delayed_work(work), + struct srp_rport, dev_loss_work); + struct Scsi_Host *shost = rport_to_shost(rport); + struct srp_internal *i = to_srp_internal(shost->transportt); + + pr_info("dev_loss_tmo expired for SRP %s / %s.\n", + dev_name(&rport->dev), dev_name(&shost->shost_gendev)); + + mutex_lock(&rport->mutex); + WARN_ON(srp_rport_set_state(rport, SRP_RPORT_LOST) != 0); + scsi_target_unblock(rport->dev.parent, SDEV_TRANSPORT_OFFLINE); + mutex_unlock(&rport->mutex); + + i->f->rport_delete(rport); +} + +static void __srp_start_tl_fail_timers(struct srp_rport *rport) +{ + struct Scsi_Host *shost = rport_to_shost(rport); + int fast_io_fail_tmo, dev_loss_tmo; + + lockdep_assert_held(&rport->mutex); + + if (!rport->deleted) { + fast_io_fail_tmo = rport->fast_io_fail_tmo; + dev_loss_tmo = rport->dev_loss_tmo; + pr_debug("%s current state: %d\n", + dev_name(&shost->shost_gendev), rport->state); + + if (fast_io_fail_tmo >= 0 && + srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) { + pr_debug("%s new state: %d\n", + dev_name(&shost->shost_gendev), + rport->state); + scsi_target_block(&shost->shost_gendev); + queue_delayed_work(system_long_wq, + &rport->fast_io_fail_work, + 1UL * fast_io_fail_tmo * HZ); + } + if (dev_loss_tmo >= 0) + queue_delayed_work(system_long_wq, + &rport->dev_loss_work, + 1UL * dev_loss_tmo * HZ); + } else { + pr_debug("%s has already been deleted\n", + dev_name(&shost->shost_gendev)); + srp_rport_set_state(rport, SRP_RPORT_FAIL_FAST); + scsi_target_unblock(&shost->shost_gendev, + SDEV_TRANSPORT_OFFLINE); + } +} + +/** + * srp_start_tl_fail_timers() - start the transport layer failure timers + * + * Start the transport layer fast I/O failure and device loss timers. Do not + * modify a timer that was already started. + */ +void srp_start_tl_fail_timers(struct srp_rport *rport) +{ + mutex_lock(&rport->mutex); + __srp_start_tl_fail_timers(rport); + mutex_unlock(&rport->mutex); +} +EXPORT_SYMBOL(srp_start_tl_fail_timers); + +/** + * scsi_request_fn_active() - number of kernel threads inside scsi_request_fn() + */ +static int scsi_request_fn_active(struct Scsi_Host *shost) +{ + struct scsi_device *sdev; + struct request_queue *q; + int request_fn_active = 0; + + shost_for_each_device(sdev, shost) { + q = sdev->request_queue; + + spin_lock_irq(q->queue_lock); + request_fn_active += q->request_fn_active; + spin_unlock_irq(q->queue_lock); + } + + return request_fn_active; +} + +/** + * srp_reconnect_rport() - reconnect to an SRP target port + * + * Blocks SCSI command queueing before invoking reconnect() such that + * queuecommand() won't be invoked concurrently with reconnect() from outside + * the SCSI EH. 
This is important since a reconnect() implementation may + * reallocate resources needed by queuecommand(). + * + * Notes: + * - This function neither waits until outstanding requests have finished nor + * tries to abort these. It is the responsibility of the reconnect() + * function to finish outstanding commands before reconnecting to the target + * port. + * - It is the responsibility of the caller to ensure that the resources + * reallocated by the reconnect() function won't be used while this function + * is in progress. One possible strategy is to invoke this function from + * the context of the SCSI EH thread only. Another possible strategy is to + * lock the rport mutex inside each SCSI LLD callback that can be invoked by + * the SCSI EH (the scsi_host_template.eh_*() functions and also the + * scsi_host_template.queuecommand() function). + */ +int srp_reconnect_rport(struct srp_rport *rport) +{ + struct Scsi_Host *shost = rport_to_shost(rport); + struct srp_internal *i = to_srp_internal(shost->transportt); + struct scsi_device *sdev; + int res; + + pr_debug("SCSI host %s\n", dev_name(&shost->shost_gendev)); + + res = mutex_lock_interruptible(&rport->mutex); + if (res) + goto out; + scsi_target_block(&shost->shost_gendev); + while (scsi_request_fn_active(shost)) + msleep(20); + res = i->f->reconnect(rport); + pr_debug("%s (state %d): transport.reconnect() returned %d\n", + dev_name(&shost->shost_gendev), rport->state, res); + if (res == 0) { + cancel_delayed_work(&rport->fast_io_fail_work); + cancel_delayed_work(&rport->dev_loss_work); + + srp_rport_set_state(rport, SRP_RPORT_RUNNING); + scsi_target_unblock(&shost->shost_gendev, SDEV_RUNNING); + /* + * If the SCSI error handler has offlined one or more devices, + * invoking scsi_target_unblock() won't change the state of + * these devices into running so do that explicitly. + */ + spin_lock_irq(shost->host_lock); + __shost_for_each_device(sdev, shost) + if (sdev->sdev_state == SDEV_OFFLINE) + sdev->sdev_state = SDEV_RUNNING; + spin_unlock_irq(shost->host_lock); + } else if (rport->state == SRP_RPORT_RUNNING) { + /* + * srp_reconnect_rport() was invoked with fast_io_fail + * off. Mark the port as failed and start the TL failure + * timers if these had not yet been started. + */ + __rport_fail_io_fast(rport); + scsi_target_unblock(&shost->shost_gendev, + SDEV_TRANSPORT_OFFLINE); + __srp_start_tl_fail_timers(rport); + } else if (rport->state != SRP_RPORT_BLOCKED) { + scsi_target_unblock(&shost->shost_gendev, + SDEV_TRANSPORT_OFFLINE); + } + mutex_unlock(&rport->mutex); + +out: + return res; +} +EXPORT_SYMBOL(srp_reconnect_rport); + +/** + * srp_timed_out() - SRP transport intercept of the SCSI timeout EH + * + * If a timeout occurs while an rport is in the blocked state, ask the SCSI + * EH to continue waiting (BLK_EH_RESET_TIMER). Otherwise let the SCSI core + * handle the timeout (BLK_EH_NOT_HANDLED). + * + * Note: This function is called from soft-IRQ context and with the request + * queue lock held. + */ +static enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd) +{ + struct scsi_device *sdev = scmd->device; + struct Scsi_Host *shost = sdev->host; + struct srp_internal *i = to_srp_internal(shost->transportt); + + pr_debug("timeout for sdev %s\n", dev_name(&sdev->sdev_gendev)); + return i->f->reset_timer_if_blocked && scsi_device_blocked(sdev) ? 
+ BLK_EH_RESET_TIMER : BLK_EH_NOT_HANDLED; +} + static void srp_rport_release(struct device *dev) { struct srp_rport *rport = dev_to_rport(dev); + cancel_delayed_work_sync(&rport->fast_io_fail_work); + cancel_delayed_work_sync(&rport->dev_loss_work); + put_device(dev->parent); kfree(rport); } @@ -214,12 +618,15 @@ struct srp_rport *srp_rport_add(struct Scsi_Host *shost, { struct srp_rport *rport; struct device *parent = &shost->shost_gendev; + struct srp_internal *i = to_srp_internal(shost->transportt); int id, ret; rport = kzalloc(sizeof(*rport), GFP_KERNEL); if (!rport) return ERR_PTR(-ENOMEM); + mutex_init(&rport->mutex); + device_initialize(&rport->dev); rport->dev.parent = get_device(parent); @@ -228,6 +635,13 @@ struct srp_rport *srp_rport_add(struct Scsi_Host *shost, memcpy(rport->port_id, ids->port_id, sizeof(rport->port_id)); rport->roles = ids->roles; + rport->fast_io_fail_tmo = i->f->fast_io_fail_tmo ? + *i->f->fast_io_fail_tmo : 15; + rport->dev_loss_tmo = i->f->dev_loss_tmo ? *i->f->dev_loss_tmo : 60; + INIT_DELAYED_WORK(&rport->fast_io_fail_work, + rport_fast_io_fail_timedout); + INIT_DELAYED_WORK(&rport->dev_loss_work, rport_dev_loss_timedout); + id = atomic_inc_return(&to_srp_host_attrs(shost)->next_port_id); dev_set_name(&rport->dev, "port-%d:%d", shost->host_no, id); @@ -277,6 +691,13 @@ void srp_rport_del(struct srp_rport *rport) transport_remove_device(dev); device_del(dev); transport_destroy_device(dev); + + mutex_lock(&rport->mutex); + if (rport->state == SRP_RPORT_BLOCKED) + __rport_fail_io_fast(rport); + rport->deleted = true; + mutex_unlock(&rport->mutex); + put_device(dev); } EXPORT_SYMBOL_GPL(srp_rport_del); @@ -328,6 +749,8 @@ srp_attach_transport(struct srp_function_template *ft) if (!i) return NULL; + i->t.eh_timed_out = srp_timed_out; + i->t.tsk_mgmt_response = srp_tsk_mgmt_response; i->t.it_nexus_response = srp_it_nexus_response; @@ -345,6 +768,11 @@ srp_attach_transport(struct srp_function_template *ft) count = 0; i->rport_attrs[count++] = &dev_attr_port_id; i->rport_attrs[count++] = &dev_attr_roles; + if (ft->has_rport_state) { + i->rport_attrs[count++] = &dev_attr_state; + i->rport_attrs[count++] = &dev_attr_fast_io_fail_tmo; + i->rport_attrs[count++] = &dev_attr_dev_loss_tmo; + } if (ft->rport_delete) i->rport_attrs[count++] = &dev_attr_delete; i->rport_attrs[count++] = NULL; diff --git a/include/scsi/scsi_transport_srp.h b/include/scsi/scsi_transport_srp.h index 5a2d2d1081c1..ee7001677f64 100644 --- a/include/scsi/scsi_transport_srp.h +++ b/include/scsi/scsi_transport_srp.h @@ -13,6 +13,26 @@ struct srp_rport_identifiers { u8 roles; }; +/** + * enum srp_rport_state - SRP transport layer state + * @SRP_RPORT_RUNNING: Transport layer operational. + * @SRP_RPORT_BLOCKED: Transport layer not operational; fast I/O fail timer + * is running and I/O has been blocked. + * @SRP_RPORT_FAIL_FAST: Fast I/O fail timer has expired; fail I/O fast. + * @SRP_RPORT_LOST: Device loss timer has expired; port is being removed. + */ +enum srp_rport_state { + SRP_RPORT_RUNNING, + SRP_RPORT_BLOCKED, + SRP_RPORT_FAIL_FAST, + SRP_RPORT_LOST, +}; + +/** + * struct srp_rport + * @lld_data: LLD private data. + * @mutex: Protects against concurrent rport fast_io_fail / dev_loss_tmo. 
+ */ struct srp_rport { /* for initiator and target drivers */ @@ -23,11 +43,38 @@ struct srp_rport { /* for initiator drivers */ - void *lld_data; /* LLD private data */ + void *lld_data; + + struct mutex mutex; + enum srp_rport_state state; + bool deleted; + int fast_io_fail_tmo; + int dev_loss_tmo; + struct delayed_work fast_io_fail_work; + struct delayed_work dev_loss_work; }; +/** + * struct srp_function_template + * @has_rport_state: Whether or not to create the state, fast_io_fail_tmo and + * dev_loss_tmo sysfs attribute for an rport. + * @reset_timer_if_blocked: Whether or srp_timed_out() should reset the command + * timer if the device on which it has been queued is blocked. + * @fast_io_fail_tmo: If not NULL, points to the default fast_io_fail_tmo value. + * @dev_loss_tmo: If not NULL, points to the default dev_loss_tmo value. + * @reconnect: Callback function for reconnecting to the target. See also + * srp_reconnect_rport(). + * @terminate_rport_io: Callback function for terminating all outstanding I/O + * requests for an rport. + */ struct srp_function_template { /* for initiator drivers */ + bool has_rport_state; + bool reset_timer_if_blocked; + int *fast_io_fail_tmo; + int *dev_loss_tmo; + int (*reconnect)(struct srp_rport *rport); + void (*terminate_rport_io)(struct srp_rport *rport); void (*rport_delete)(struct srp_rport *rport); /* for target drivers */ int (* tsk_mgmt_response)(struct Scsi_Host *, u64, u64, int); @@ -43,7 +90,30 @@ extern void srp_rport_put(struct srp_rport *rport); extern struct srp_rport *srp_rport_add(struct Scsi_Host *, struct srp_rport_identifiers *); extern void srp_rport_del(struct srp_rport *); - +extern int srp_tmo_valid(int fast_io_fail_tmo, int dev_loss_tmo); +extern int srp_reconnect_rport(struct srp_rport *rport); +extern void srp_start_tl_fail_timers(struct srp_rport *rport); extern void srp_remove_host(struct Scsi_Host *); +/** + * srp_chkready() - evaluate the transport layer state before I/O + * + * Returns a SCSI result code that can be returned by the LLD queuecommand() + * implementation. The role of this function is similar to that of + * fc_remote_port_chkready(). + */ +static inline int srp_chkready(struct srp_rport *rport) +{ + switch (rport->state) { + case SRP_RPORT_RUNNING: + case SRP_RPORT_BLOCKED: + default: + return 0; + case SRP_RPORT_FAIL_FAST: + return DID_TRANSPORT_FAILFAST << 16; + case SRP_RPORT_LOST: + return DID_NO_CONNECT << 16; + } +} + #endif -- cgit v1.2.3 From 8c64e4531c3c3bedf11d723196270d4a7553db45 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 26 Oct 2013 14:35:59 +0200 Subject: scsi_transport_srp: Add periodic reconnect support Add support for periodically reconnecting to an SRP target until the dev_loss timer expires. After the tenth reconnection attempt, gradually slow down subsequent reconnect attempts. 
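To make the back-off concrete, here is a small standalone program (not part of the patch) that evaluates the same scaling the patch applies in srp_reconnect_work(): the delay before the next attempt is reconnect_delay * min(100, max(1, failed_reconnects - 10)), so the early retries use the base delay and later ones are stretched out, capped at 100 times the base. The 10-second base used here is only the patch's illustrative default; the actual value is per-rport and configurable via the reconnect_delay sysfs attribute.

#include <stdio.h>

static int min_int(int a, int b) { return a < b ? a : b; }
static int max_int(int a, int b) { return a > b ? a : b; }

int main(void)
{
	int reconnect_delay = 10;  /* seconds; default taken from the patch */
	int failed_reconnects;

	for (failed_reconnects = 1; failed_reconnects <= 121; failed_reconnects += 10) {
		/* same scaling as srp_reconnect_work() in this patch */
		int delay = reconnect_delay *
			min_int(100, max_int(1, failed_reconnects - 10));

		printf("after %3d failed attempts: next retry in %d seconds\n",
		       failed_reconnects, delay);
	}
	return 0;
}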
Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- Documentation/ABI/stable/sysfs-transport-srp | 8 ++ drivers/infiniband/ulp/srp/ib_srp.c | 4 +- drivers/scsi/scsi_transport_srp.c | 106 +++++++++++++++++++++++++-- include/scsi/scsi_transport_srp.h | 11 ++- 4 files changed, 118 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/Documentation/ABI/stable/sysfs-transport-srp b/Documentation/ABI/stable/sysfs-transport-srp index 8b6acc775b71..ec7af69fea0a 100644 --- a/Documentation/ABI/stable/sysfs-transport-srp +++ b/Documentation/ABI/stable/sysfs-transport-srp @@ -30,6 +30,14 @@ Contact: linux-scsi@vger.kernel.org Description: 16-byte local SRP port identifier in hexadecimal format. An example: 4c:49:4e:55:58:20:56:49:4f:00:00:00:00:00:00:00. +What: /sys/class/srp_remote_ports/port-:/reconnect_delay +Date: February 1, 2014 +KernelVersion: 3.13 +Contact: linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org +Description: Number of seconds the SCSI layer will wait after a reconnect + attempt failed before retrying. Setting this attribute to + "off" will disable time-based reconnecting. + What: /sys/class/srp_remote_ports/port-:/roles Date: June 27, 2007 KernelVersion: 2.6.24 diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 076a3ab7f779..99c893d1c2ac 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -145,9 +145,9 @@ static int srp_tmo_set(const char *val, const struct kernel_param *kp) tmo = -1; } if (kp->arg == &srp_fast_io_fail_tmo) - res = srp_tmo_valid(tmo, srp_dev_loss_tmo); + res = srp_tmo_valid(-1, tmo, srp_dev_loss_tmo); else - res = srp_tmo_valid(srp_fast_io_fail_tmo, tmo); + res = srp_tmo_valid(-1, srp_fast_io_fail_tmo, tmo); if (res) goto out; *(int *)kp->arg = tmo; diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c index 2696e26b3423..2700a5a09bd4 100644 --- a/drivers/scsi/scsi_transport_srp.c +++ b/drivers/scsi/scsi_transport_srp.c @@ -41,7 +41,7 @@ struct srp_host_attrs { #define to_srp_host_attrs(host) ((struct srp_host_attrs *)(host)->shost_data) #define SRP_HOST_ATTRS 0 -#define SRP_RPORT_ATTRS 6 +#define SRP_RPORT_ATTRS 8 struct srp_internal { struct scsi_transport_template t; @@ -69,11 +69,13 @@ static inline struct Scsi_Host *rport_to_shost(struct srp_rport *r) * are finished in a reasonable time. Hence do not allow the fast I/O fail * timeout to exceed SCSI_DEVICE_BLOCK_MAX_TIMEOUT. Furthermore, these * parameters must be such that multipath can detect failed paths timely. - * Hence do not allow both parameters to be disabled simultaneously. + * Hence do not allow all three parameters to be disabled simultaneously. 
*/ -int srp_tmo_valid(int fast_io_fail_tmo, int dev_loss_tmo) +int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo, int dev_loss_tmo) { - if (fast_io_fail_tmo < 0 && dev_loss_tmo < 0) + if (reconnect_delay < 0 && fast_io_fail_tmo < 0 && dev_loss_tmo < 0) + return -EINVAL; + if (reconnect_delay == 0) return -EINVAL; if (fast_io_fail_tmo > SCSI_DEVICE_BLOCK_MAX_TIMEOUT) return -EINVAL; @@ -202,6 +204,56 @@ static int srp_parse_tmo(int *tmo, const char *buf) return res; } +static ssize_t show_reconnect_delay(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return srp_show_tmo(buf, rport->reconnect_delay); +} + +static ssize_t store_reconnect_delay(struct device *dev, + struct device_attribute *attr, + const char *buf, const size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + int res, delay; + + res = srp_parse_tmo(&delay, buf); + if (res) + goto out; + res = srp_tmo_valid(delay, rport->fast_io_fail_tmo, + rport->dev_loss_tmo); + if (res) + goto out; + + if (rport->reconnect_delay <= 0 && delay > 0 && + rport->state != SRP_RPORT_RUNNING) { + queue_delayed_work(system_long_wq, &rport->reconnect_work, + delay * HZ); + } else if (delay <= 0) { + cancel_delayed_work(&rport->reconnect_work); + } + rport->reconnect_delay = delay; + res = count; + +out: + return res; +} + +static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, show_reconnect_delay, + store_reconnect_delay); + +static ssize_t show_failed_reconnects(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return sprintf(buf, "%d\n", rport->failed_reconnects); +} + +static DEVICE_ATTR(failed_reconnects, S_IRUGO, show_failed_reconnects, NULL); + static ssize_t show_srp_rport_fast_io_fail_tmo(struct device *dev, struct device_attribute *attr, char *buf) @@ -222,7 +274,8 @@ static ssize_t store_srp_rport_fast_io_fail_tmo(struct device *dev, res = srp_parse_tmo(&fast_io_fail_tmo, buf); if (res) goto out; - res = srp_tmo_valid(fast_io_fail_tmo, rport->dev_loss_tmo); + res = srp_tmo_valid(rport->reconnect_delay, fast_io_fail_tmo, + rport->dev_loss_tmo); if (res) goto out; rport->fast_io_fail_tmo = fast_io_fail_tmo; @@ -256,7 +309,8 @@ static ssize_t store_srp_rport_dev_loss_tmo(struct device *dev, res = srp_parse_tmo(&dev_loss_tmo, buf); if (res) goto out; - res = srp_tmo_valid(rport->fast_io_fail_tmo, dev_loss_tmo); + res = srp_tmo_valid(rport->reconnect_delay, rport->fast_io_fail_tmo, + dev_loss_tmo); if (res) goto out; rport->dev_loss_tmo = dev_loss_tmo; @@ -312,6 +366,29 @@ invalid: return -EINVAL; } +/** + * srp_reconnect_work() - reconnect and schedule a new attempt if necessary + */ +static void srp_reconnect_work(struct work_struct *work) +{ + struct srp_rport *rport = container_of(to_delayed_work(work), + struct srp_rport, reconnect_work); + struct Scsi_Host *shost = rport_to_shost(rport); + int delay, res; + + res = srp_reconnect_rport(rport); + if (res != 0) { + shost_printk(KERN_ERR, shost, + "reconnect attempt %d failed (%d)\n", + ++rport->failed_reconnects, res); + delay = rport->reconnect_delay * + min(100, max(1, rport->failed_reconnects - 10)); + if (delay > 0) + queue_delayed_work(system_long_wq, + &rport->reconnect_work, delay * HZ); + } +} + static void __rport_fail_io_fast(struct srp_rport *rport) { struct Scsi_Host *shost = rport_to_shost(rport); @@ -371,16 +448,21 @@ static void rport_dev_loss_timedout(struct work_struct 
*work) static void __srp_start_tl_fail_timers(struct srp_rport *rport) { struct Scsi_Host *shost = rport_to_shost(rport); - int fast_io_fail_tmo, dev_loss_tmo; + int delay, fast_io_fail_tmo, dev_loss_tmo; lockdep_assert_held(&rport->mutex); if (!rport->deleted) { + delay = rport->reconnect_delay; fast_io_fail_tmo = rport->fast_io_fail_tmo; dev_loss_tmo = rport->dev_loss_tmo; pr_debug("%s current state: %d\n", dev_name(&shost->shost_gendev), rport->state); + if (delay > 0) + queue_delayed_work(system_long_wq, + &rport->reconnect_work, + 1UL * delay * HZ); if (fast_io_fail_tmo >= 0 && srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) { pr_debug("%s new state: %d\n", @@ -481,6 +563,7 @@ int srp_reconnect_rport(struct srp_rport *rport) cancel_delayed_work(&rport->fast_io_fail_work); cancel_delayed_work(&rport->dev_loss_work); + rport->failed_reconnects = 0; srp_rport_set_state(rport, SRP_RPORT_RUNNING); scsi_target_unblock(&shost->shost_gendev, SDEV_RUNNING); /* @@ -539,6 +622,7 @@ static void srp_rport_release(struct device *dev) { struct srp_rport *rport = dev_to_rport(dev); + cancel_delayed_work_sync(&rport->reconnect_work); cancel_delayed_work_sync(&rport->fast_io_fail_work); cancel_delayed_work_sync(&rport->dev_loss_work); @@ -635,6 +719,10 @@ struct srp_rport *srp_rport_add(struct Scsi_Host *shost, memcpy(rport->port_id, ids->port_id, sizeof(rport->port_id)); rport->roles = ids->roles; + if (i->f->reconnect) + rport->reconnect_delay = i->f->reconnect_delay ? + *i->f->reconnect_delay : 10; + INIT_DELAYED_WORK(&rport->reconnect_work, srp_reconnect_work); rport->fast_io_fail_tmo = i->f->fast_io_fail_tmo ? *i->f->fast_io_fail_tmo : 15; rport->dev_loss_tmo = i->f->dev_loss_tmo ? *i->f->dev_loss_tmo : 60; @@ -773,6 +861,10 @@ srp_attach_transport(struct srp_function_template *ft) i->rport_attrs[count++] = &dev_attr_fast_io_fail_tmo; i->rport_attrs[count++] = &dev_attr_dev_loss_tmo; } + if (ft->reconnect) { + i->rport_attrs[count++] = &dev_attr_reconnect_delay; + i->rport_attrs[count++] = &dev_attr_failed_reconnects; + } if (ft->rport_delete) i->rport_attrs[count++] = &dev_attr_delete; i->rport_attrs[count++] = NULL; diff --git a/include/scsi/scsi_transport_srp.h b/include/scsi/scsi_transport_srp.h index ee7001677f64..4ebf6913b7b2 100644 --- a/include/scsi/scsi_transport_srp.h +++ b/include/scsi/scsi_transport_srp.h @@ -31,7 +31,8 @@ enum srp_rport_state { /** * struct srp_rport * @lld_data: LLD private data. - * @mutex: Protects against concurrent rport fast_io_fail / dev_loss_tmo. + * @mutex: Protects against concurrent rport reconnect / fast_io_fail / + * dev_loss_tmo activity. */ struct srp_rport { /* for initiator and target drivers */ @@ -48,6 +49,9 @@ struct srp_rport { struct mutex mutex; enum srp_rport_state state; bool deleted; + int reconnect_delay; + int failed_reconnects; + struct delayed_work reconnect_work; int fast_io_fail_tmo; int dev_loss_tmo; struct delayed_work fast_io_fail_work; @@ -60,6 +64,7 @@ struct srp_rport { * dev_loss_tmo sysfs attribute for an rport. * @reset_timer_if_blocked: Whether or srp_timed_out() should reset the command * timer if the device on which it has been queued is blocked. + * @reconnect_delay: If not NULL, points to the default reconnect_delay value. * @fast_io_fail_tmo: If not NULL, points to the default fast_io_fail_tmo value. * @dev_loss_tmo: If not NULL, points to the default dev_loss_tmo value. * @reconnect: Callback function for reconnecting to the target. 
See also @@ -71,6 +76,7 @@ struct srp_function_template { /* for initiator drivers */ bool has_rport_state; bool reset_timer_if_blocked; + int *reconnect_delay; int *fast_io_fail_tmo; int *dev_loss_tmo; int (*reconnect)(struct srp_rport *rport); @@ -90,7 +96,8 @@ extern void srp_rport_put(struct srp_rport *rport); extern struct srp_rport *srp_rport_add(struct Scsi_Host *, struct srp_rport_identifiers *); extern void srp_rport_del(struct srp_rport *); -extern int srp_tmo_valid(int fast_io_fail_tmo, int dev_loss_tmo); +extern int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo, + int dev_loss_tmo); extern int srp_reconnect_rport(struct srp_rport *rport); extern void srp_start_tl_fail_timers(struct srp_rport *rport); extern void srp_remove_host(struct Scsi_Host *); -- cgit v1.2.3 From 180771a3707a4c0577cbf4f830c754dbabfdfccb Mon Sep 17 00:00:00 2001 From: Upinder Malhi \(umalhi\) Date: Tue, 10 Sep 2013 03:36:59 +0000 Subject: IB/core: Add Cisco usNIC rdma node and transport types This patch adds new rdma node and new rdma transport, and supporting code used by Cisco's low latency driver called usNIC. usNIC uses its own transport, distinct from IB and iWARP. Signed-off-by: Upinder Malhi Signed-off-by: Jeff Squyres Signed-off-by: Roland Dreier --- drivers/infiniband/core/sysfs.c | 1 + drivers/infiniband/core/verbs.c | 3 +++ include/rdma/ib_verbs.h | 6 ++++-- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index cde1e7b5b85d..faad2caf22b1 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -612,6 +612,7 @@ static ssize_t show_node_type(struct device *device, switch (dev->node_type) { case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type); case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type); + case RDMA_NODE_USNIC: return sprintf(buf, "%d: usNIC\n", dev->node_type); case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type); case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type); default: return sprintf(buf, "%d: \n", dev->node_type); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index a321df28bab2..84f502785f89 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -114,6 +114,8 @@ rdma_node_get_transport(enum rdma_node_type node_type) return RDMA_TRANSPORT_IB; case RDMA_NODE_RNIC: return RDMA_TRANSPORT_IWARP; + case RDMA_NODE_USNIC: + return RDMA_TRANSPORT_USNIC; default: BUG(); return 0; @@ -130,6 +132,7 @@ enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_ case RDMA_TRANSPORT_IB: return IB_LINK_LAYER_INFINIBAND; case RDMA_TRANSPORT_IWARP: + case RDMA_TRANSPORT_USNIC: return IB_LINK_LAYER_ETHERNET; default: return IB_LINK_LAYER_UNSPECIFIED; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e393171e2fac..60354d53948e 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -67,12 +67,14 @@ enum rdma_node_type { RDMA_NODE_IB_CA = 1, RDMA_NODE_IB_SWITCH, RDMA_NODE_IB_ROUTER, - RDMA_NODE_RNIC + RDMA_NODE_RNIC, + RDMA_NODE_USNIC, }; enum rdma_transport_type { RDMA_TRANSPORT_IB, - RDMA_TRANSPORT_IWARP + RDMA_TRANSPORT_IWARP, + RDMA_TRANSPORT_USNIC }; enum rdma_transport_type -- cgit v1.2.3 From 1c636f801615bdfc9b1d46904e8258c7a025670b Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 31 Oct 2013 15:26:32 +0200 Subject: IB/core: Encorce MR access rights 
rules on kernel consumers Enforce the rule that when requesting remote write or atomic permissions, local write must be indicated as well. See IB spec 11.2.8.2. Spotted by: Hagay Abramovsky Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 10 +++------- drivers/infiniband/core/verbs.c | 14 ++++++++++++++ include/rdma/ib_verbs.h | 13 +++++++++++++ 3 files changed, 30 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 5bb2a82d52e7..7f671b7b8bac 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -939,13 +939,9 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) return -EINVAL; - /* - * Local write permission is required if remote write or - * remote atomic permission is also requested. - */ - if (cmd.access_flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && - !(cmd.access_flags & IB_ACCESS_LOCAL_WRITE)) - return -EINVAL; + ret = ib_check_mr_access(cmd.access_flags); + if (ret) + return ret; uobj = kmalloc(sizeof *uobj, GFP_KERNEL); if (!uobj) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 84f502785f89..d4f6ddf72ffa 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -961,6 +961,11 @@ EXPORT_SYMBOL(ib_resize_cq); struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) { struct ib_mr *mr; + int err; + + err = ib_check_mr_access(mr_access_flags); + if (err) + return ERR_PTR(err); mr = pd->device->get_dma_mr(pd, mr_access_flags); @@ -983,6 +988,11 @@ struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, u64 *iova_start) { struct ib_mr *mr; + int err; + + err = ib_check_mr_access(mr_access_flags); + if (err) + return ERR_PTR(err); if (!pd->device->reg_phys_mr) return ERR_PTR(-ENOSYS); @@ -1013,6 +1023,10 @@ int ib_rereg_phys_mr(struct ib_mr *mr, struct ib_pd *old_pd; int ret; + ret = ib_check_mr_access(mr_access_flags); + if (ret) + return ret; + if (!mr->device->rereg_phys_mr) return -ENOSYS; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 60354d53948e..68c053d0d629 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2386,4 +2386,17 @@ struct ib_flow *ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain); int ib_destroy_flow(struct ib_flow *flow_id); +static inline int ib_check_mr_access(int flags) +{ + /* + * Local write permission is required if remote write or + * remote atomic permission is also requested. + */ + if (flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && + !(flags & IB_ACCESS_LOCAL_WRITE)) + return -EINVAL; + + return 0; +} + #endif /* IB_VERBS_H */ -- cgit v1.2.3 From f88482743872230f5899b8344a057d6e2fd011e2 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 6 Nov 2013 23:21:44 +0100 Subject: IB/core: clarify overflow/underflow checks on ib_create/destroy_flow This patch fixes the following issues: 1. Unneeded checks were removed 2. Removed the fixed size out of flow_attr.size, thus simplifying the checks. 3. Remove a 32bit hole on 64bit systems with strict alignment in struct ib_kern_flow_att by adding a reserved field. 
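To make item 3 above concrete, here is a small, self-contained sketch (not part of this patch; the struct names are placeholders for the uapi struct ib_uverbs_create_flow) showing why the explicit reserved field matters: without it, an x86-64 compiler places the 8-byte response member at offset 8, while the i386 ABI (4-byte alignment for 64-bit integers) places it at offset 4, so the user/kernel layout would differ between 32-bit and 64-bit userspace.

    /* Hypothetical stand-ins for the uapi struct, before and after the fix. */
    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    struct create_flow_implicit {       /* old layout: hole is implicit */
            uint32_t comp_mask;
            uint64_t response;          /* offset 8 on x86-64, 4 on i386 */
            uint32_t qp_handle;
    };

    struct create_flow_explicit {       /* new layout: hole made explicit */
            uint32_t comp_mask;
            uint32_t reserved;
            uint64_t response;          /* offset 8 on both ABIs */
            uint32_t qp_handle;
    };

    int main(void)
    {
            printf("implicit: response at %zu, qp_handle at %zu\n",
                   offsetof(struct create_flow_implicit, response),
                   offsetof(struct create_flow_implicit, qp_handle));
            printf("explicit: response at %zu, qp_handle at %zu\n",
                   offsetof(struct create_flow_explicit, response),
                   offsetof(struct create_flow_explicit, qp_handle));
            return 0;
    }
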
Signed-off-by: Matan Barak Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 32 +++++++++++++++----------------- include/uapi/rdma/ib_user_verbs.h | 1 + 2 files changed, 16 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 2f0f01b70e3b..26ee2d2a8c52 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2657,7 +2657,6 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, void *kern_spec; void *ib_spec; int i; - int kern_attr_size; if (out_len < sizeof(resp)) return -ENOSPC; @@ -2672,32 +2671,28 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW)) return -EPERM; - if (cmd.flow_attr.num_of_specs < 0 || - cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) + if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) return -EINVAL; - kern_attr_size = cmd.flow_attr.size - sizeof(cmd) - - sizeof(struct ib_uverbs_cmd_hdr_ex); - - if (cmd.flow_attr.size < 0 || cmd.flow_attr.size > in_len || - kern_attr_size < 0 || kern_attr_size > + if (cmd.flow_attr.size > (in_len - sizeof(cmd)) || + cmd.flow_attr.size > (cmd.flow_attr.num_of_specs * sizeof(struct ib_kern_spec))) return -EINVAL; if (cmd.flow_attr.num_of_specs) { - kern_flow_attr = kmalloc(cmd.flow_attr.size, GFP_KERNEL); + kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size, + GFP_KERNEL); if (!kern_flow_attr) return -ENOMEM; memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr)); if (copy_from_user(kern_flow_attr + 1, buf + sizeof(cmd), - kern_attr_size)) { + cmd.flow_attr.size)) { err = -EFAULT; goto err_free_attr; } } else { kern_flow_attr = &cmd.flow_attr; - kern_attr_size = sizeof(cmd.flow_attr); } uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); @@ -2714,7 +2709,7 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, goto err_uobj; } - flow_attr = kmalloc(cmd.flow_attr.size, GFP_KERNEL); + flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, GFP_KERNEL); if (!flow_attr) { err = -ENOMEM; goto err_put; @@ -2729,19 +2724,22 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, kern_spec = kern_flow_attr + 1; ib_spec = flow_attr + 1; - for (i = 0; i < flow_attr->num_of_specs && kern_attr_size > 0; i++) { + for (i = 0; i < flow_attr->num_of_specs && + cmd.flow_attr.size > offsetof(struct ib_kern_spec, reserved) && + cmd.flow_attr.size >= + ((struct ib_kern_spec *)kern_spec)->size; i++) { err = kern_spec_to_ib_spec(kern_spec, ib_spec); if (err) goto err_free; flow_attr->size += ((union ib_flow_spec *) ib_spec)->size; - kern_attr_size -= ((struct ib_kern_spec *) kern_spec)->size; + cmd.flow_attr.size -= ((struct ib_kern_spec *)kern_spec)->size; kern_spec += ((struct ib_kern_spec *) kern_spec)->size; ib_spec += ((union ib_flow_spec *) ib_spec)->size; } - if (kern_attr_size) { - pr_warn("create flow failed, %d bytes left from uverb cmd\n", - kern_attr_size); + if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { + pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n", + i, cmd.flow_attr.size); goto err_free; } flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index e3ddd86c90a6..99a58bdc1c64 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -771,6 +771,7 @@ struct ib_kern_flow_attr { struct ib_uverbs_create_flow { 
__u32 comp_mask; + __u32 reserved; __u64 response; __u32 qp_handle; struct ib_kern_flow_attr flow_attr; -- cgit v1.2.3 From d82693dad09b49232cd727ee9e15bd027710edac Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Wed, 6 Nov 2013 23:21:45 +0100 Subject: IB/core: Rename 'flow' structs to match other uverbs structs Commit 436f2ad05a0b ("IB/core: Export ib_create/destroy_flow through uverbs") added public data structures to support receive flow steering. The new structs are not following the 'uverbs' pattern: they're lacking the common prefix 'ib_uverbs'. This patch replaces ib_kern prefix by ib_uverbs. Signed-off-by: Yann Droneaud Link: http://marc.info/?i=cover.1383773832.git.ydroneaud@opteya.com Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 14 +++++++------- include/uapi/rdma/ib_user_verbs.h | 36 ++++++++++++++++++------------------ 2 files changed, 25 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 26ee2d2a8c52..f83c1dc08481 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2602,7 +2602,7 @@ out_put: } #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING -static int kern_spec_to_ib_spec(struct ib_kern_spec *kern_spec, +static int kern_spec_to_ib_spec(struct ib_uverbs_spec *kern_spec, union ib_flow_spec *ib_spec) { ib_spec->type = kern_spec->type; @@ -2650,7 +2650,7 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, struct ib_uverbs_create_flow_resp resp; struct ib_uobject *uobj; struct ib_flow *flow_id; - struct ib_kern_flow_attr *kern_flow_attr; + struct ib_uverbs_flow_attr *kern_flow_attr; struct ib_flow_attr *flow_attr; struct ib_qp *qp; int err = 0; @@ -2676,7 +2676,7 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, if (cmd.flow_attr.size > (in_len - sizeof(cmd)) || cmd.flow_attr.size > - (cmd.flow_attr.num_of_specs * sizeof(struct ib_kern_spec))) + (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_spec))) return -EINVAL; if (cmd.flow_attr.num_of_specs) { @@ -2725,16 +2725,16 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, kern_spec = kern_flow_attr + 1; ib_spec = flow_attr + 1; for (i = 0; i < flow_attr->num_of_specs && - cmd.flow_attr.size > offsetof(struct ib_kern_spec, reserved) && + cmd.flow_attr.size > offsetof(struct ib_uverbs_spec, reserved) && cmd.flow_attr.size >= - ((struct ib_kern_spec *)kern_spec)->size; i++) { + ((struct ib_uverbs_spec *)kern_spec)->size; i++) { err = kern_spec_to_ib_spec(kern_spec, ib_spec); if (err) goto err_free; flow_attr->size += ((union ib_flow_spec *) ib_spec)->size; - cmd.flow_attr.size -= ((struct ib_kern_spec *)kern_spec)->size; - kern_spec += ((struct ib_kern_spec *) kern_spec)->size; + cmd.flow_attr.size -= ((struct ib_uverbs_spec *)kern_spec)->size; + kern_spec += ((struct ib_uverbs_spec *) kern_spec)->size; ib_spec += ((union ib_flow_spec *) ib_spec)->size; } if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 99a58bdc1c64..96a660533236 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -701,61 +701,61 @@ struct ib_uverbs_detach_mcast { }; #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING -struct ib_kern_eth_filter { +struct ib_uverbs_eth_filter { __u8 dst_mac[6]; __u8 src_mac[6]; __be16 ether_type; __be16 vlan_tag; }; -struct ib_kern_spec_eth { +struct ib_uverbs_spec_eth { 
__u32 type; __u16 size; __u16 reserved; - struct ib_kern_eth_filter val; - struct ib_kern_eth_filter mask; + struct ib_uverbs_eth_filter val; + struct ib_uverbs_eth_filter mask; }; -struct ib_kern_ipv4_filter { +struct ib_uverbs_ipv4_filter { __be32 src_ip; __be32 dst_ip; }; -struct ib_kern_spec_ipv4 { +struct ib_uverbs_spec_ipv4 { __u32 type; __u16 size; __u16 reserved; - struct ib_kern_ipv4_filter val; - struct ib_kern_ipv4_filter mask; + struct ib_uverbs_ipv4_filter val; + struct ib_uverbs_ipv4_filter mask; }; -struct ib_kern_tcp_udp_filter { +struct ib_uverbs_tcp_udp_filter { __be16 dst_port; __be16 src_port; }; -struct ib_kern_spec_tcp_udp { +struct ib_uverbs_spec_tcp_udp { __u32 type; __u16 size; __u16 reserved; - struct ib_kern_tcp_udp_filter val; - struct ib_kern_tcp_udp_filter mask; + struct ib_uverbs_tcp_udp_filter val; + struct ib_uverbs_tcp_udp_filter mask; }; -struct ib_kern_spec { +struct ib_uverbs_spec { union { struct { __u32 type; __u16 size; __u16 reserved; }; - struct ib_kern_spec_eth eth; - struct ib_kern_spec_ipv4 ipv4; - struct ib_kern_spec_tcp_udp tcp_udp; + struct ib_uverbs_spec_eth eth; + struct ib_uverbs_spec_ipv4 ipv4; + struct ib_uverbs_spec_tcp_udp tcp_udp; }; }; -struct ib_kern_flow_attr { +struct ib_uverbs_flow_attr { __u32 type; __u16 size; __u16 priority; @@ -774,7 +774,7 @@ struct ib_uverbs_create_flow { __u32 reserved; __u64 response; __u32 qp_handle; - struct ib_kern_flow_attr flow_attr; + struct ib_uverbs_flow_attr flow_attr; }; struct ib_uverbs_create_flow_resp { -- cgit v1.2.3 From b68c956021386eead6b8b28e445f33c8c985d7d2 Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Wed, 6 Nov 2013 23:21:46 +0100 Subject: IB/core: Make uverbs flow structure use names like verbs ones This patch adds "flow" prefix to most of data structure added as part of commit 436f2ad05a0b ("IB/core: Export ib_create/destroy_flow through uverbs") to keep those names in sync with the data structures added in commit 319a441d1361 ("IB/core: Add receive flow steering support"). It's just a matter of translating 'ib_flow' to 'ib_uverbs_flow'. 
Signed-off-by: Yann Droneaud Link: http://marc.info/?i=cover.1383773832.git.ydroneaud@opteya.com Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 12 ++++++------ include/uapi/rdma/ib_user_verbs.h | 32 ++++++++++++++++---------------- 2 files changed, 22 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index f83c1dc08481..19f14d82c988 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2602,7 +2602,7 @@ out_put: } #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING -static int kern_spec_to_ib_spec(struct ib_uverbs_spec *kern_spec, +static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, union ib_flow_spec *ib_spec) { ib_spec->type = kern_spec->type; @@ -2676,7 +2676,7 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, if (cmd.flow_attr.size > (in_len - sizeof(cmd)) || cmd.flow_attr.size > - (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_spec))) + (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec))) return -EINVAL; if (cmd.flow_attr.num_of_specs) { @@ -2725,16 +2725,16 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, kern_spec = kern_flow_attr + 1; ib_spec = flow_attr + 1; for (i = 0; i < flow_attr->num_of_specs && - cmd.flow_attr.size > offsetof(struct ib_uverbs_spec, reserved) && + cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) && cmd.flow_attr.size >= - ((struct ib_uverbs_spec *)kern_spec)->size; i++) { + ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) { err = kern_spec_to_ib_spec(kern_spec, ib_spec); if (err) goto err_free; flow_attr->size += ((union ib_flow_spec *) ib_spec)->size; - cmd.flow_attr.size -= ((struct ib_uverbs_spec *)kern_spec)->size; - kern_spec += ((struct ib_uverbs_spec *) kern_spec)->size; + cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size; + kern_spec += ((struct ib_uverbs_flow_spec *) kern_spec)->size; ib_spec += ((union ib_flow_spec *) ib_spec)->size; } if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 96a660533236..ef2be64bf4c3 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -701,57 +701,57 @@ struct ib_uverbs_detach_mcast { }; #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING -struct ib_uverbs_eth_filter { +struct ib_uverbs_flow_eth_filter { __u8 dst_mac[6]; __u8 src_mac[6]; __be16 ether_type; __be16 vlan_tag; }; -struct ib_uverbs_spec_eth { +struct ib_uverbs_flow_spec_eth { __u32 type; __u16 size; __u16 reserved; - struct ib_uverbs_eth_filter val; - struct ib_uverbs_eth_filter mask; + struct ib_uverbs_flow_eth_filter val; + struct ib_uverbs_flow_eth_filter mask; }; -struct ib_uverbs_ipv4_filter { +struct ib_uverbs_flow_ipv4_filter { __be32 src_ip; __be32 dst_ip; }; -struct ib_uverbs_spec_ipv4 { +struct ib_uverbs_flow_spec_ipv4 { __u32 type; __u16 size; __u16 reserved; - struct ib_uverbs_ipv4_filter val; - struct ib_uverbs_ipv4_filter mask; + struct ib_uverbs_flow_ipv4_filter val; + struct ib_uverbs_flow_ipv4_filter mask; }; -struct ib_uverbs_tcp_udp_filter { +struct ib_uverbs_flow_tcp_udp_filter { __be16 dst_port; __be16 src_port; }; -struct ib_uverbs_spec_tcp_udp { +struct ib_uverbs_flow_spec_tcp_udp { __u32 type; __u16 size; __u16 reserved; - struct ib_uverbs_tcp_udp_filter val; - struct ib_uverbs_tcp_udp_filter mask; + struct 
ib_uverbs_flow_tcp_udp_filter val; + struct ib_uverbs_flow_tcp_udp_filter mask; }; -struct ib_uverbs_spec { +struct ib_uverbs_flow_spec { union { struct { __u32 type; __u16 size; __u16 reserved; }; - struct ib_uverbs_spec_eth eth; - struct ib_uverbs_spec_ipv4 ipv4; - struct ib_uverbs_spec_tcp_udp tcp_udp; + struct ib_uverbs_flow_spec_eth eth; + struct ib_uverbs_flow_spec_ipv4 ipv4; + struct ib_uverbs_flow_spec_tcp_udp tcp_udp; }; }; -- cgit v1.2.3 From 58913efba9c3aa7992f2a4d630135ded833d988e Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Wed, 6 Nov 2013 23:21:47 +0100 Subject: IB/core: Use a common header for uverbs flow_specs A common header will allows better checking of flow specs size, while ensuring strict alignment to 64 bits. Signed-off-by: Yann Droneaud Link: http://marc.info/?i=cover.1383773832.git.ydroneaud@opteya.com Signed-off-by: Roland Dreier --- include/uapi/rdma/ib_user_verbs.h | 53 +++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index ef2be64bf4c3..43014981550a 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -701,6 +701,14 @@ struct ib_uverbs_detach_mcast { }; #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING +struct ib_uverbs_flow_spec_hdr { + __u32 type; + __u16 size; + __u16 reserved; + /* followed by flow_spec */ + __u64 flow_spec_data[0]; +}; + struct ib_uverbs_flow_eth_filter { __u8 dst_mac[6]; __u8 src_mac[6]; @@ -709,9 +717,14 @@ struct ib_uverbs_flow_eth_filter { }; struct ib_uverbs_flow_spec_eth { - __u32 type; - __u16 size; - __u16 reserved; + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; struct ib_uverbs_flow_eth_filter val; struct ib_uverbs_flow_eth_filter mask; }; @@ -722,9 +735,14 @@ struct ib_uverbs_flow_ipv4_filter { }; struct ib_uverbs_flow_spec_ipv4 { - __u32 type; - __u16 size; - __u16 reserved; + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; struct ib_uverbs_flow_ipv4_filter val; struct ib_uverbs_flow_ipv4_filter mask; }; @@ -735,19 +753,27 @@ struct ib_uverbs_flow_tcp_udp_filter { }; struct ib_uverbs_flow_spec_tcp_udp { - __u32 type; - __u16 size; - __u16 reserved; + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; struct ib_uverbs_flow_tcp_udp_filter val; struct ib_uverbs_flow_tcp_udp_filter mask; }; struct ib_uverbs_flow_spec { union { - struct { - __u32 type; - __u16 size; - __u16 reserved; + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; }; struct ib_uverbs_flow_spec_eth eth; struct ib_uverbs_flow_spec_ipv4 ipv4; @@ -767,6 +793,7 @@ struct ib_uverbs_flow_attr { * struct ib_flow_spec_xxx * struct ib_flow_spec_yyy */ + struct ib_uverbs_flow_spec_hdr flow_specs[0]; }; struct ib_uverbs_create_flow { -- cgit v1.2.3 From 2490f20be496c2da14ae4632a8c60e0633e97fd0 Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Wed, 6 Nov 2013 23:21:48 +0100 Subject: IB/core: Remove ib_uverbs_flow_spec structure from userspace The structure holding any types of flow_spec is of no use to userspace. 
It would be wrong for userspace to do: struct ib_uverbs_flow_spec flow_spec; flow_spec.type = IB_FLOW_SPEC_TCP; flow_spec.size = sizeof(flow_spec); Instead, userspace should use the dedicated flow_spec structure for - Ethernet : struct ib_uverbs_flow_spec_eth, - IPv4 : struct ib_uverbs_flow_spec_ipv4, - TCP/UDP : struct ib_uverbs_flow_spec_tcp_udp. In other words, struct ib_uverbs_flow_spec is a "virtual" data structure that can only be use by the kernel as an alias to the other. Signed-off-by: Yann Droneaud Link: http://marc.info/?i=cover.1383773832.git.ydroneaud@opteya.com Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs.h | 16 ++++++++++++++++ include/uapi/rdma/ib_user_verbs.h | 16 ---------------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index d8f9c6c272d7..777954f67270 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -178,6 +178,22 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd); +struct ib_uverbs_flow_spec { + union { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_spec_eth eth; + struct ib_uverbs_flow_spec_ipv4 ipv4; + struct ib_uverbs_flow_spec_tcp_udp tcp_udp; + }; +}; + #define IB_UVERBS_DECLARE_CMD(name) \ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ const char __user *buf, int in_len, \ diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 43014981550a..fc9bbe37cfce 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -765,22 +765,6 @@ struct ib_uverbs_flow_spec_tcp_udp { struct ib_uverbs_flow_tcp_udp_filter mask; }; -struct ib_uverbs_flow_spec { - union { - union { - struct ib_uverbs_flow_spec_hdr hdr; - struct { - __u32 type; - __u16 size; - __u16 reserved; - }; - }; - struct ib_uverbs_flow_spec_eth eth; - struct ib_uverbs_flow_spec_ipv4 ipv4; - struct ib_uverbs_flow_spec_tcp_udp tcp_udp; - }; -}; - struct ib_uverbs_flow_attr { __u32 type; __u16 size; -- cgit v1.2.3 From f21519b23c1b6fa25366be4114ccf7fcf1c190f9 Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Wed, 6 Nov 2013 23:21:49 +0100 Subject: IB/core: extended command: an improved infrastructure for uverbs commands Commit 400dbc96583f ("IB/core: Infrastructure for extensible uverbs commands") added an infrastructure for extensible uverbs commands while later commit 436f2ad05a0b ("IB/core: Export ib_create/destroy_flow through uverbs") exported ib_create_flow()/ib_destroy_flow() functions using this new infrastructure. According to the commit 400dbc96583f, the purpose of this infrastructure is to support passing around provider (eg. hardware) specific buffers when userspace issue commands to the kernel, so that it would be possible to extend uverbs (eg. core) buffers independently from the provider buffers. But the new kernel command function prototypes were not modified to take advantage of this extension. This issue was exposed by Roland Dreier in a previous review[1]. So the following patch is an attempt to a revised extensible command infrastructure. This improved extensible command infrastructure distinguish between core (eg. legacy)'s command/response buffers from provider (eg. 
hardware)'s command/response buffers: each extended command implementing function is given a struct ib_udata to hold core (eg. uverbs) input and output buffers, and another struct ib_udata to hold the hw (eg. provider) input and output buffers. Having those buffers identified separately makes it easier to enlarge one buffer to support an extension without having to add code to guess the exact size of each command/response part: this should make the extended functions more reliable.

Additionally, instead of relying on the command identifier being greater than IB_USER_VERBS_CMD_THRESHOLD, the proposed infrastructure relies on unused bits in the command field: of the 32 bits provided by the command field, only 6 bits are really needed to encode the identifiers of the commands currently supported by the kernel. (Even using only 6 bits leaves room for about 23 new commands.) So this patch makes use of some high-order bits in the command field to store flags, leaving enough room for more command identifiers than one will ever need (eg. 256). The new flags are used to specify whether the command should be processed as an extended one or a legacy one. While designing the new command format, care was taken to make usage of the flags itself extensible. Using high-order bits of the command field ensures that a newer libibverbs on an older kernel will properly fail when trying to call extended commands. On the other hand, an older libibverbs on a newer kernel will never be able to issue calls to extended commands.

The extended command header includes the optional response pointer so that the output buffer length and output buffer pointer are located together in the command, allowing proper parameter checking. This should make implementing functions easier and safer. Additionally, the extended header ensures 64-bit alignment and makes all sizes multiples of 8 bytes, extending the maximum buffer size:

                              legacy       extended
    Maximum command buffer:   256KBytes    1024KBytes (512KBytes + 512KBytes)
    Maximum response buffer:  256KBytes    1024KBytes (512KBytes + 512KBytes)

For the purpose of proper buffer size accounting, the header sizes are no longer taken into account in "in_words". One oddity of the current extensible infrastructure, reading the "legacy" command header twice, is fixed by removing the "legacy" command header from the extended command header: the two are processed as different parts of the command, memory is read once, and information is not duplicated. This makes it clear that it is an extended command scheme and not a different command scheme.

The proposed scheme formats the input (command) and output (response) buffers this way:

 - command: legacy header + extended header + command data (core + hw):

    +----------------------------------------+
    | flags    |    00    00    |  command   |
    |        in_words    |    out_words      |
    +----------------------------------------+
    |                response                |
    |                response                |
    | provider_in_words | provider_out_words |
    |                padding                 |
    +----------------------------------------+
    |                                        |
    .                                        .
    .             (in_words * 8)             .
    |                                        |
    +----------------------------------------+
    |                                        |
    .                                        .
    .         (provider_in_words * 8)        .
    |                                        |
    +----------------------------------------+

 - response, if present:

    +----------------------------------------+
    |                                        |
    .                                        .
    .             (out_words * 8)            .
    |                                        |
    +----------------------------------------+
    |                                        |
    .                                        .
    .        (provider_out_words * 8)        .
    |                                        |
    +----------------------------------------+

The overall design is to ensure that the extensible infrastructure is itself extensible, while being more reliable with more input and bounds checking.
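To illustrate the layout above from the userspace side, here is a minimal, hypothetical sketch of issuing an extended command through write() on a uverbs file descriptor. It only demonstrates the header packing described here (the extended flag in the high bits of the command field, word counts in 8-byte units that exclude both headers, and the response pointer carried in the extended header); it is not part of this patch, and the helper name and buffer size are made up.

    /* Sketch only: assumes the uapi definitions added by this patch
     * (struct ib_uverbs_cmd_hdr, struct ib_uverbs_ex_cmd_hdr and the
     * IB_USER_VERBS_CMD_FLAG_EXTENDED / _FLAGS_SHIFT macros) are visible
     * through <rdma/ib_user_verbs.h>.
     */
    #include <rdma/ib_user_verbs.h>
    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>

    static ssize_t issue_ex_cmd(int uverbs_fd, uint32_t ex_cmd,
                                const void *core_in, size_t core_in_len,
                                void *core_out, size_t core_out_len)
    {
            struct ib_uverbs_cmd_hdr hdr;
            struct ib_uverbs_ex_cmd_hdr ex_hdr;
            char buf[sizeof(hdr) + sizeof(ex_hdr) + 512];

            if (core_in_len % 8 || core_out_len % 8 ||
                core_in_len > sizeof(buf) - sizeof(hdr) - sizeof(ex_hdr))
                    return -1;

            /* The extended flag in the high bits selects the extended
             * dispatch table; the low bits carry the command identifier. */
            hdr.command = ex_cmd |
                          ((uint32_t)IB_USER_VERBS_CMD_FLAG_EXTENDED <<
                           IB_USER_VERBS_CMD_FLAGS_SHIFT);
            /* Word counts are in 8-byte units and exclude both headers. */
            hdr.in_words  = core_in_len / 8;
            hdr.out_words = core_out_len / 8;

            /* Response pointer and provider word counts live in the
             * extended header; the provider (hw) payload is empty here. */
            ex_hdr.response = core_out_len ? (uintptr_t)core_out : 0;
            ex_hdr.provider_in_words  = 0;
            ex_hdr.provider_out_words = 0;
            ex_hdr.cmd_hdr_reserved   = 0;

            memcpy(buf, &hdr, sizeof(hdr));
            memcpy(buf + sizeof(hdr), &ex_hdr, sizeof(ex_hdr));
            memcpy(buf + sizeof(hdr) + sizeof(ex_hdr), core_in, core_in_len);

            /* The kernel verifies that (in_words + provider_in_words) * 8
             * equals the byte count that follows the two headers. */
            return write(uverbs_fd, buf,
                         sizeof(hdr) + sizeof(ex_hdr) + core_in_len);
    }
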
Note: The unused field in the extended header would be perfect candidate to hold the command "comp_mask" (eg. bit field used to handle compatibility). This was suggested by Roland Dreier in a previous review[2]. But "comp_mask" field is likely to be present in the uverb input and/or provider input, likewise for the response, as noted by Matan Barak[3], so it doesn't make sense to put "comp_mask" in the header. [1]: http://marc.info/?i=CAL1RGDWxmM17W2o_era24A-TTDeKyoL6u3NRu_=t_dhV_ZA9MA@mail.gmail.com [2]: http://marc.info/?i=CAL1RGDXJtrc849M6_XNZT5xO1+ybKtLWGq6yg6LhoSsKpsmkYA@mail.gmail.com [3]: http://marc.info/?i=525C1149.6000701@mellanox.com Signed-off-by: Yann Droneaud Link: http://marc.info/?i=cover.1383773832.git.ydroneaud@opteya.com [ Convert "ret ? ret : 0" to the equivalent "ret". - Roland ] Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs.h | 20 +++++- drivers/infiniband/core/uverbs_cmd.c | 56 +++++++-------- drivers/infiniband/core/uverbs_main.c | 127 ++++++++++++++++++++++++++-------- drivers/infiniband/hw/mlx4/main.c | 6 +- include/rdma/ib_verbs.h | 1 + include/uapi/rdma/ib_user_verbs.h | 23 +++--- 6 files changed, 160 insertions(+), 73 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 777954f67270..24adccbf92f9 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -47,6 +47,14 @@ #include #include +#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ + do { \ + (udata)->inbuf = (void __user *) (ibuf); \ + (udata)->outbuf = (void __user *) (obuf); \ + (udata)->inlen = (ilen); \ + (udata)->outlen = (olen); \ + } while (0) + /* * Our lifetime rules for these structs are the following: * @@ -233,9 +241,17 @@ IB_UVERBS_DECLARE_CMD(destroy_srq); IB_UVERBS_DECLARE_CMD(create_xsrq); IB_UVERBS_DECLARE_CMD(open_xrcd); IB_UVERBS_DECLARE_CMD(close_xrcd); + #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING -IB_UVERBS_DECLARE_CMD(create_flow); -IB_UVERBS_DECLARE_CMD(destroy_flow); + +#define IB_UVERBS_DECLARE_EX_CMD(name) \ + int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \ + struct ib_udata *ucore, \ + struct ib_udata *uhw) + +IB_UVERBS_DECLARE_EX_CMD(create_flow); +IB_UVERBS_DECLARE_EX_CMD(destroy_flow); + #endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 19f14d82c988..33a52785f812 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -58,14 +58,6 @@ static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" }; static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; #endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ -#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ - do { \ - (udata)->inbuf = (void __user *) (ibuf); \ - (udata)->outbuf = (void __user *) (obuf); \ - (udata)->inlen = (ilen); \ - (udata)->outlen = (olen); \ - } while (0) - /* * The ib_uobject locking scheme is as follows: * @@ -2642,9 +2634,9 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, return 0; } -ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) { struct ib_uverbs_create_flow cmd; struct ib_uverbs_create_flow_resp resp; @@ -2658,11 +2650,15 @@ ssize_t 
ib_uverbs_create_flow(struct ib_uverbs_file *file, void *ib_spec; int i; - if (out_len < sizeof(resp)) + if (ucore->outlen < sizeof(resp)) return -ENOSPC; - if (copy_from_user(&cmd, buf, sizeof(cmd))) - return -EFAULT; + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + ucore->inbuf += sizeof(cmd); + ucore->inlen -= sizeof(cmd); if (cmd.comp_mask) return -EINVAL; @@ -2674,7 +2670,7 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) return -EINVAL; - if (cmd.flow_attr.size > (in_len - sizeof(cmd)) || + if (cmd.flow_attr.size > ucore->inlen || cmd.flow_attr.size > (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec))) return -EINVAL; @@ -2686,11 +2682,10 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, return -ENOMEM; memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr)); - if (copy_from_user(kern_flow_attr + 1, buf + sizeof(cmd), - cmd.flow_attr.size)) { - err = -EFAULT; + err = ib_copy_from_udata(kern_flow_attr + 1, ucore, + cmd.flow_attr.size); + if (err) goto err_free_attr; - } } else { kern_flow_attr = &cmd.flow_attr; } @@ -2758,11 +2753,10 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, memset(&resp, 0, sizeof(resp)); resp.flow_handle = uobj->id; - if (copy_to_user((void __user *)(unsigned long) cmd.response, - &resp, sizeof(resp))) { - err = -EFAULT; + err = ib_copy_to_udata(ucore, + &resp, sizeof(resp)); + if (err) goto err_copy; - } put_qp_read(qp); mutex_lock(&file->mutex); @@ -2775,7 +2769,7 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, kfree(flow_attr); if (cmd.flow_attr.num_of_specs) kfree(kern_flow_attr); - return in_len; + return 0; err_copy: idr_remove_uobj(&ib_uverbs_rule_idr, uobj); destroy_flow: @@ -2792,16 +2786,18 @@ err_free_attr: return err; } -ssize_t ib_uverbs_destroy_flow(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) { +int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ struct ib_uverbs_destroy_flow cmd; struct ib_flow *flow_id; struct ib_uobject *uobj; int ret; - if (copy_from_user(&cmd, buf, sizeof(cmd))) - return -EFAULT; + ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (ret) + return ret; uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle, file->ucontext); @@ -2823,7 +2819,7 @@ ssize_t ib_uverbs_destroy_flow(struct ib_uverbs_file *file, put_uobj(uobj); - return ret ? 
ret : in_len; + return ret; } #endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 2df31f68ea09..189d99e76d9f 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -115,11 +115,16 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd, [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq, [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp, +}; + #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING - [IB_USER_VERBS_CMD_CREATE_FLOW] = ib_uverbs_create_flow, - [IB_USER_VERBS_CMD_DESTROY_FLOW] = ib_uverbs_destroy_flow -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ +static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) = { + [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, + [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow }; +#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ static void ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device); @@ -589,6 +594,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, { struct ib_uverbs_file *file = filp->private_data; struct ib_uverbs_cmd_hdr hdr; + __u32 flags; if (count < sizeof hdr) return -EINVAL; @@ -596,45 +602,108 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, if (copy_from_user(&hdr, buf, sizeof hdr)) return -EFAULT; - if (hdr.command >= ARRAY_SIZE(uverbs_cmd_table) || - !uverbs_cmd_table[hdr.command]) - return -EINVAL; + flags = (hdr.command & + IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; - if (!file->ucontext && - hdr.command != IB_USER_VERBS_CMD_GET_CONTEXT) - return -EINVAL; + if (!flags) { + __u32 command; - if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command))) - return -ENOSYS; + if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) + return -EINVAL; -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING - if (hdr.command >= IB_USER_VERBS_CMD_THRESHOLD) { - struct ib_uverbs_cmd_hdr_ex hdr_ex; + command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; - if (copy_from_user(&hdr_ex, buf, sizeof(hdr_ex))) - return -EFAULT; + if (command >= ARRAY_SIZE(uverbs_cmd_table) || + !uverbs_cmd_table[command]) + return -EINVAL; - if (((hdr_ex.in_words + hdr_ex.provider_in_words) * 4) != count) + if (!file->ucontext && + command != IB_USER_VERBS_CMD_GET_CONTEXT) return -EINVAL; - return uverbs_cmd_table[hdr.command](file, - buf + sizeof(hdr_ex), - (hdr_ex.in_words + - hdr_ex.provider_in_words) * 4, - (hdr_ex.out_words + - hdr_ex.provider_out_words) * 4); - } else { -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ + if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command))) + return -ENOSYS; + if (hdr.in_words * 4 != count) return -EINVAL; - return uverbs_cmd_table[hdr.command](file, - buf + sizeof(hdr), - hdr.in_words * 4, - hdr.out_words * 4); + return uverbs_cmd_table[command](file, + buf + sizeof(hdr), + hdr.in_words * 4, + hdr.out_words * 4); + #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING + + } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) { + __u32 command; + + struct ib_uverbs_ex_cmd_hdr ex_hdr; + struct ib_udata ucore; + struct ib_udata uhw; + int err; + size_t written_count = 
count; + + if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) + return -EINVAL; + + command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; + + if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) || + !uverbs_ex_cmd_table[command]) + return -ENOSYS; + + if (!file->ucontext) + return -EINVAL; + + if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command))) + return -ENOSYS; + + if (count < (sizeof(hdr) + sizeof(ex_hdr))) + return -EINVAL; + + if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) + return -EFAULT; + + count -= sizeof(hdr) + sizeof(ex_hdr); + buf += sizeof(hdr) + sizeof(ex_hdr); + + if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) + return -EINVAL; + + if (ex_hdr.response) { + if (!hdr.out_words && !ex_hdr.provider_out_words) + return -EINVAL; + } else { + if (hdr.out_words || ex_hdr.provider_out_words) + return -EINVAL; + } + + INIT_UDATA(&ucore, + (hdr.in_words) ? buf : 0, + (unsigned long)ex_hdr.response, + hdr.in_words * 8, + hdr.out_words * 8); + + INIT_UDATA(&uhw, + (ex_hdr.provider_in_words) ? buf + ucore.inlen : 0, + (ex_hdr.provider_out_words) ? (unsigned long)ex_hdr.response + ucore.outlen : 0, + ex_hdr.provider_in_words * 8, + ex_hdr.provider_out_words * 8); + + err = uverbs_ex_cmd_table[command](file, + &ucore, + &uhw); + + if (err) + return err; + + return written_count; } #endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ + + return -ENOSYS; } static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index f0612645de99..4be29fef66d6 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1692,9 +1692,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING - ibdev->ib_dev.uverbs_cmd_mask |= - (1ull << IB_USER_VERBS_CMD_CREATE_FLOW) | - (1ull << IB_USER_VERBS_CMD_DESTROY_FLOW); + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); #endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e393171e2fac..a06fc122f803 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1436,6 +1436,7 @@ struct ib_device { int uverbs_abi_ver; u64 uverbs_cmd_mask; + u64 uverbs_ex_cmd_mask; char node_desc[64]; __be64 node_guid; diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index fc9bbe37cfce..6ace125e1af6 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -87,11 +87,14 @@ enum { IB_USER_VERBS_CMD_CLOSE_XRCD, IB_USER_VERBS_CMD_CREATE_XSRQ, IB_USER_VERBS_CMD_OPEN_QP, +}; + #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING - IB_USER_VERBS_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, - IB_USER_VERBS_CMD_DESTROY_FLOW -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ +enum { + IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, + IB_USER_VERBS_EX_CMD_DESTROY_FLOW }; +#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ /* * Make sure that all structs defined in this file remain laid out so @@ -122,6 +125,12 @@ struct ib_uverbs_comp_event_desc { * the rest of the command struct based on these value. 
*/ +#define IB_USER_VERBS_CMD_COMMAND_MASK 0xff +#define IB_USER_VERBS_CMD_FLAGS_MASK 0xff000000u +#define IB_USER_VERBS_CMD_FLAGS_SHIFT 24 + +#define IB_USER_VERBS_CMD_FLAG_EXTENDED 0x80 + struct ib_uverbs_cmd_hdr { __u32 command; __u16 in_words; @@ -129,10 +138,8 @@ struct ib_uverbs_cmd_hdr { }; #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING -struct ib_uverbs_cmd_hdr_ex { - __u32 command; - __u16 in_words; - __u16 out_words; +struct ib_uverbs_ex_cmd_hdr { + __u64 response; __u16 provider_in_words; __u16 provider_out_words; __u32 cmd_hdr_reserved; @@ -782,8 +789,6 @@ struct ib_uverbs_flow_attr { struct ib_uverbs_create_flow { __u32 comp_mask; - __u32 reserved; - __u64 response; __u32 qp_handle; struct ib_uverbs_flow_attr flow_attr; }; -- cgit v1.2.3 From 69ad5da41b4ed94aef31d4111a3442cfd73ce570 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 6 Nov 2013 23:21:50 +0100 Subject: IB/core: Re-enable create_flow/destroy_flow uverbs This commit reverts commit 7afbddfae993 ("IB/core: Temporarily disable create_flow/destroy_flow uverbs"). Since the uverbs extensions functionality was experimental for v3.12, this patch re-enables the support for them and flow-steering for v3.13. Signed-off-by: Matan Barak Signed-off-by: Roland Dreier --- drivers/infiniband/Kconfig | 11 ----------- drivers/infiniband/core/uverbs.h | 4 ---- drivers/infiniband/core/uverbs_cmd.c | 4 ---- drivers/infiniband/core/uverbs_main.c | 5 ----- drivers/infiniband/hw/mlx4/main.c | 2 -- include/uapi/rdma/ib_user_verbs.h | 6 ------ 6 files changed, 32 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index b84791f03a27..5ceda710f516 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -31,17 +31,6 @@ config INFINIBAND_USER_ACCESS libibverbs, libibcm and a hardware driver library from . -config INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING - bool "Experimental and unstable ABI for userspace access to flow steering verbs" - depends on INFINIBAND_USER_ACCESS - depends on STAGING - ---help--- - The final ABI for userspace access to flow steering verbs - has not been defined. To use the current ABI, *WHICH WILL - CHANGE IN THE FUTURE*, say Y here. - - If unsure, say N. 
- config INFINIBAND_USER_MEM bool depends on INFINIBAND_USER_ACCESS != n diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 24adccbf92f9..bdc842e9faef 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -242,8 +242,6 @@ IB_UVERBS_DECLARE_CMD(create_xsrq); IB_UVERBS_DECLARE_CMD(open_xrcd); IB_UVERBS_DECLARE_CMD(close_xrcd); -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING - #define IB_UVERBS_DECLARE_EX_CMD(name) \ int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \ struct ib_udata *ucore, \ @@ -252,6 +250,4 @@ IB_UVERBS_DECLARE_CMD(close_xrcd); IB_UVERBS_DECLARE_EX_CMD(create_flow); IB_UVERBS_DECLARE_EX_CMD(destroy_flow); -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ - #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 33a52785f812..8233744b4140 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -54,9 +54,7 @@ static struct uverbs_lock_class qp_lock_class = { .name = "QP-uobj" }; static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" }; static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" }; static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" }; -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ /* * The ib_uobject locking scheme is as follows: @@ -2593,7 +2591,6 @@ out_put: return ret ? ret : in_len; } -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, union ib_flow_spec *ib_spec) { @@ -2821,7 +2818,6 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, return ret; } -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, struct ib_uverbs_create_xsrq *cmd, diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 189d99e76d9f..34386943ebcf 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -117,14 +117,12 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp, }; -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, struct ib_udata *ucore, struct ib_udata *uhw) = { [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow }; -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ static void ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device); @@ -633,8 +631,6 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, hdr.in_words * 4, hdr.out_words * 4); -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING - } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) { __u32 command; @@ -701,7 +697,6 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, return written_count; } -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ return -ENOSYS; } diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 4be29fef66d6..1aad9b3e6bdd 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ 
b/drivers/infiniband/hw/mlx4/main.c @@ -1691,11 +1691,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.create_flow = mlx4_ib_create_flow; ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING ibdev->ib_dev.uverbs_ex_cmd_mask |= (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ } mlx4_ib_alloc_eqs(dev, ibdev); diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 6ace125e1af6..cbfdd4ca9510 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -89,12 +89,10 @@ enum { IB_USER_VERBS_CMD_OPEN_QP, }; -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING enum { IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, IB_USER_VERBS_EX_CMD_DESTROY_FLOW }; -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ /* * Make sure that all structs defined in this file remain laid out so @@ -137,14 +135,12 @@ struct ib_uverbs_cmd_hdr { __u16 out_words; }; -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING struct ib_uverbs_ex_cmd_hdr { __u64 response; __u16 provider_in_words; __u16 provider_out_words; __u32 cmd_hdr_reserved; }; -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ struct ib_uverbs_get_context { __u64 response; @@ -707,7 +703,6 @@ struct ib_uverbs_detach_mcast { __u64 driver_data[0]; }; -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING struct ib_uverbs_flow_spec_hdr { __u32 type; __u16 size; @@ -802,7 +797,6 @@ struct ib_uverbs_destroy_flow { __u32 comp_mask; __u32 flow_handle; }; -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ struct ib_uverbs_create_srq { __u64 response; -- cgit v1.2.3