From 34755f596110fb85f4c5b5fbe56aeb86042074bc Mon Sep 17 00:00:00 2001 From: Gustavo A. R. Silva Date: Wed, 29 May 2019 10:12:48 -0500 Subject: IB/rdmavt: Use struct_size() helper Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes, in particular in the context in which this code is being used. So, replace the following form: sizeof(struct rvt_sge) * init_attr->cap.max_send_sge + sizeof(struct rvt_swqe) with: struct_size(swq, sg_list, init_attr->cap.max_send_sge) and so on... Also, notice that variable size is unnecessary, hence it is removed. This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva Reviewed-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/qp.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 31a2e65e4906..a60f5faea198 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -988,9 +988,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, case IB_QPT_UC: case IB_QPT_RC: case IB_QPT_UD: - sz = sizeof(struct rvt_sge) * - init_attr->cap.max_send_sge + - sizeof(struct rvt_swqe); + sz = struct_size(swq, sg_list, init_attr->cap.max_send_sge); swq = vzalloc_node(array_size(sz, sqsize), rdi->dparms.node); if (!swq) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From b9560a419bfd498279333387817adcf5faef2825 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 5 Jun 2019 14:39:24 -0300 Subject: RDMA: Move driver_id into struct ib_device_ops No reason for every driver to emit code to set this, just make it part of the driver's existing static const ops structure. 
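As an illustration of the pattern this patch converts to: the sketch below uses simplified structures and hypothetical example_* / RDMA_DRIVER_EXAMPLE names, and only mirrors the shape of the ib_set_device_ops() hunk in the diff below; it is not the full ib_verbs.h definition.

/* Simplified sketch, not the real ib_verbs.h declarations. */
enum rdma_driver_id { RDMA_DRIVER_UNKNOWN = 0, RDMA_DRIVER_EXAMPLE };

struct ib_device_ops {
        enum rdma_driver_id driver_id;  /* moved here from struct ib_device */
        /* ... function pointers elided ... */
};

struct ib_device {
        struct ib_device_ops ops;
        /* ... */
};

/*
 * Driver side: one initializer in the existing static const ops structure
 * replaces the per-driver "ibdev->driver_id = ...;" assignment.
 */
static const struct ib_device_ops example_dev_ops = {
        .driver_id = RDMA_DRIVER_EXAMPLE,
        /* .alloc_pd = ..., .create_qp = ..., etc. */
};

/*
 * Core side: ib_set_device_ops() copies the id once and warns if two merged
 * ops structures disagree (same shape as the hunk added to core/device.c;
 * WARN_ON() is the usual kernel helper).
 */
static void example_set_device_ops(struct ib_device *dev,
                                   const struct ib_device_ops *ops)
{
        struct ib_device_ops *dev_ops = &dev->ops;

        if (ops->driver_id != RDMA_DRIVER_UNKNOWN) {
                WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN &&
                        dev_ops->driver_id != ops->driver_id);
                dev_ops->driver_id = ops->driver_id;
        }
}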
Signed-off-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 12 +++++++++--- drivers/infiniband/core/uverbs_uapi.c | 2 +- drivers/infiniband/hw/bnxt_re/main.c | 3 ++- drivers/infiniband/hw/cxgb3/iwch_provider.c | 3 ++- drivers/infiniband/hw/cxgb4/provider.c | 3 ++- drivers/infiniband/hw/efa/efa_main.c | 3 ++- drivers/infiniband/hw/hfi1/verbs.c | 4 +++- drivers/infiniband/hw/hns/hns_roce_main.c | 3 ++- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 3 ++- drivers/infiniband/hw/mlx4/main.c | 3 ++- drivers/infiniband/hw/mlx5/main.c | 3 ++- drivers/infiniband/hw/mthca/mthca_provider.c | 3 ++- drivers/infiniband/hw/nes/nes_verbs.c | 3 ++- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 3 ++- drivers/infiniband/hw/qedr/main.c | 3 ++- drivers/infiniband/hw/qib/qib_verbs.c | 4 +++- drivers/infiniband/hw/usnic/usnic_ib_main.c | 3 ++- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 3 ++- drivers/infiniband/sw/rdmavt/vt.c | 3 +-- drivers/infiniband/sw/rxe/rxe_verbs.c | 3 ++- include/rdma/ib_verbs.h | 3 ++- include/rdma/rdma_vt.h | 2 +- 22 files changed, 50 insertions(+), 25 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index afb3f5946796..538d01f27bf8 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -373,7 +373,7 @@ struct ib_device *ib_device_get_by_name(const char *name, down_read(&devices_rwsem); device = __ib_device_get_by_name(name); if (device && driver_id != RDMA_DRIVER_UNKNOWN && - device->driver_id != driver_id) + device->ops.driver_id != driver_id) device = NULL; if (device) { @@ -1456,7 +1456,7 @@ void ib_unregister_driver(enum rdma_driver_id driver_id) down_read(&devices_rwsem); xa_for_each (&devices, index, ib_dev) { - if (ib_dev->driver_id != driver_id) + if (ib_dev->ops.driver_id != driver_id) continue; get_device(&ib_dev->dev); @@ -2013,7 +2013,7 @@ struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, (uintptr_t)ndev) { if (rcu_access_pointer(cur->netdev) == ndev && (driver_id == RDMA_DRIVER_UNKNOWN || - cur->ib_dev->driver_id == driver_id) && + cur->ib_dev->ops.driver_id == driver_id) && ib_device_try_get(cur->ib_dev)) { res = cur->ib_dev; break; @@ -2318,6 +2318,12 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) + if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { + WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && + dev_ops->driver_id != ops->driver_id); + dev_ops->driver_id = ops->driver_id; + } + SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index ccc4be0a6566..00c547887132 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -647,7 +647,7 @@ struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev) return ERR_PTR(-ENOMEM); INIT_RADIX_TREE(&uapi->radix, GFP_KERNEL); - uapi->driver_id = ibdev->driver_id; + uapi->driver_id = ibdev->ops.driver_id; rc = uapi_merge_def(uapi, ibdev, uverbs_core_api, false); if (rc) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 814f959c7db9..1ef5f83ec914 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -596,6 +596,8 @@ static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev) 
} static const struct ib_device_ops bnxt_re_dev_ops = { + .driver_id = RDMA_DRIVER_BNXT_RE, + .add_gid = bnxt_re_add_gid, .alloc_hw_stats = bnxt_re_ib_alloc_hw_stats, .alloc_mr = bnxt_re_alloc_mr, @@ -691,7 +693,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) rdma_set_device_sysfs_group(ibdev, &bnxt_re_dev_attr_group); - ibdev->driver_id = RDMA_DRIVER_BNXT_RE; ib_set_device_ops(ibdev, &bnxt_re_dev_ops); ret = ib_device_set_netdev(&rdev->ibdev, rdev->netdev, 1); if (ret) diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 4bfab739ec0d..9a9527fdcb9a 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1236,6 +1236,8 @@ static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str) } static const struct ib_device_ops iwch_dev_ops = { + .driver_id = RDMA_DRIVER_CXGB3, + .alloc_hw_stats = iwch_alloc_stats, .alloc_mr = iwch_alloc_mr, .alloc_mw = iwch_alloc_mw, @@ -1319,7 +1321,6 @@ int iwch_register_device(struct iwch_dev *dev) memcpy(dev->ibdev.iw_ifname, dev->rdev.t3cdev_p->lldev->name, sizeof(dev->ibdev.iw_ifname)); - dev->ibdev.driver_id = RDMA_DRIVER_CXGB3; rdma_set_device_sysfs_group(&dev->ibdev, &iwch_attr_group); ib_set_device_ops(&dev->ibdev, &iwch_dev_ops); return ib_register_device(&dev->ibdev, "cxgb3_%d"); diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 8ed75b141521..74644afe25ab 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -489,6 +489,8 @@ static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res) } static const struct ib_device_ops c4iw_dev_ops = { + .driver_id = RDMA_DRIVER_CXGB4, + .alloc_hw_stats = c4iw_alloc_stats, .alloc_mr = c4iw_alloc_mr, .alloc_mw = c4iw_alloc_mw, @@ -599,7 +601,6 @@ void c4iw_register_device(struct work_struct *work) sizeof(dev->ibdev.iw_ifname)); rdma_set_device_sysfs_group(&dev->ibdev, &c4iw_attr_group); - dev->ibdev.driver_id = RDMA_DRIVER_CXGB4; ib_set_device_ops(&dev->ibdev, &c4iw_dev_ops); ret = set_netdevs(&dev->ibdev, &dev->rdev); if (ret) diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index db974caf1eb1..3803dd4526b5 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -197,6 +197,8 @@ static void efa_stats_init(struct efa_dev *dev) } static const struct ib_device_ops efa_dev_ops = { + .driver_id = RDMA_DRIVER_EFA, + .alloc_pd = efa_alloc_pd, .alloc_ucontext = efa_alloc_ucontext, .create_ah = efa_create_ah, @@ -287,7 +289,6 @@ static int efa_ib_device_add(struct efa_dev *dev) dev->ibdev.uverbs_ex_cmd_mask = (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE); - dev->ibdev.driver_id = RDMA_DRIVER_EFA; ib_set_device_ops(&dev->ibdev, &efa_dev_ops); err = ib_register_device(&dev->ibdev, "efa_%d"); diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 1eb4105b2d22..a97f4f9e5c6a 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1779,6 +1779,8 @@ static int get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, } static const struct ib_device_ops hfi1_dev_ops = { + .driver_id = RDMA_DRIVER_HFI1, + .alloc_hw_stats = alloc_hw_stats, .alloc_rdma_netdev = hfi1_vnic_alloc_rn, .get_dev_fw_str = hfi1_get_dev_fw_str, @@ -1923,7 +1925,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) 
rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev, &ib_hfi1_attr_group); - ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_HFI1); + ret = rvt_register_device(&dd->verbs_dev.rdi); if (ret) goto err_verbs_txreq; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index a6c5c67d0b87..dd408f8afe72 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -414,6 +414,8 @@ static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) } static const struct ib_device_ops hns_roce_dev_ops = { + .driver_id = RDMA_DRIVER_HNS, + .add_gid = hns_roce_add_gid, .alloc_pd = hns_roce_alloc_pd, .alloc_ucontext = hns_roce_alloc_ucontext, @@ -536,7 +538,6 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_srq_ops); } - ib_dev->driver_id = RDMA_DRIVER_HNS; ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops); ib_set_device_ops(ib_dev, &hns_roce_dev_ops); for (i = 0; i < hr_dev->caps.num_ports; i++) { diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index a10a30d44b32..1979cefdf90c 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2650,6 +2650,8 @@ static int i40iw_query_pkey(struct ib_device *ibdev, } static const struct ib_device_ops i40iw_dev_ops = { + .driver_id = RDMA_DRIVER_I40IW, + .alloc_hw_stats = i40iw_alloc_hw_stats, .alloc_mr = i40iw_alloc_mr, .alloc_pd = i40iw_alloc_pd, @@ -2787,7 +2789,6 @@ int i40iw_register_rdma_device(struct i40iw_device *iwdev) return -ENOMEM; iwibdev = iwdev->iwibdev; rdma_set_device_sysfs_group(&iwibdev->ibdev, &i40iw_attr_group); - iwibdev->ibdev.driver_id = RDMA_DRIVER_I40IW; ret = ib_register_device(&iwibdev->ibdev, "i40iw%d"); if (ret) goto error; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 25d09d53b51c..03847e2f7835 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2509,6 +2509,8 @@ static void get_fw_ver_str(struct ib_device *device, char *str) } static const struct ib_device_ops mlx4_ib_dev_ops = { + .driver_id = RDMA_DRIVER_MLX4, + .add_gid = mlx4_ib_add_gid, .alloc_mr = mlx4_ib_alloc_mr, .alloc_pd = mlx4_ib_alloc_pd, @@ -2839,7 +2841,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) goto err_steer_free_bitmap; rdma_set_device_sysfs_group(&ibdev->ib_dev, &mlx4_attr_group); - ibdev->ib_dev.driver_id = RDMA_DRIVER_MLX4; if (ib_register_device(&ibdev->ib_dev, "mlx4_%d")) goto err_diag_counters; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index abac70ad5c7c..abd416a31e71 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6124,6 +6124,8 @@ static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev) } static const struct ib_device_ops mlx5_ib_dev_ops = { + .driver_id = RDMA_DRIVER_MLX5, + .add_gid = mlx5_ib_add_gid, .alloc_mr = mlx5_ib_alloc_mr, .alloc_pd = mlx5_ib_alloc_pd, @@ -6290,7 +6292,6 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_DEVICE) ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_flow_ipsec_ops); - dev->ib_dev.driver_id = RDMA_DRIVER_MLX5; ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops); if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)) diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c 
b/drivers/infiniband/hw/mthca/mthca_provider.c index 4f40dfedf920..d6467da39aab 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1153,6 +1153,8 @@ static void get_dev_fw_str(struct ib_device *device, char *str) } static const struct ib_device_ops mthca_dev_ops = { + .driver_id = RDMA_DRIVER_MTHCA, + .alloc_pd = mthca_alloc_pd, .alloc_ucontext = mthca_alloc_ucontext, .attach_mcast = mthca_multicast_attach, @@ -1303,7 +1305,6 @@ int mthca_register_device(struct mthca_dev *dev) mutex_init(&dev->cap_mask_mutex); rdma_set_device_sysfs_group(&dev->ib_dev, &mthca_attr_group); - dev->ib_dev.driver_id = RDMA_DRIVER_MTHCA; ret = ib_register_device(&dev->ib_dev, "mthca%d"); if (ret) return ret; diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index fb2d0762c7c8..3c85e2ef5a08 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3558,6 +3558,8 @@ static void get_dev_fw_str(struct ib_device *dev, char *str) } static const struct ib_device_ops nes_dev_ops = { + .driver_id = RDMA_DRIVER_NES, + .alloc_mr = nes_alloc_mr, .alloc_mw = nes_alloc_mw, .alloc_pd = nes_alloc_pd, @@ -3722,7 +3724,6 @@ int nes_register_ofa_device(struct nes_ib_device *nesibdev) int ret; rdma_set_device_sysfs_group(&nesvnic->nesibdev->ibdev, &nes_attr_group); - nesvnic->nesibdev->ibdev.driver_id = RDMA_DRIVER_NES; ret = ib_register_device(&nesvnic->nesibdev->ibdev, "nes%d"); if (ret) { return ret; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index fc6c0962dea9..a9da4b857566 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -144,6 +144,8 @@ static const struct attribute_group ocrdma_attr_group = { }; static const struct ib_device_ops ocrdma_dev_ops = { + .driver_id = RDMA_DRIVER_OCRDMA, + .alloc_mr = ocrdma_alloc_mr, .alloc_pd = ocrdma_alloc_pd, .alloc_ucontext = ocrdma_alloc_ucontext, @@ -249,7 +251,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) ib_set_device_ops(&dev->ibdev, &ocrdma_dev_srq_ops); } rdma_set_device_sysfs_group(&dev->ibdev, &ocrdma_attr_group); - dev->ibdev.driver_id = RDMA_DRIVER_OCRDMA; ret = ib_device_set_netdev(&dev->ibdev, dev->nic_info.netdev, 1); if (ret) return ret; diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 083c2c00a8e9..737745231f8f 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -183,6 +183,8 @@ static void qedr_roce_register_device(struct qedr_dev *dev) } static const struct ib_device_ops qedr_dev_ops = { + .driver_id = RDMA_DRIVER_QEDR, + .alloc_mr = qedr_alloc_mr, .alloc_pd = qedr_alloc_pd, .alloc_ucontext = qedr_alloc_ucontext, @@ -274,7 +276,6 @@ static int qedr_register_device(struct qedr_dev *dev) rdma_set_device_sysfs_group(&dev->ibdev, &qedr_attr_group); ib_set_device_ops(&dev->ibdev, &qedr_dev_ops); - dev->ibdev.driver_id = RDMA_DRIVER_QEDR; rc = ib_device_set_netdev(&dev->ibdev, dev->ndev, 1); if (rc) return rc; diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 5ff32d32c61c..bbc331d7f49b 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1482,6 +1482,8 @@ static void qib_fill_device_attr(struct qib_devdata *dd) } static const struct ib_device_ops qib_dev_ops = { + .driver_id = RDMA_DRIVER_QIB, + .init_port = qib_create_port_files, 
.modify_device = qib_modify_device, .process_mad = qib_process_mad, @@ -1616,7 +1618,7 @@ int qib_register_ib_device(struct qib_devdata *dd) rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev, &qib_attr_group); ib_set_device_ops(ibdev, &qib_dev_ops); - ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_QIB); + ret = rvt_register_device(&dd->verbs_dev.rdi); if (ret) goto err_tx; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index d88d9f8a7f9a..47830334cb55 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -329,6 +329,8 @@ static void usnic_get_dev_fw_str(struct ib_device *device, char *str) } static const struct ib_device_ops usnic_dev_ops = { + .driver_id = RDMA_DRIVER_USNIC, + .alloc_pd = usnic_ib_alloc_pd, .alloc_ucontext = usnic_ib_alloc_ucontext, .create_cq = usnic_ib_create_cq, @@ -412,7 +414,6 @@ static void *usnic_ib_device_add(struct pci_dev *dev) ib_set_device_ops(&us_ibdev->ib_dev, &usnic_dev_ops); - us_ibdev->ib_dev.driver_id = RDMA_DRIVER_USNIC; rdma_set_device_sysfs_group(&us_ibdev->ib_dev, &usnic_attr_group); ret = ib_device_set_netdev(&us_ibdev->ib_dev, us_ibdev->netdev, 1); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index 40182297f87f..54a0b6372629 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -144,6 +144,8 @@ static int pvrdma_port_immutable(struct ib_device *ibdev, u8 port_num, } static const struct ib_device_ops pvrdma_dev_ops = { + .driver_id = RDMA_DRIVER_VMW_PVRDMA, + .add_gid = pvrdma_add_gid, .alloc_mr = pvrdma_alloc_mr, .alloc_pd = pvrdma_alloc_pd, @@ -261,7 +263,6 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) if (!dev->srq_tbl) goto err_qp_free; } - dev->ib_dev.driver_id = RDMA_DRIVER_VMW_PVRDMA; ret = ib_device_set_netdev(&dev->ib_dev, dev->netdev, 1); if (ret) return ret; diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 9546a837a8ac..60700e197e6c 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -530,7 +530,7 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb) * * Return: 0 on success otherwise an errno. 
*/ -int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) +int rvt_register_device(struct rvt_dev_info *rdi) { int ret = 0, i; @@ -636,7 +636,6 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) if (!rdi->ibdev.num_comp_vectors) rdi->ibdev.num_comp_vectors = 1; - rdi->ibdev.driver_id = driver_id; /* We are now good to announce we exist */ ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev)); if (ret) { diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 8c3e2a18cfe4..3d3130dc6380 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1111,6 +1111,8 @@ static int rxe_enable_driver(struct ib_device *ib_dev) } static const struct ib_device_ops rxe_dev_ops = { + .driver_id = RDMA_DRIVER_RXE, + .alloc_hw_stats = rxe_ib_alloc_hw_stats, .alloc_mr = rxe_alloc_mr, .alloc_pd = rxe_alloc_pd, @@ -1230,7 +1232,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) rxe->tfm = tfm; rdma_set_device_sysfs_group(dev, &rxe_attr_group); - dev->driver_id = RDMA_DRIVER_RXE; err = ib_register_device(dev, ibdev_name); if (err) pr_warn("%s failed with error %d\n", __func__, err); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index ec6446864b08..dacf2b5ad862 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2329,6 +2329,8 @@ struct iw_cm_conn_param; * need to define the supported operations, otherwise they will be set to null. */ struct ib_device_ops { + enum rdma_driver_id driver_id; + int (*post_send)(struct ib_qp *qp, const struct ib_send_wr *send_wr, const struct ib_send_wr **bad_send_wr); int (*post_recv)(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, @@ -2672,7 +2674,6 @@ struct ib_device { struct rdma_restrack_root *res; const struct uapi_definition *driver_def; - enum rdma_driver_id driver_id; /* * Positive refcount indicates that the device is currently diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index b9cd06db1a71..997f42678806 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -555,7 +555,7 @@ static inline u16 rvt_get_pkey(struct rvt_dev_info *rdi, struct rvt_dev_info *rvt_alloc_device(size_t size, int nports); void rvt_dealloc_device(struct rvt_dev_info *rdi); -int rvt_register_device(struct rvt_dev_info *rvd, u32 driver_id); +int rvt_register_device(struct rvt_dev_info *rvd); void rvt_unregister_device(struct rvt_dev_info *rvd); int rvt_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr); int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port, -- cgit v1.2.3 From 72c6ec18eb6161c8fc672ae96ec5c77df4d07405 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 5 Jun 2019 14:39:25 -0300 Subject: RDMA: Move uverbs_abi_ver into struct ib_device_ops No reason for every driver to emit code to set this, just make it part of the driver's existing static const ops structure. 
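For illustration, a hedged sketch of how a driver uses this after the patch; the mydrv_* names and the two MYDRV_* ABI constants are hypothetical, while the late override of dev->ops.uverbs_abi_ver mirrors what the mlx4 hunk below does when userspace_caps is clear.

/* Usual case: the ABI version is just another field of the static ops. */
static const struct ib_device_ops mydrv_dev_ops = {
        .uverbs_abi_ver = MYDRV_UVERBS_ABI_VERSION,     /* hypothetical constant */
        /* .driver_id, callbacks, ... */
};

static int mydrv_register(struct mydrv_dev *dev, bool legacy_userspace)
{
        ib_set_device_ops(&dev->ib_dev, &mydrv_dev_ops);

        /*
         * Per-device exception: the const ops only provide the default, so
         * a device that must report a different ABI (as mlx4 does for its
         * no-dev-caps case) patches the copy held in dev->ops before
         * registering.
         */
        if (legacy_userspace)
                dev->ib_dev.ops.uverbs_abi_ver =
                        MYDRV_UVERBS_LEGACY_ABI_VERSION;        /* hypothetical */

        return ib_register_device(&dev->ib_dev, "mydrv%d");
}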
Signed-off-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 2 ++ drivers/infiniband/core/uverbs_main.c | 2 +- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 6 +++--- drivers/infiniband/hw/bnxt_re/main.c | 2 +- drivers/infiniband/hw/cxgb3/iwch_provider.c | 2 +- drivers/infiniband/hw/cxgb4/provider.c | 2 +- drivers/infiniband/hw/efa/efa_main.c | 2 +- drivers/infiniband/hw/hns/hns_roce_main.c | 2 +- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 2 ++ drivers/infiniband/hw/mlx4/main.c | 15 ++++++++------- drivers/infiniband/hw/mlx5/main.c | 2 +- drivers/infiniband/hw/mthca/mthca_provider.c | 2 +- drivers/infiniband/hw/nes/nes_verbs.c | 2 ++ drivers/infiniband/hw/ocrdma/ocrdma_main.c | 2 +- drivers/infiniband/hw/qedr/main.c | 2 +- drivers/infiniband/hw/usnic/usnic_ib_main.c | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 2 +- drivers/infiniband/sw/rdmavt/vt.c | 3 ++- drivers/infiniband/sw/rxe/rxe_verbs.c | 2 +- include/rdma/ib_verbs.h | 2 +- 20 files changed, 33 insertions(+), 25 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 538d01f27bf8..a00b7fc360bf 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2323,6 +2323,8 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) dev_ops->driver_id != ops->driver_id); dev_ops->driver_id = ops->driver_id; } + if (ops->uverbs_abi_ver) + dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, advise_mr); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 84a5e9a6d483..0f8a286a92d3 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -1186,7 +1186,7 @@ static ssize_t abi_version_show(struct device *device, srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) - ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver); + ret = sprintf(buf, "%u\n", ib_dev->ops.uverbs_abi_ver); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); return ret; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 2c3685faa57a..8af8e1472101 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -3630,10 +3630,10 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) u32 chip_met_rev_num = 0; int rc; - dev_dbg(rdev_to_dev(rdev), "ABI version requested %d", - ibdev->uverbs_abi_ver); + dev_dbg(rdev_to_dev(rdev), "ABI version requested %u", + ibdev->ops.uverbs_abi_ver); - if (ibdev->uverbs_abi_ver != BNXT_RE_ABI_VERSION) { + if (ibdev->ops.uverbs_abi_ver != BNXT_RE_ABI_VERSION) { dev_dbg(rdev_to_dev(rdev), " is different from the device %d ", BNXT_RE_ABI_VERSION); return -EPERM; diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 1ef5f83ec914..a45cb9ee51ab 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -597,6 +597,7 @@ static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev) static const struct ib_device_ops bnxt_re_dev_ops = { .driver_id = RDMA_DRIVER_BNXT_RE, + .uverbs_abi_ver = BNXT_RE_ABI_VERSION, .add_gid = bnxt_re_add_gid, .alloc_hw_stats = bnxt_re_ib_alloc_hw_stats, @@ -663,7 +664,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev 
*rdev) ibdev->local_dma_lkey = BNXT_QPLIB_RSVD_LKEY; /* User space */ - ibdev->uverbs_abi_ver = BNXT_RE_ABI_VERSION; ibdev->uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 9a9527fdcb9a..5b410605a3a1 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1237,6 +1237,7 @@ static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str) static const struct ib_device_ops iwch_dev_ops = { .driver_id = RDMA_DRIVER_CXGB3, + .uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION, .alloc_hw_stats = iwch_alloc_stats, .alloc_mr = iwch_alloc_mr, @@ -1316,7 +1317,6 @@ int iwch_register_device(struct iwch_dev *dev) dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports; dev->ibdev.num_comp_vectors = 1; dev->ibdev.dev.parent = &dev->rdev.rnic_info.pdev->dev; - dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION; memcpy(dev->ibdev.iw_ifname, dev->rdev.t3cdev_p->lldev->name, sizeof(dev->ibdev.iw_ifname)); diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 74644afe25ab..c56cdfbd8a88 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -490,6 +490,7 @@ static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res) static const struct ib_device_ops c4iw_dev_ops = { .driver_id = RDMA_DRIVER_CXGB4, + .uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION, .alloc_hw_stats = c4iw_alloc_stats, .alloc_mr = c4iw_alloc_mr, @@ -595,7 +596,6 @@ void c4iw_register_device(struct work_struct *work) dev->ibdev.phys_port_cnt = dev->rdev.lldi.nports; dev->ibdev.num_comp_vectors = dev->rdev.lldi.nciq; dev->ibdev.dev.parent = &dev->rdev.lldi.pdev->dev; - dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION; memcpy(dev->ibdev.iw_ifname, dev->rdev.lldi.ports[0]->name, sizeof(dev->ibdev.iw_ifname)); diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index 3803dd4526b5..b05c5a0b9bc0 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -198,6 +198,7 @@ static void efa_stats_init(struct efa_dev *dev) static const struct ib_device_ops efa_dev_ops = { .driver_id = RDMA_DRIVER_EFA, + .uverbs_abi_ver = EFA_UVERBS_ABI_VERSION, .alloc_pd = efa_alloc_pd, .alloc_ucontext = efa_alloc_ucontext, @@ -266,7 +267,6 @@ static int efa_ib_device_add(struct efa_dev *dev) dev->ibdev.phys_port_cnt = 1; dev->ibdev.num_comp_vectors = 1; dev->ibdev.dev.parent = &pdev->dev; - dev->ibdev.uverbs_abi_ver = EFA_UVERBS_ABI_VERSION; dev->ibdev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index dd408f8afe72..e496b0628e25 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -415,6 +415,7 @@ static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) static const struct ib_device_ops hns_roce_dev_ops = { .driver_id = RDMA_DRIVER_HNS, + .uverbs_abi_ver = 1, .add_gid = hns_roce_add_gid, .alloc_pd = hns_roce_alloc_pd, @@ -489,7 +490,6 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_dev->phys_port_cnt = hr_dev->caps.num_ports; ib_dev->local_dma_lkey = hr_dev->caps.reserved_lkey; ib_dev->num_comp_vectors = hr_dev->caps.num_comp_vectors; - ib_dev->uverbs_abi_ver = 1; ib_dev->uverbs_cmd_mask = 
(1ULL << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ULL << IB_USER_VERBS_CMD_QUERY_DEVICE) | diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 1979cefdf90c..4dc647c8556b 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2651,6 +2651,8 @@ static int i40iw_query_pkey(struct ib_device *ibdev, static const struct ib_device_ops i40iw_dev_ops = { .driver_id = RDMA_DRIVER_I40IW, + /* NOTE: Older kernels wrongly use 0 for the uverbs_abi_ver */ + .uverbs_abi_ver = I40IW_ABI_VER, .alloc_hw_stats = i40iw_alloc_hw_stats, .alloc_mr = i40iw_alloc_mr, diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 03847e2f7835..1f87221acbb0 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1089,7 +1089,8 @@ static int mlx4_ib_alloc_ucontext(struct ib_ucontext *uctx, if (!dev->ib_active) return -EAGAIN; - if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { + if (ibdev->ops.uverbs_abi_ver == + MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { resp_v3.qp_tab_size = dev->dev->caps.num_qps; resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size; resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; @@ -1111,7 +1112,7 @@ static int mlx4_ib_alloc_ucontext(struct ib_ucontext *uctx, INIT_LIST_HEAD(&context->wqn_ranges_list); mutex_init(&context->wqn_ranges_mutex); - if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) + if (ibdev->ops.uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); else err = ib_copy_to_udata(udata, &resp, sizeof(resp)); @@ -2510,6 +2511,7 @@ static void get_fw_ver_str(struct ib_device *device, char *str) static const struct ib_device_ops mlx4_ib_dev_ops = { .driver_id = RDMA_DRIVER_MLX4, + .uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION, .add_gid = mlx4_ib_add_gid, .alloc_mr = mlx4_ib_alloc_mr, @@ -2653,11 +2655,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; ibdev->ib_dev.dev.parent = &dev->persist->pdev->dev; - if (dev->caps.userspace_caps) - ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; - else - ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; - ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | @@ -2731,6 +2728,10 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_fs_ops); } + if (!dev->caps.userspace_caps) + ibdev->ib_dev.ops.uverbs_abi_ver = + MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; + mlx4_ib_alloc_eqs(dev, ibdev); spin_lock_init(&iboe->lock); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index abd416a31e71..0c23eccb8855 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6125,6 +6125,7 @@ static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev) static const struct ib_device_ops mlx5_ib_dev_ops = { .driver_id = RDMA_DRIVER_MLX5, + .uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION, .add_gid = mlx5_ib_add_gid, .alloc_mr = mlx5_ib_alloc_mr, @@ -6223,7 +6224,6 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) struct mlx5_core_dev *mdev = dev->mdev; int err; - dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; dev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | diff 
--git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index d6467da39aab..690c65accea4 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1154,6 +1154,7 @@ static void get_dev_fw_str(struct ib_device *device, char *str) static const struct ib_device_ops mthca_dev_ops = { .driver_id = RDMA_DRIVER_MTHCA, + .uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION, .alloc_pd = mthca_alloc_pd, .alloc_ucontext = mthca_alloc_ucontext, @@ -1247,7 +1248,6 @@ int mthca_register_device(struct mthca_dev *dev) dev->ib_dev.owner = THIS_MODULE; - dev->ib_dev.uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION; dev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 3c85e2ef5a08..f1fdd6829a40 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3559,6 +3559,8 @@ static void get_dev_fw_str(struct ib_device *dev, char *str) static const struct ib_device_ops nes_dev_ops = { .driver_id = RDMA_DRIVER_NES, + /* NOTE: Older kernels wrongly use 0 for the uverbs_abi_ver */ + .uverbs_abi_ver = NES_ABI_USERSPACE_VER, .alloc_mr = nes_alloc_mr, .alloc_mw = nes_alloc_mw, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index a9da4b857566..ef823f1144b5 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -145,6 +145,7 @@ static const struct attribute_group ocrdma_attr_group = { static const struct ib_device_ops ocrdma_dev_ops = { .driver_id = RDMA_DRIVER_OCRDMA, + .uverbs_abi_ver = OCRDMA_ABI_VERSION, .alloc_mr = ocrdma_alloc_mr, .alloc_pd = ocrdma_alloc_pd, @@ -203,7 +204,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC, sizeof(OCRDMA_NODE_DESC)); dev->ibdev.owner = THIS_MODULE; - dev->ibdev.uverbs_abi_ver = OCRDMA_ABI_VERSION; dev->ibdev.uverbs_cmd_mask = OCRDMA_UVERBS(GET_CONTEXT) | OCRDMA_UVERBS(QUERY_DEVICE) | diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 737745231f8f..793e25776a7e 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -184,6 +184,7 @@ static void qedr_roce_register_device(struct qedr_dev *dev) static const struct ib_device_ops qedr_dev_ops = { .driver_id = RDMA_DRIVER_QEDR, + .uverbs_abi_ver = QEDR_ABI_VERSION, .alloc_mr = qedr_alloc_mr, .alloc_pd = qedr_alloc_pd, @@ -234,7 +235,6 @@ static int qedr_register_device(struct qedr_dev *dev) dev->ibdev.node_guid = dev->attr.node_guid; memcpy(dev->ibdev.node_desc, QEDR_NODE_DESC, sizeof(QEDR_NODE_DESC)); dev->ibdev.owner = THIS_MODULE; - dev->ibdev.uverbs_abi_ver = QEDR_ABI_VERSION; dev->ibdev.uverbs_cmd_mask = QEDR_UVERBS(GET_CONTEXT) | QEDR_UVERBS(QUERY_DEVICE) | diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index 47830334cb55..f61690816095 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -330,6 +330,7 @@ static void usnic_get_dev_fw_str(struct ib_device *device, char *str) static const struct ib_device_ops usnic_dev_ops = { .driver_id = RDMA_DRIVER_USNIC, + .uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION, .alloc_pd = usnic_ib_alloc_pd, .alloc_ucontext = usnic_ib_alloc_ucontext, @@ -391,7 +392,6 @@ static void 
*usnic_ib_device_add(struct pci_dev *dev) us_ibdev->ib_dev.phys_port_cnt = USNIC_IB_PORT_CNT; us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS; us_ibdev->ib_dev.dev.parent = &dev->dev; - us_ibdev->ib_dev.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION; us_ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index 54a0b6372629..2a7eb2838453 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -145,6 +145,7 @@ static int pvrdma_port_immutable(struct ib_device *ibdev, u8 port_num, static const struct ib_device_ops pvrdma_dev_ops = { .driver_id = RDMA_DRIVER_VMW_PVRDMA, + .uverbs_abi_ver = PVRDMA_UVERBS_ABI_VERSION, .add_gid = pvrdma_add_gid, .alloc_mr = pvrdma_alloc_mr, @@ -203,7 +204,6 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.num_comp_vectors = 1; dev->ib_dev.dev.parent = &dev->pdev->dev; - dev->ib_dev.uverbs_abi_ver = PVRDMA_UVERBS_ABI_VERSION; dev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 60700e197e6c..639ef8ac5400 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -382,6 +382,8 @@ enum { }; static const struct ib_device_ops rvt_dev_ops = { + .uverbs_abi_ver = RVT_UVERBS_ABI_VERSION, + .alloc_fmr = rvt_alloc_fmr, .alloc_mr = rvt_alloc_mr, .alloc_pd = rvt_alloc_pd, @@ -600,7 +602,6 @@ int rvt_register_device(struct rvt_dev_info *rdi) * exactly which functions rdmavt supports, nor do they know the ABI * version, so we do all of this sort of stuff here. 
*/ - rdi->ibdev.uverbs_abi_ver = RVT_UVERBS_ABI_VERSION; rdi->ibdev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 3d3130dc6380..9e87cdb82bec 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1112,6 +1112,7 @@ static int rxe_enable_driver(struct ib_device *ib_dev) static const struct ib_device_ops rxe_dev_ops = { .driver_id = RDMA_DRIVER_RXE, + .uverbs_abi_ver = RXE_UVERBS_ABI_VERSION, .alloc_hw_stats = rxe_ib_alloc_hw_stats, .alloc_mr = rxe_alloc_mr, @@ -1184,7 +1185,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) dma_coerce_mask_and_coherent(&dev->dev, dma_get_required_mask(&dev->dev)); - dev->uverbs_abi_ver = RXE_UVERBS_ABI_VERSION; dev->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index dacf2b5ad862..16405b9bca13 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2330,6 +2330,7 @@ struct iw_cm_conn_param; */ struct ib_device_ops { enum rdma_driver_id driver_id; + u32 uverbs_abi_ver; int (*post_send)(struct ib_qp *qp, const struct ib_send_wr *send_wr, const struct ib_send_wr **bad_send_wr); @@ -2650,7 +2651,6 @@ struct ib_device { */ const struct attribute_group *groups[3]; - int uverbs_abi_ver; u64 uverbs_cmd_mask; u64 uverbs_ex_cmd_mask; -- cgit v1.2.3 From 7a15414252ae4f1d450462d83f883b2d9d8036ee Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 5 Jun 2019 14:39:26 -0300 Subject: RDMA: Move owner into struct ib_device_ops This more closely follows how other subsytems work, with owner being a member of the structure containing the function pointers. 
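A short hedged sketch of the resulting usage; the mydrv_*/example_* names are hypothetical, and the try_module_get()/module_put() shape follows the uverbs hunks below.

/*
 * Driver side: the module owner now sits next to the callbacks it protects,
 * just like file_operations::owner elsewhere in the kernel.
 */
static const struct ib_device_ops mydrv_dev_ops = {
        .owner = THIS_MODULE,   /* was: ibdev->owner = THIS_MODULE */
        /* .driver_id, .uverbs_abi_ver, callbacks, ... */
};

/*
 * Core side (shape of the uverbs open/release paths): pin the providing
 * module through the ops structure while userspace still depends on it.
 */
static int example_pin_provider(struct ib_device *ib_dev)
{
        /* Only drivers that cannot disassociate a ucontext need pinning. */
        if (!ib_dev->ops.disassociate_ucontext &&
            !try_module_get(ib_dev->ops.owner))
                return -ENODEV;
        return 0;
}

static void example_unpin_provider(struct ib_device *ib_dev)
{
        if (!ib_dev->ops.disassociate_ucontext)
                module_put(ib_dev->ops.owner);
}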
Signed-off-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 4 ++++ drivers/infiniband/core/uverbs_main.c | 6 +++--- drivers/infiniband/hw/bnxt_re/main.c | 2 +- drivers/infiniband/hw/cxgb3/iwch_provider.c | 2 +- drivers/infiniband/hw/cxgb4/provider.c | 2 +- drivers/infiniband/hw/efa/efa_main.c | 2 +- drivers/infiniband/hw/hfi1/verbs.c | 2 +- drivers/infiniband/hw/hns/hns_roce_main.c | 2 +- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 2 +- drivers/infiniband/hw/mlx4/main.c | 2 +- drivers/infiniband/hw/mlx5/main.c | 2 +- drivers/infiniband/hw/mthca/mthca_provider.c | 3 +-- drivers/infiniband/hw/nes/nes_verbs.c | 2 +- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 2 +- drivers/infiniband/hw/qedr/main.c | 2 +- drivers/infiniband/hw/qib/qib_verbs.c | 2 +- drivers/infiniband/hw/usnic/usnic_ib_main.c | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 2 +- drivers/infiniband/sw/rxe/rxe_verbs.c | 2 +- include/rdma/ib_verbs.h | 2 +- 20 files changed, 25 insertions(+), 22 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index a00b7fc360bf..357d74c8df2b 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2323,6 +2323,10 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) dev_ops->driver_id != ops->driver_id); dev_ops->driver_id = ops->driver_id; } + if (ops->owner) { + WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); + dev_ops->owner = ops->owner; + } if (ops->uverbs_abi_ver) dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 0f8a286a92d3..870b3dd35aac 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -198,7 +198,7 @@ void ib_uverbs_release_file(struct kref *ref) ib_dev = srcu_dereference(file->device->ib_dev, &file->device->disassociate_srcu); if (ib_dev && !ib_dev->ops.disassociate_ucontext) - module_put(ib_dev->owner); + module_put(ib_dev->ops.owner); srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); if (atomic_dec_and_test(&file->device->refcount)) @@ -1065,7 +1065,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) module_dependent = !(ib_dev->ops.disassociate_ucontext); if (module_dependent) { - if (!try_module_get(ib_dev->owner)) { + if (!try_module_get(ib_dev->ops.owner)) { ret = -ENODEV; goto err; } @@ -1100,7 +1100,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) return stream_open(inode, filp); err_module: - module_put(ib_dev->owner); + module_put(ib_dev->ops.owner); err: mutex_unlock(&dev->lists_mutex); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index a45cb9ee51ab..351c420248a0 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -596,6 +596,7 @@ static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev) } static const struct ib_device_ops bnxt_re_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_BNXT_RE, .uverbs_abi_ver = BNXT_RE_ABI_VERSION, @@ -651,7 +652,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) int ret; /* ib device init */ - ibdev->owner = THIS_MODULE; ibdev->node_type = RDMA_NODE_IB_CA; strlcpy(ibdev->node_desc, BNXT_RE_DESC " HCA", strlen(BNXT_RE_DESC) + 5); diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c 
b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 5b410605a3a1..1b35941eee74 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1236,6 +1236,7 @@ static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str) } static const struct ib_device_ops iwch_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_CXGB3, .uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION, @@ -1285,7 +1286,6 @@ int iwch_register_device(struct iwch_dev *dev) pr_debug("%s iwch_dev %p\n", __func__, dev); memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); - dev->ibdev.owner = THIS_MODULE; dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW | IB_DEVICE_MEM_MGT_EXTENSIONS; diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index c56cdfbd8a88..2b1f2443b7da 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -489,6 +489,7 @@ static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res) } static const struct ib_device_ops c4iw_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_CXGB4, .uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION, @@ -563,7 +564,6 @@ void c4iw_register_device(struct work_struct *work) pr_debug("c4iw_dev %p\n", dev); memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); memcpy(&dev->ibdev.node_guid, dev->rdev.lldi.ports[0]->dev_addr, 6); - dev->ibdev.owner = THIS_MODULE; dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW; if (fastreg_support) dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index b05c5a0b9bc0..b891ee239a67 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -197,6 +197,7 @@ static void efa_stats_init(struct efa_dev *dev) } static const struct ib_device_ops efa_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_EFA, .uverbs_abi_ver = EFA_UVERBS_ABI_VERSION, @@ -262,7 +263,6 @@ static int efa_ib_device_add(struct efa_dev *dev) if (err) goto err_release_doorbell_bar; - dev->ibdev.owner = THIS_MODULE; dev->ibdev.node_type = RDMA_NODE_UNSPECIFIED; dev->ibdev.phys_port_cnt = 1; dev->ibdev.num_comp_vectors = 1; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index a97f4f9e5c6a..1f36db98240f 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1779,6 +1779,7 @@ static int get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, } static const struct ib_device_ops hfi1_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_HFI1, .alloc_hw_stats = alloc_hw_stats, @@ -1831,7 +1832,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) */ if (!ib_hfi1_sys_image_guid) ib_hfi1_sys_image_guid = ibdev->node_guid; - ibdev->owner = THIS_MODULE; ibdev->phys_port_cnt = dd->num_pports; ibdev->dev.parent = &dd->pcidev->dev; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index e496b0628e25..f07b2ec86ec2 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -414,6 +414,7 @@ static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) } static const struct ib_device_ops hns_roce_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_HNS, .uverbs_abi_ver = 1, 
@@ -483,7 +484,6 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_dev = &hr_dev->ib_dev; - ib_dev->owner = THIS_MODULE; ib_dev->node_type = RDMA_NODE_IB_CA; ib_dev->dev.parent = dev; diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 4dc647c8556b..bfe16e6f04f4 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2650,6 +2650,7 @@ static int i40iw_query_pkey(struct ib_device *ibdev, } static const struct ib_device_ops i40iw_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_I40IW, /* NOTE: Older kernels wrongly use 0 for the uverbs_abi_ver */ .uverbs_abi_ver = I40IW_ABI_VER, @@ -2711,7 +2712,6 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev i40iw_pr_err("iwdev == NULL\n"); return NULL; } - iwibdev->ibdev.owner = THIS_MODULE; iwdev->iwibdev = iwibdev; iwibdev->iwdev = iwdev; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 1f87221acbb0..5d7a87842291 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2510,6 +2510,7 @@ static void get_fw_ver_str(struct ib_device *device, char *str) } static const struct ib_device_ops mlx4_ib_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_MLX4, .uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION, @@ -2646,7 +2647,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->dev = dev; ibdev->bond_next_port = 0; - ibdev->ib_dev.owner = THIS_MODULE; ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; ibdev->num_ports = num_ports; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 0c23eccb8855..1e3d936ed809 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6044,7 +6044,6 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) if (mlx5_use_mad_ifc(dev)) get_ext_port_caps(dev); - dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; dev->ib_dev.phys_port_cnt = dev->num_ports; @@ -6124,6 +6123,7 @@ static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev) } static const struct ib_device_ops mlx5_ib_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_MLX5, .uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION, diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 690c65accea4..b128ff76f709 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1153,6 +1153,7 @@ static void get_dev_fw_str(struct ib_device *device, char *str) } static const struct ib_device_ops mthca_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_MTHCA, .uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION, @@ -1246,8 +1247,6 @@ int mthca_register_device(struct mthca_dev *dev) if (ret) return ret; - dev->ib_dev.owner = THIS_MODULE; - dev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index f1fdd6829a40..db044b2eaead 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3558,6 +3558,7 @@ static void get_dev_fw_str(struct ib_device *dev, char *str) } static const struct ib_device_ops nes_dev_ops = { + .owner = 
THIS_MODULE, .driver_id = RDMA_DRIVER_NES, /* NOTE: Older kernels wrongly use 0 for the uverbs_abi_ver */ .uverbs_abi_ver = NES_ABI_USERSPACE_VER, @@ -3617,7 +3618,6 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) if (nesibdev == NULL) { return NULL; } - nesibdev->ibdev.owner = THIS_MODULE; nesibdev->ibdev.node_type = RDMA_NODE_RNIC; memset(&nesibdev->ibdev.node_guid, 0, sizeof(nesibdev->ibdev.node_guid)); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index ef823f1144b5..b326313d413f 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -144,6 +144,7 @@ static const struct attribute_group ocrdma_attr_group = { }; static const struct ib_device_ops ocrdma_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_OCRDMA, .uverbs_abi_ver = OCRDMA_ABI_VERSION, @@ -203,7 +204,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) BUILD_BUG_ON(sizeof(OCRDMA_NODE_DESC) > IB_DEVICE_NODE_DESC_MAX); memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC, sizeof(OCRDMA_NODE_DESC)); - dev->ibdev.owner = THIS_MODULE; dev->ibdev.uverbs_cmd_mask = OCRDMA_UVERBS(GET_CONTEXT) | OCRDMA_UVERBS(QUERY_DEVICE) | diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 793e25776a7e..a0bb07ba0f3c 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -183,6 +183,7 @@ static void qedr_roce_register_device(struct qedr_dev *dev) } static const struct ib_device_ops qedr_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_QEDR, .uverbs_abi_ver = QEDR_ABI_VERSION, @@ -234,7 +235,6 @@ static int qedr_register_device(struct qedr_dev *dev) dev->ibdev.node_guid = dev->attr.node_guid; memcpy(dev->ibdev.node_desc, QEDR_NODE_DESC, sizeof(QEDR_NODE_DESC)); - dev->ibdev.owner = THIS_MODULE; dev->ibdev.uverbs_cmd_mask = QEDR_UVERBS(GET_CONTEXT) | QEDR_UVERBS(QUERY_DEVICE) | diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index bbc331d7f49b..54310fd6c7b6 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1482,6 +1482,7 @@ static void qib_fill_device_attr(struct qib_devdata *dd) } static const struct ib_device_ops qib_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_QIB, .init_port = qib_create_port_files, @@ -1547,7 +1548,6 @@ int qib_register_ib_device(struct qib_devdata *dd) if (!ib_qib_sys_image_guid) ib_qib_sys_image_guid = ppd->guid; - ibdev->owner = THIS_MODULE; ibdev->node_guid = ppd->guid; ibdev->phys_port_cnt = dd->num_pports; ibdev->dev.parent = &dd->pcidev->dev; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index f61690816095..e701322dc740 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -329,6 +329,7 @@ static void usnic_get_dev_fw_str(struct ib_device *device, char *str) } static const struct ib_device_ops usnic_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_USNIC, .uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION, @@ -387,7 +388,6 @@ static void *usnic_ib_device_add(struct pci_dev *dev) us_ibdev->pdev = dev; us_ibdev->netdev = pci_get_drvdata(dev); - us_ibdev->ib_dev.owner = THIS_MODULE; us_ibdev->ib_dev.node_type = RDMA_NODE_USNIC_UDP; us_ibdev->ib_dev.phys_port_cnt = USNIC_IB_PORT_CNT; us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS; diff --git 
a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index 2a7eb2838453..0c48464ffff1 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -144,6 +144,7 @@ static int pvrdma_port_immutable(struct ib_device *ibdev, u8 port_num, } static const struct ib_device_ops pvrdma_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_VMW_PVRDMA, .uverbs_abi_ver = PVRDMA_UVERBS_ABI_VERSION, @@ -201,7 +202,6 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) dev->ib_dev.node_guid = dev->dsr->caps.node_guid; dev->sys_image_guid = dev->dsr->caps.sys_image_guid; dev->flags = 0; - dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.num_comp_vectors = 1; dev->ib_dev.dev.parent = &dev->pdev->dev; dev->ib_dev.uverbs_cmd_mask = diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 9e87cdb82bec..046129393215 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1111,6 +1111,7 @@ static int rxe_enable_driver(struct ib_device *ib_dev) } static const struct ib_device_ops rxe_dev_ops = { + .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_RXE, .uverbs_abi_ver = RXE_UVERBS_ABI_VERSION, @@ -1173,7 +1174,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc)); - dev->owner = THIS_MODULE; dev->node_type = RDMA_NODE_IB_CA; dev->phys_port_cnt = 1; dev->num_comp_vectors = num_possible_cpus(); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 16405b9bca13..d1f16a6c4810 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2329,6 +2329,7 @@ struct iw_cm_conn_param; * need to define the supported operations, otherwise they will be set to null. */ struct ib_device_ops { + struct module *owner; enum rdma_driver_id driver_id; u32 uverbs_abi_ver; @@ -2639,7 +2640,6 @@ struct ib_device { int num_comp_vectors; - struct module *owner; union { struct device dev; struct ib_core_device coredev; -- cgit v1.2.3 From a52c8e2469c30cf7ac453d624aed9c168b23d1af Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 28 May 2019 14:37:28 +0300 Subject: RDMA: Clean destroy CQ in drivers do not return errors Like all other destroy commands, .destroy_cq() call is not supposed to fail. In all flows, the attempt to return earlier caused to memory leaks. This patch converts .destroy_cq() to do not return any errors. 
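To illustrate the conversion, a hedged sketch of a driver-side callback; the mydrv_* type and helpers are hypothetical, and the real per-driver changes are in the hunks below.

/*
 * Before this patch the callback returned an int:
 *      int mydrv_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 * but no caller could recover from a failure, so returning early only
 * leaked the CQ resources. After the patch the driver tears everything
 * down unconditionally and returns void.
 */
void mydrv_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
{
        struct mydrv_cq *cq = to_mydrv_cq(ibcq); /* hypothetical container_of() wrapper */

        mydrv_hw_destroy_cq(cq);        /* hypothetical; HW errors are logged, not returned */
        if (cq->umem)
                ib_umem_release(cq->umem);
        kfree(cq);
}

With no error left to forward, ib_destroy_cq_user() in core/verbs.c can call the op and simply return 0, as the hunk below shows.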
Signed-off-by: Leon Romanovsky Acked-by: Gal Pressman Acked-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/core/cq.c | 5 +-- drivers/infiniband/core/verbs.c | 3 +- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 13 ++----- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 2 +- drivers/infiniband/hw/cxgb3/cxio_hal.c | 6 ++-- drivers/infiniband/hw/cxgb3/cxio_hal.h | 2 +- drivers/infiniband/hw/cxgb3/iwch_provider.c | 3 +- drivers/infiniband/hw/cxgb4/cq.c | 13 +++---- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 2 +- drivers/infiniband/hw/efa/efa.h | 2 +- drivers/infiniband/hw/efa/efa_verbs.c | 9 ++--- drivers/infiniband/hw/hns/hns_roce_cq.c | 48 ++++++++++++------------- drivers/infiniband/hw/hns/hns_roce_device.h | 4 +-- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 14 ++------ drivers/infiniband/hw/i40iw/i40iw_verbs.c | 3 +- drivers/infiniband/hw/mlx4/cq.c | 4 +-- drivers/infiniband/hw/mlx4/mlx4_ib.h | 2 +- drivers/infiniband/hw/mlx5/cq.c | 4 +-- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- drivers/infiniband/hw/mthca/mthca_provider.c | 4 +-- drivers/infiniband/hw/nes/nes_utils.c | 4 +-- drivers/infiniband/hw/nes/nes_verbs.c | 30 +++++----------- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 8 ++--- drivers/infiniband/hw/ocrdma/ocrdma_hw.h | 2 +- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 6 ++-- drivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 2 +- drivers/infiniband/hw/qedr/verbs.c | 20 ++--------- drivers/infiniband/hw/qedr/verbs.h | 2 +- drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 4 +-- drivers/infiniband/hw/usnic/usnic_ib_verbs.h | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 6 +--- drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h | 2 +- drivers/infiniband/sw/rdmavt/cq.c | 6 +--- drivers/infiniband/sw/rdmavt/cq.h | 2 +- drivers/infiniband/sw/rxe/rxe_verbs.c | 3 +- include/rdma/ib_verbs.h | 2 +- 36 files changed, 81 insertions(+), 165 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index cb72aa4985a4..6ee62600a812 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -207,8 +207,6 @@ EXPORT_SYMBOL(__ib_alloc_cq_user); */ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata) { - int ret; - if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) return; @@ -228,7 +226,6 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata) kfree(cq->wc); rdma_restrack_del(&cq->res); - ret = cq->device->ops.destroy_cq(cq, udata); - WARN_ON_ONCE(ret); + cq->device->ops.destroy_cq(cq, udata); } EXPORT_SYMBOL(ib_free_cq_user); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 4fd5aad890d2..933bc35701ad 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1949,7 +1949,8 @@ int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata) return -EBUSY; rdma_restrack_del(&cq->res); - return cq->device->ops.destroy_cq(cq, udata); + cq->device->ops.destroy_cq(cq, udata); + return 0; } EXPORT_SYMBOL(ib_destroy_cq_user); diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 8af8e1472101..0127af45dcd1 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2517,9 +2517,8 @@ int bnxt_re_post_recv(struct ib_qp *ib_qp, const struct ib_recv_wr *wr, } /* Completion Queues */ -int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) +void bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) { - int rc; struct 
bnxt_re_cq *cq; struct bnxt_qplib_nq *nq; struct bnxt_re_dev *rdev; @@ -2528,20 +2527,14 @@ int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) rdev = cq->rdev; nq = cq->qplib_cq.nq; - rc = bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq); - if (rc) { - dev_err(rdev_to_dev(rdev), "Failed to destroy HW CQ"); - return rc; - } - if (!IS_ERR_OR_NULL(cq->umem)) + bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq); + if (!cq->umem) ib_umem_release(cq->umem); atomic_dec(&rdev->cq_count); nq->budget--; kfree(cq->cql); kfree(cq); - - return 0; } struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index 09a33049e42f..828403ee0104 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -193,7 +193,7 @@ int bnxt_re_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); -int bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); +void bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc); int bnxt_re_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *pd, int mr_access_flags); diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index d9c741fea0e9..37ee93824349 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -303,17 +303,15 @@ err1: return -ENOMEM; } -int cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) +void cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) { - int err; - err = cxio_hal_clear_cq_ctx(rdev_p, cq->cqid); + cxio_hal_clear_cq_ctx(rdev_p, cq->cqid); kfree(cq->sw_queue); dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), (1UL << (cq->size_log2)) * sizeof(struct t3_cqe) + 1, cq->queue, dma_unmap_addr(cq, mapping)); cxio_hal_put_cqid(rdev_p->rscp, cq->cqid); - return err; } int cxio_destroy_qp(struct cxio_rdev *rdev_p, struct t3_wq *wq, diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h index 5fc26a4648d3..40c029ffa425 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.h +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h @@ -158,7 +158,7 @@ void cxio_rdev_close(struct cxio_rdev *rdev); int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq, enum t3_cq_opcode op, u32 credit); int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq, int kernel); -int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq); +void cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq); void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx); int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq, diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 1b35941eee74..5bde4ae93681 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -88,7 +88,7 @@ static int iwch_alloc_ucontext(struct ib_ucontext *ucontext, return 0; } -static int iwch_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) +static void iwch_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) { struct iwch_cq *chp; @@ 
-101,7 +101,6 @@ static int iwch_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); kfree(chp); - return 0; } static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 6557e7c5af66..f49e6d271c42 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -34,14 +34,13 @@ #include "iw_cxgb4.h" -static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, - struct c4iw_dev_ucontext *uctx, struct sk_buff *skb, - struct c4iw_wr_wait *wr_waitp) +static void destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, + struct c4iw_dev_ucontext *uctx, struct sk_buff *skb, + struct c4iw_wr_wait *wr_waitp) { struct fw_ri_res_wr *res_wr; struct fw_ri_res *res; int wr_len; - int ret; wr_len = sizeof(*res_wr) + sizeof(*res); set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); @@ -59,14 +58,13 @@ static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, res->u.cq.iqid = cpu_to_be32(cq->cqid); c4iw_init_wr_wait(wr_waitp); - ret = c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, 0, __func__); + c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, 0, __func__); kfree(cq->sw_queue); dma_free_coherent(&(rdev->lldi.pdev->dev), cq->memsize, cq->queue, dma_unmap_addr(cq, mapping)); c4iw_put_cqid(rdev, cq->cqid, uctx); - return ret; } static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, @@ -970,7 +968,7 @@ int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) return !err || err == -ENODATA ? npolled : err; } -int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) +void c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) { struct c4iw_cq *chp; struct c4iw_ucontext *ucontext; @@ -989,7 +987,6 @@ int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) chp->destroy_skb, chp->wr_waitp); c4iw_put_wr_wait(chp->wr_waitp); kfree(chp); - return 0; } struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index cf7512b2c4c0..45e720288f0f 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -992,7 +992,7 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, struct ib_udata *udata); struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc); int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata); -int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata); +void c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata); struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h index 14a36546985b..52d894f0ad3e 100644 --- a/drivers/infiniband/hw/efa/efa.h +++ b/drivers/infiniband/hw/efa/efa.h @@ -134,7 +134,7 @@ int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); struct ib_qp *efa_create_qp(struct ib_pd *ibpd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); -int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); struct ib_cq *efa_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 607aff869200..42865cf4f149 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ 
-847,25 +847,20 @@ static int efa_destroy_cq_idx(struct efa_dev *dev, int cq_idx) return efa_com_destroy_cq(&dev->edev, ¶ms); } -int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct efa_dev *dev = to_edev(ibcq->device); struct efa_cq *cq = to_ecq(ibcq); - int err; ibdev_dbg(&dev->ibdev, "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n", cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr); - err = efa_destroy_cq_idx(dev, cq->cq_idx); - if (err) - return err; - + efa_destroy_cq_idx(dev, cq->cq_idx); dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size, DMA_FROM_DEVICE); kfree(cq); - return 0; } static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 6e81ff3f1813..0eb7c16c007b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -443,40 +443,36 @@ err_cq: } EXPORT_SYMBOL_GPL(hns_roce_ib_create_cq); -int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) +void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device); struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq); - int ret = 0; if (hr_dev->hw->destroy_cq) { - ret = hr_dev->hw->destroy_cq(ib_cq, udata); - } else { - hns_roce_free_cq(hr_dev, hr_cq); - hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt); - - if (udata) { - ib_umem_release(hr_cq->umem); - - if (hr_cq->db_en == 1) - hns_roce_db_unmap_user( - rdma_udata_to_drv_context( - udata, - struct hns_roce_ucontext, - ibucontext), - &hr_cq->db); - } else { - /* Free the buff of stored cq */ - hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf, - ib_cq->cqe); - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) - hns_roce_free_db(hr_dev, &hr_cq->db); - } + hr_dev->hw->destroy_cq(ib_cq, udata); + return; + } + + hns_roce_free_cq(hr_dev, hr_cq); + hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt); - kfree(hr_cq); + if (udata) { + ib_umem_release(hr_cq->umem); + + if (hr_cq->db_en == 1) + hns_roce_db_unmap_user(rdma_udata_to_drv_context( + udata, + struct hns_roce_ucontext, + ibucontext), + &hr_cq->db); + } else { + /* Free the buff of stored cq */ + hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf, ib_cq->cqe); + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) + hns_roce_free_db(hr_dev, &hr_cq->db); } - return ret; + kfree(hr_cq); } EXPORT_SYMBOL_GPL(hns_roce_ib_destroy_cq); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index ce23338831eb..2f7d9644fd24 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -938,7 +938,7 @@ struct hns_roce_hw { int (*poll_cq)(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int (*dereg_mr)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr, struct ib_udata *udata); - int (*destroy_cq)(struct ib_cq *ibcq, struct ib_udata *udata); + void (*destroy_cq)(struct ib_cq *ibcq, struct ib_udata *udata); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); int (*init_eq)(struct hns_roce_dev *hr_dev); void (*cleanup_eq)(struct hns_roce_dev *hr_dev); @@ -1209,7 +1209,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); -int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata); +void hns_roce_ib_destroy_cq(struct 
ib_cq *ib_cq, struct ib_udata *udata); void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq); int hns_roce_db_map_user(struct hns_roce_ucontext *context, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 878c8ae35630..aa7b67d283af 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -865,8 +865,7 @@ alloc_pd_failed: kfree(pd); alloc_mem_failed: - if (hns_roce_ib_destroy_cq(cq, NULL)) - dev_err(dev, "Destroy cq for create_lp_qp failed!\n"); + hns_roce_ib_destroy_cq(cq, NULL); return ret; } @@ -894,10 +893,7 @@ static void hns_roce_v1_release_lp_qp(struct hns_roce_dev *hr_dev) i, ret); } - ret = hns_roce_ib_destroy_cq(&free_mr->mr_free_cq->ib_cq, NULL); - if (ret) - dev_err(dev, "Destroy cq for mr_free failed(%d)!\n", ret); - + hns_roce_ib_destroy_cq(&free_mr->mr_free_cq->ib_cq, NULL); hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd, NULL); } @@ -3654,7 +3650,7 @@ int hns_roce_v1_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) return 0; } -static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +static void hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibcq->device); struct hns_roce_cq *hr_cq = to_hr_cq(ibcq); @@ -3663,7 +3659,6 @@ static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) u32 cqe_cnt_cur; u32 cq_buf_size; int wait_time = 0; - int ret = 0; hns_roce_free_cq(hr_dev, hr_cq); @@ -3685,7 +3680,6 @@ static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) if (wait_time > HNS_ROCE_MAX_FREE_CQ_WAIT_CNT) { dev_warn(dev, "Destroy cq 0x%lx timeout!\n", hr_cq->cqn); - ret = -ETIMEDOUT; break; } wait_time++; @@ -3702,8 +3696,6 @@ static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) } kfree(hr_cq); - - return ret; } static void set_eq_cons_index_v1(struct hns_roce_eq *eq, int req_not) diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index bfe16e6f04f4..205053cb5f97 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -1064,7 +1064,7 @@ void i40iw_cq_wq_destroy(struct i40iw_device *iwdev, struct i40iw_sc_cq *cq) * @ib_cq: cq pointer * @udata: user data or NULL for kernel object */ -static int i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) +static void i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) { struct i40iw_cq *iwcq; struct i40iw_device *iwdev; @@ -1077,7 +1077,6 @@ static int i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) cq_free_resources(iwdev, iwcq); kfree(iwcq); i40iw_rem_devusecount(iwdev); - return 0; } /** diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 022a0b4ea452..8eb7490dabb8 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -486,7 +486,7 @@ out: return err; } -int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) +void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(cq->device); struct mlx4_ib_cq *mcq = to_mcq(cq); @@ -508,8 +508,6 @@ int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) } kfree(mcq); - - return 0; } static void dump_cqe(void *cqe) diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 26897102057d..af5ee45a9f19 100644 --- 
a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -746,7 +746,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); -int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); +void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 2e2e65f00257..ebd01bd7f8f6 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -998,7 +998,7 @@ err_create: return ERR_PTR(err); } -int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) +void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(cq->device); struct mlx5_ib_cq *mcq = to_mcq(cq); @@ -1010,8 +1010,6 @@ int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) destroy_cq_kernel(dev, mcq); kfree(mcq); - - return 0; } static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 40eb8be482e4..bf697c5b3e79 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1118,7 +1118,7 @@ int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); -int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); +void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index b128ff76f709..81fc04e1c142 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -804,7 +804,7 @@ out: return ret; } -static int mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) +static void mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) { if (udata) { struct mthca_ucontext *context = @@ -824,8 +824,6 @@ static int mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) } mthca_free_cq(to_mdev(cq->device), to_mcq(cq)); kfree(cq); - - return 0; } static inline u32 convert_access(int acc) diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c index 90f28890246d..e976292fc6c0 100644 --- a/drivers/infiniband/hw/nes/nes_utils.c +++ b/drivers/infiniband/hw/nes/nes_utils.c @@ -588,9 +588,7 @@ struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev) cqp_request->callback = 0; nes_debug(NES_DBG_CQP, "Got cqp request %p from the available list \n", cqp_request); - } else - printk(KERN_ERR PFX "%s: Could not allocated a CQP request.\n", - __func__); + } return cqp_request; } diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 7677f1f734bb..cac3fa624c4d 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c 
@@ -1634,7 +1634,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, /** * nes_destroy_cq */ -static int nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) +static void nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) { struct nes_cq *nescq; struct nes_device *nesdev; @@ -1644,7 +1644,6 @@ static int nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) struct nes_cqp_request cqp_request = {}; unsigned long flags; u32 opcode = 0; - int ret; nescq = to_nescq(ib_cq); nesvnic = to_nesvnic(ib_cq->device); @@ -1656,6 +1655,7 @@ static int nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) /* Send DestroyCQ request to CQP */ INIT_LIST_HEAD(&cqp_request.list); init_waitqueue_head(&cqp_request.waitq); + cqp_request.waiting = 1; cqp_wqe = &cqp_request.cqp_wqe; opcode = NES_CQP_DESTROY_CQ | (nescq->hw_cq.cq_size << 16); @@ -1689,30 +1689,18 @@ static int nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) /* Wait for CQP */ nes_debug(NES_DBG_CQ, "Waiting for destroy iWARP CQ%u to complete.\n", nescq->hw_cq.cq_number); - ret = wait_event_timeout(cqp_request.waitq, cqp_request.request_done, - NES_EVENT_TIMEOUT); - nes_debug(NES_DBG_CQ, "Destroy iWARP CQ%u completed, wait_event_timeout ret = %u," - " CQP Major:Minor codes = 0x%04X:0x%04X.\n", - nescq->hw_cq.cq_number, ret, cqp_request.major_code, - cqp_request.minor_code); - if (!ret) { - nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy timeout expired\n", - nescq->hw_cq.cq_number); - ret = -ETIME; - } else if (cqp_request.major_code) { - nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy failed\n", - nescq->hw_cq.cq_number); - ret = -EIO; - } else { - ret = 0; - } + wait_event_timeout(cqp_request.waitq, cqp_request.request_done, + NES_EVENT_TIMEOUT); + nes_debug( + NES_DBG_CQ, + "Destroy iWARP CQ%u completed CQP Major:Minor codes = 0x%04X:0x%04X.\n", + nescq->hw_cq.cq_number, cqp_request.major_code, + cqp_request.minor_code); if (nescq->cq_mem_size) pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, nescq->hw_cq.cq_vbase, nescq->hw_cq.cq_pbase); kfree(nescq); - - return ret; } /** diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 5127e2ea4bdd..b2dd4e0a4be2 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -1888,14 +1888,13 @@ mem_err: return status; } -int ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq) +void ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq) { - int status = -ENOMEM; struct ocrdma_destroy_cq *cmd; cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_CQ, sizeof(*cmd)); if (!cmd) - return status; + return; ocrdma_init_mch(&cmd->req, OCRDMA_CMD_DELETE_CQ, OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); @@ -1903,11 +1902,10 @@ int ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq) (cq->id << OCRDMA_DESTROY_CQ_QID_SHIFT) & OCRDMA_DESTROY_CQ_QID_MASK; - status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); + ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); ocrdma_unbind_eq(dev, cq->eqn); dma_free_coherent(&dev->nic_info.pdev->dev, cq->len, cq->va, cq->pa); kfree(cmd); - return status; } int ocrdma_mbx_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_hw_mr *hwmr, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h index 06ec59326a90..12c23a7652b9 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h @@ -122,7 +122,7 @@ int ocrdma_reg_mr(struct ocrdma_dev *, struct 
ocrdma_hw_mr *hwmr, u32 pd_id, int acc); int ocrdma_mbx_create_cq(struct ocrdma_dev *, struct ocrdma_cq *, int entries, int dpp_cq, u16 pd_id); -int ocrdma_mbx_destroy_cq(struct ocrdma_dev *, struct ocrdma_cq *); +void ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq); int ocrdma_mbx_create_qp(struct ocrdma_qp *, struct ib_qp_init_attr *attrs, u8 enable_dpp_cq, u16 dpp_cq_id, u16 *dpp_offset, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 35ec87015792..94e4f7f9b1f7 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -1070,7 +1070,7 @@ static void ocrdma_flush_cq(struct ocrdma_cq *cq) spin_unlock_irqrestore(&cq->cq_lock, flags); } -int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +void ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); struct ocrdma_eq *eq = NULL; @@ -1080,14 +1080,13 @@ int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) dev->cq_tbl[cq->id] = NULL; indx = ocrdma_get_eq_table_index(dev, cq->eqn); - BUG_ON(indx == -EINVAL); eq = &dev->eq_tbl[indx]; irq = ocrdma_get_irq(dev, eq); synchronize_irq(irq); ocrdma_flush_cq(cq); - (void)ocrdma_mbx_destroy_cq(dev, cq); + ocrdma_mbx_destroy_cq(dev, cq); if (cq->ucontext) { pdid = cq->ucontext->cntxt_pd->id; ocrdma_del_mmap(cq->ucontext, (u64) cq->pa, @@ -1098,7 +1097,6 @@ int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) } kfree(cq); - return 0; } static int ocrdma_add_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index d76aae7ed0d3..89cebe05669e 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -75,7 +75,7 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); int ocrdma_resize_cq(struct ib_cq *, int cqe, struct ib_udata *); -int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +void ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); struct ib_qp *ocrdma_create_qp(struct ib_pd *, struct ib_qp_init_attr *attrs, diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 3c0dba072071..be29bbbc4b14 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -955,14 +955,13 @@ int qedr_resize_cq(struct ib_cq *ibcq, int new_cnt, struct ib_udata *udata) #define QEDR_DESTROY_CQ_MAX_ITERATIONS (10) #define QEDR_DESTROY_CQ_ITER_DURATION (10) -int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct qedr_dev *dev = get_qedr_dev(ibcq->device); struct qed_rdma_destroy_cq_out_params oparams; struct qed_rdma_destroy_cq_in_params iparams; struct qedr_cq *cq = get_qedr_cq(ibcq); int iter; - int rc; DP_DEBUG(dev, QEDR_MSG_CQ, "destroy cq %p (icid=%d)\n", cq, cq->icid); @@ -973,10 +972,7 @@ int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) goto done; iparams.icid = cq->icid; - rc = dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams); - if (rc) - return rc; - + dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams); dev->ops->common->chain_free(dev->cdev, &cq->pbl); if (udata) { @@ -1007,9 +1003,6 @@ int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) iter--; } - if 
(oparams.num_cq_notif != cq->cnq_notif) - goto err; - /* Note that we don't need to have explicit code to wait for the * completion of the event handler because it is invoked from the EQ. * Since the destroy CQ ramrod has also been received on the EQ we can @@ -1019,15 +1012,6 @@ done: cq->sig = ~cq->sig; kfree(cq); - - return 0; - -err: - DP_ERR(dev, - "CQ %p (icid=%d) not freed, expecting %d ints but got %d ints\n", - cq, cq->icid, oparams.num_cq_notif, cq->cnq_notif); - - return -EINVAL; } static inline int get_gid_info_from_table(struct ib_qp *ibqp, diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h index 9328c80375ef..32d7ce77e339 100644 --- a/drivers/infiniband/hw/qedr/verbs.h +++ b/drivers/infiniband/hw/qedr/verbs.h @@ -54,7 +54,7 @@ struct ib_cq *qedr_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); int qedr_resize_cq(struct ib_cq *, int cqe, struct ib_udata *); -int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); struct ib_qp *qedr_create_qp(struct ib_pd *, struct ib_qp_init_attr *attrs, struct ib_udata *); diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index e9352750e029..5686d14b86fe 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -604,11 +604,9 @@ struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, return cq; } -int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) +void usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) { - usnic_dbg("\n"); kfree(cq); - return 0; } struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h index 028f322f8e9b..0b9d993433a7 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -61,7 +61,7 @@ int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); -int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); +void usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index d7deb19a2800..0682781f6555 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -246,10 +246,8 @@ static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq) * pvrdma_destroy_cq - destroy completion queue * @cq: the completion queue to destroy. * @udata: user data or null for kernel object - * - * @return: 0 for success. 
*/ -int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) +void pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) { struct pvrdma_cq *vcq = to_vcq(cq); union pvrdma_cmd_req req; @@ -275,8 +273,6 @@ int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) pvrdma_free_cq(dev, vcq); atomic_dec(&dev->num_cqs); - - return ret; } static inline struct pvrdma_cqe *get_cqe(struct pvrdma_cq *cq, int i) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h index 9d7b021e1c59..f0dd6e4d058b 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h @@ -412,7 +412,7 @@ int pvrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); -int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); +void pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int pvrdma_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); int pvrdma_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags, diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index a06e6da7a026..8e76036fad4a 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -300,10 +300,8 @@ done: * @udata: user data or NULL for kernel object * * Called by ib_destroy_cq() in the generic verbs code. - * - * Return: always 0 */ -int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); struct rvt_dev_info *rdi = cq->rdi; @@ -317,8 +315,6 @@ int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) else vfree(cq->queue); kfree(cq); - - return 0; } /** diff --git a/drivers/infiniband/sw/rdmavt/cq.h b/drivers/infiniband/sw/rdmavt/cq.h index 3ad6faf18ecb..495d8c3e6580 100644 --- a/drivers/infiniband/sw/rdmavt/cq.h +++ b/drivers/infiniband/sw/rdmavt/cq.h @@ -54,7 +54,7 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_udata *udata); -int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 046129393215..b14881decbee 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -819,14 +819,13 @@ err1: return ERR_PTR(err); } -static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +static void rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct rxe_cq *cq = to_rcq(ibcq); rxe_cq_disable(cq); rxe_drop_ref(cq); - return 0; } static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index d1f16a6c4810..bc1d94c9c9ba 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2462,7 +2462,7 @@ struct ib_device_ops { const struct ib_cq_init_attr *attr, struct ib_udata *udata); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 
cq_period); - int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata); + void (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); struct ib_mr *(*get_dma_mr)(struct ib_pd *pd, int mr_access_flags); struct ib_mr *(*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length, -- cgit v1.2.3 From e39afe3d6dbd908d8fd189571a3c1561088a86c2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 28 May 2019 14:37:29 +0300 Subject: RDMA: Convert CQ allocations to be under core responsibility Ensure that CQ is allocated and freed by IB/core and not by drivers. Signed-off-by: Leon Romanovsky Acked-by: Gal Pressman Reviewed-by: Dennis Dalessandro Tested-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/core/cq.c | 28 +++++++----- drivers/infiniband/core/device.c | 1 + drivers/infiniband/core/uverbs_cmd.c | 15 ++++--- drivers/infiniband/core/uverbs_std_types_cq.c | 19 +++++--- drivers/infiniband/core/verbs.c | 32 ++++++++----- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 20 +++------ drivers/infiniband/hw/bnxt_re/ib_verbs.h | 7 ++- drivers/infiniband/hw/bnxt_re/main.c | 1 + drivers/infiniband/hw/cxgb3/iwch_provider.c | 43 ++++++++---------- drivers/infiniband/hw/cxgb4/cq.c | 27 +++++------ drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 5 +-- drivers/infiniband/hw/cxgb4/provider.c | 1 + drivers/infiniband/hw/efa/efa.h | 5 +-- drivers/infiniband/hw/efa/efa_main.c | 1 + drivers/infiniband/hw/efa/efa_verbs.c | 48 ++++++-------------- drivers/infiniband/hw/hns/hns_roce_cq.c | 23 ++++------ drivers/infiniband/hw/hns/hns_roce_device.h | 6 +-- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 21 +++++---- drivers/infiniband/hw/hns/hns_roce_main.c | 1 + drivers/infiniband/hw/i40iw/i40iw_verbs.c | 33 ++++++-------- drivers/infiniband/hw/mlx4/cq.c | 25 ++++------- drivers/infiniband/hw/mlx4/main.c | 1 + drivers/infiniband/hw/mlx4/mlx4_ib.h | 5 +-- drivers/infiniband/hw/mlx5/cq.c | 32 +++++-------- drivers/infiniband/hw/mlx5/main.c | 21 +++++---- drivers/infiniband/hw/mlx5/mlx5_ib.h | 5 +-- drivers/infiniband/hw/mthca/mthca_provider.c | 36 +++++++-------- drivers/infiniband/hw/nes/nes_verbs.c | 60 +++++++++---------------- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 1 + drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 29 +++++------- drivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 5 +-- drivers/infiniband/hw/qedr/main.c | 1 + drivers/infiniband/hw/qedr/verbs.c | 28 ++++-------- drivers/infiniband/hw/qedr/verbs.h | 5 +-- drivers/infiniband/hw/usnic/usnic_ib.h | 4 ++ drivers/infiniband/hw/usnic/usnic_ib_main.c | 1 + drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 18 +++----- drivers/infiniband/hw/usnic/usnic_ib_verbs.h | 5 +-- drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 34 +++++--------- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 1 + drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h | 5 +-- drivers/infiniband/sw/rdmavt/cq.c | 51 +++++++-------------- drivers/infiniband/sw/rdmavt/cq.h | 5 +-- drivers/infiniband/sw/rdmavt/vt.c | 1 + drivers/infiniband/sw/rxe/rxe_pool.c | 1 + drivers/infiniband/sw/rxe/rxe_verbs.c | 30 +++++-------- drivers/infiniband/sw/rxe/rxe_verbs.h | 2 +- include/rdma/ib_verbs.h | 6 +-- 48 files changed, 318 insertions(+), 437 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index 6ee62600a812..3b9412c69565 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -147,23 +147,26 @@ struct ib_cq 
*__ib_alloc_cq_user(struct ib_device *dev, void *private, struct ib_cq *cq; int ret = -ENOMEM; - cq = dev->ops.create_cq(dev, &cq_attr, NULL); - if (IS_ERR(cq)) - return cq; + cq = rdma_zalloc_drv_obj(dev, ib_cq); + if (!cq) + return ERR_PTR(ret); cq->device = dev; - cq->uobject = NULL; - cq->event_handler = NULL; cq->cq_context = private; cq->poll_ctx = poll_ctx; atomic_set(&cq->usecnt, 0); cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL); if (!cq->wc) - goto out_destroy_cq; + goto out_free_cq; cq->res.type = RDMA_RESTRACK_CQ; rdma_restrack_set_task(&cq->res, caller); + + ret = dev->ops.create_cq(cq, &cq_attr, NULL); + if (ret) + goto out_free_wc; + rdma_restrack_kadd(&cq->res); switch (cq->poll_ctx) { @@ -186,16 +189,18 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private, break; default: ret = -EINVAL; - goto out_free_wc; + goto out_destroy_cq; } return cq; -out_free_wc: - kfree(cq->wc); - rdma_restrack_del(&cq->res); out_destroy_cq: + rdma_restrack_del(&cq->res); cq->device->ops.destroy_cq(cq, udata); +out_free_wc: + kfree(cq->wc); +out_free_cq: + kfree(cq); return ERR_PTR(ret); } EXPORT_SYMBOL(__ib_alloc_cq_user); @@ -224,8 +229,9 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata) WARN_ON_ONCE(1); } - kfree(cq->wc); rdma_restrack_del(&cq->res); cq->device->ops.destroy_cq(cq, udata); + kfree(cq->wc); + kfree(cq); } EXPORT_SYMBOL(ib_free_cq_user); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 357d74c8df2b..abb169f31d0f 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2431,6 +2431,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, unmap_fmr); SET_OBJ_SIZE(dev_ops, ib_ah); + SET_OBJ_SIZE(dev_ops, ib_cq); SET_OBJ_SIZE(dev_ops, ib_pd); SET_OBJ_SIZE(dev_ops, ib_srq); SET_OBJ_SIZE(dev_ops, ib_ucontext); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 5a3a1780ceea..5c00d9a5698a 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1010,12 +1010,11 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, attr.comp_vector = cmd->comp_vector; attr.flags = cmd->flags; - cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata); - if (IS_ERR(cq)) { - ret = PTR_ERR(cq); + cq = rdma_zalloc_drv_obj(ib_dev, ib_cq); + if (!cq) { + ret = -ENOMEM; goto err_file; } - cq->device = ib_dev; cq->uobject = &obj->uobject; cq->comp_handler = ib_uverbs_comp_handler; @@ -1023,6 +1022,10 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, cq->cq_context = ev_file ? 
&ev_file->ev_queue : NULL; atomic_set(&cq->usecnt, 0); + ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata); + if (ret) + goto err_free; + obj->uobject.object = cq; memset(&resp, 0, sizeof resp); resp.base.cq_handle = obj->uobject.id; @@ -1043,7 +1046,9 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, err_cb: ib_destroy_cq(cq); - + cq = NULL; +err_free: + kfree(cq); err_file: if (ev_file) ib_uverbs_release_ucq(attrs->ufile, ev_file, obj); diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index db5c46a1bb2d..06b8c7d017b7 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -111,9 +111,9 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( INIT_LIST_HEAD(&obj->comp_list); INIT_LIST_HEAD(&obj->async_list); - cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata); - if (IS_ERR(cq)) { - ret = PTR_ERR(cq); + cq = rdma_zalloc_drv_obj(ib_dev, ib_cq); + if (!cq) { + ret = -ENOMEM; goto err_event_file; } @@ -122,10 +122,15 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( cq->comp_handler = ib_uverbs_comp_handler; cq->event_handler = ib_uverbs_cq_event_handler; cq->cq_context = ev_file ? &ev_file->ev_queue : NULL; - obj->uobject.object = cq; - obj->uobject.user_handle = user_handle; atomic_set(&cq->usecnt, 0); cq->res.type = RDMA_RESTRACK_CQ; + + ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata); + if (ret) + goto err_free; + + obj->uobject.object = cq; + obj->uobject.user_handle = user_handle; rdma_restrack_uadd(&cq->res); ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe, @@ -136,7 +141,9 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( return 0; err_cq: ib_destroy_cq(cq); - + cq = NULL; +err_free: + kfree(cq); err_event_file: if (ev_file) uverbs_uobject_put(ev_file_uobj); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 933bc35701ad..585e100706aa 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1916,21 +1916,28 @@ struct ib_cq *__ib_create_cq(struct ib_device *device, const char *caller) { struct ib_cq *cq; + int ret; + + cq = rdma_zalloc_drv_obj(device, ib_cq); + if (!cq) + return ERR_PTR(-ENOMEM); - cq = device->ops.create_cq(device, cq_attr, NULL); - - if (!IS_ERR(cq)) { - cq->device = device; - cq->uobject = NULL; - cq->comp_handler = comp_handler; - cq->event_handler = event_handler; - cq->cq_context = cq_context; - atomic_set(&cq->usecnt, 0); - cq->res.type = RDMA_RESTRACK_CQ; - rdma_restrack_set_task(&cq->res, caller); - rdma_restrack_kadd(&cq->res); + cq->device = device; + cq->uobject = NULL; + cq->comp_handler = comp_handler; + cq->event_handler = event_handler; + cq->cq_context = cq_context; + atomic_set(&cq->usecnt, 0); + cq->res.type = RDMA_RESTRACK_CQ; + rdma_restrack_set_task(&cq->res, caller); + + ret = device->ops.create_cq(cq, cq_attr, NULL); + if (ret) { + kfree(cq); + return ERR_PTR(ret); } + rdma_restrack_kadd(&cq->res); return cq; } EXPORT_SYMBOL(__ib_create_cq); @@ -1950,6 +1957,7 @@ int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata) rdma_restrack_del(&cq->res); cq->device->ops.destroy_cq(cq, udata); + kfree(cq); return 0; } EXPORT_SYMBOL(ib_destroy_cq_user); diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 0127af45dcd1..44cc5f19df3b 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ 
b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2534,16 +2534,14 @@ void bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) atomic_dec(&rdev->cq_count); nq->budget--; kfree(cq->cql); - kfree(cq); } -struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { - struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibcq->device, ibdev); struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; - struct bnxt_re_cq *cq = NULL; + struct bnxt_re_cq *cq = container_of(ibcq, struct bnxt_re_cq, ib_cq); int rc, entries; int cqe = attr->cqe; struct bnxt_qplib_nq *nq = NULL; @@ -2552,11 +2550,8 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, /* Validate CQ fields */ if (cqe < 1 || cqe > dev_attr->max_cq_wqes) { dev_err(rdev_to_dev(rdev), "Failed to create CQ -max exceeded"); - return ERR_PTR(-EINVAL); + return -EINVAL; } - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) - return ERR_PTR(-ENOMEM); cq->rdev = rdev; cq->qplib_cq.cq_handle = (u64)(unsigned long)(&cq->qplib_cq); @@ -2634,15 +2629,14 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, } } - return &cq->ib_cq; + return 0; c2fail: if (udata) ib_umem_release(cq->umem); fail: kfree(cq->cql); - kfree(cq); - return ERR_PTR(rc); + return rc; } static u8 __req_to_ib_wc_status(u8 qstatus) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index 828403ee0104..31662b1ee35a 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -94,11 +94,11 @@ struct bnxt_re_qp { }; struct bnxt_re_cq { + struct ib_cq ib_cq; struct bnxt_re_dev *rdev; spinlock_t cq_lock; /* protect cq */ u16 cq_count; u16 cq_period; - struct ib_cq ib_cq; struct bnxt_qplib_cq qplib_cq; struct bnxt_qplib_cqe *cql; #define MAX_CQL_PER_POLL 1024 @@ -190,9 +190,8 @@ int bnxt_re_post_send(struct ib_qp *qp, const struct ib_send_wr *send_wr, const struct ib_send_wr **bad_send_wr); int bnxt_re_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); -struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); void bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc); int bnxt_re_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 351c420248a0..029babe713f3 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -641,6 +641,7 @@ static const struct ib_device_ops bnxt_re_dev_ops = { .reg_user_mr = bnxt_re_reg_user_mr, .req_notify_cq = bnxt_re_req_notify_cq, INIT_RDMA_OBJ_SIZE(ib_ah, bnxt_re_ah, ib_ah), + INIT_RDMA_OBJ_SIZE(ib_cq, bnxt_re_cq, ib_cq), INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd), INIT_RDMA_OBJ_SIZE(ib_srq, bnxt_re_srq, ib_srq), INIT_RDMA_OBJ_SIZE(ib_ucontext, bnxt_re_ucontext, ib_uctx), diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 5bde4ae93681..acba96f289cc 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ 
b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -100,16 +100,16 @@ static void iwch_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) wait_event(chp->wait, !atomic_read(&chp->refcnt)); cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); - kfree(chp); } -static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +static int iwch_create_cq(struct ib_cq *ibcq, + const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; - struct iwch_dev *rhp; - struct iwch_cq *chp; + struct iwch_dev *rhp = to_iwch_dev(ibcq->device); + struct iwch_cq *chp = to_iwch_cq(ibcq); struct iwch_create_cq_resp uresp; struct iwch_create_cq_req ureq; static int warned; @@ -117,19 +117,13 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, pr_debug("%s ib_dev %p entries %d\n", __func__, ibdev, entries); if (attr->flags) - return ERR_PTR(-EINVAL); - - rhp = to_iwch_dev(ibdev); - chp = kzalloc(sizeof(*chp), GFP_KERNEL); - if (!chp) - return ERR_PTR(-ENOMEM); + return -EINVAL; if (udata) { if (!t3a_device(rhp)) { - if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) { - kfree(chp); - return ERR_PTR(-EFAULT); - } + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) + return -EFAULT; + chp->user_rptr_addr = (u32 __user *)(unsigned long)ureq.user_rptr_addr; } } @@ -150,10 +144,9 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, entries = roundup_pow_of_two(entries); chp->cq.size_log2 = ilog2(entries); - if (cxio_create_cq(&rhp->rdev, &chp->cq, !udata)) { - kfree(chp); - return ERR_PTR(-ENOMEM); - } + if (cxio_create_cq(&rhp->rdev, &chp->cq, !udata)) + return -ENOMEM; + chp->rhp = rhp; chp->ibcq.cqe = 1 << chp->cq.size_log2; spin_lock_init(&chp->lock); @@ -162,8 +155,7 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, init_waitqueue_head(&chp->wait); if (xa_store_irq(&rhp->cqs, chp->cq.cqid, chp, GFP_KERNEL)) { cxio_destroy_cq(&chp->rhp->rdev, &chp->cq); - kfree(chp); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } if (udata) { @@ -174,7 +166,7 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, mm = kmalloc(sizeof(*mm), GFP_KERNEL); if (!mm) { iwch_destroy_cq(&chp->ibcq, udata); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } uresp.cqid = chp->cq.cqid; uresp.size_log2 = chp->cq.size_log2; @@ -200,14 +192,14 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, if (ib_copy_to_udata(udata, &uresp, resplen)) { kfree(mm); iwch_destroy_cq(&chp->ibcq, udata); - return ERR_PTR(-EFAULT); + return -EFAULT; } insert_mmap(ucontext, mm); } pr_debug("created cqid 0x%0x chp %p size 0x%0x, dma_addr %pad\n", chp->cq.cqid, chp, (1 << chp->cq.size_log2), &chp->cq.dma_addr); - return &chp->ibcq; + return 0; } static int iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) @@ -1277,6 +1269,7 @@ static const struct ib_device_ops iwch_dev_ops = { .reg_user_mr = iwch_reg_user_mr, .req_notify_cq = iwch_arm_cq, INIT_RDMA_OBJ_SIZE(ib_pd, iwch_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_cq, iwch_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_ucontext, iwch_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index f49e6d271c42..3cc4d3331a3f 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -986,17 +986,16 @@ void c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) ucontext ? 
&ucontext->uctx : &chp->cq.rdev->uctx, chp->destroy_skb, chp->wr_waitp); c4iw_put_wr_wait(chp->wr_waitp); - kfree(chp); } -struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; int vector = attr->comp_vector; - struct c4iw_dev *rhp; - struct c4iw_cq *chp; + struct c4iw_dev *rhp = to_c4iw_dev(ibcq->device); + struct c4iw_cq *chp = to_c4iw_cq(ibcq); struct c4iw_create_cq ucmd; struct c4iw_create_cq_resp uresp; int ret, wr_len; @@ -1007,22 +1006,16 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, pr_debug("ib_dev %p entries %d\n", ibdev, entries); if (attr->flags) - return ERR_PTR(-EINVAL); - - rhp = to_c4iw_dev(ibdev); + return -EINVAL; if (vector >= rhp->rdev.lldi.nciq) - return ERR_PTR(-EINVAL); + return -EINVAL; if (udata) { if (udata->inlen < sizeof(ucmd)) ucontext->is_32b_cqe = 1; } - chp = kzalloc(sizeof(*chp), GFP_KERNEL); - if (!chp) - return ERR_PTR(-ENOMEM); - chp->wr_waitp = c4iw_alloc_wr_wait(GFP_KERNEL); if (!chp->wr_waitp) { ret = -ENOMEM; @@ -1132,10 +1125,11 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, mm2->len = PAGE_SIZE; insert_mmap(ucontext, mm2); } + pr_debug("cqid 0x%0x chp %p size %u memsize %zu, dma_addr %pad\n", chp->cq.cqid, chp, chp->cq.size, chp->cq.memsize, &chp->cq.dma_addr); - return &chp->ibcq; + return 0; err_free_mm2: kfree(mm2); err_free_mm: @@ -1151,8 +1145,7 @@ err_free_skb: err_free_wr_wait: c4iw_put_wr_wait(chp->wr_waitp); err_free_chp: - kfree(chp); - return ERR_PTR(ret); + return ret; } int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 45e720288f0f..7d06b0f8d49a 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -993,9 +993,8 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc); int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata); void c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata); -struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); int c4iw_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *attr, enum ib_srq_attr_mask srq_attr_mask, diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 2b1f2443b7da..5e59c5708729 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -537,6 +537,7 @@ static const struct ib_device_ops c4iw_dev_ops = { .reg_user_mr = c4iw_reg_user_mr, .req_notify_cq = c4iw_arm_cq, INIT_RDMA_OBJ_SIZE(ib_pd, c4iw_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_cq, c4iw_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_srq, c4iw_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, c4iw_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h index 52d894f0ad3e..119f8efec564 100644 --- a/drivers/infiniband/hw/efa/efa.h +++ b/drivers/infiniband/hw/efa/efa.h @@ -135,9 +135,8 @@ struct ib_qp *efa_create_qp(struct ib_pd *ibpd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); void 
efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); -struct ib_cq *efa_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index b891ee239a67..46861461dd2d 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -224,6 +224,7 @@ static const struct ib_device_ops efa_dev_ops = { .reg_user_mr = efa_reg_mr, INIT_RDMA_OBJ_SIZE(ib_ah, efa_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, efa_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, efa_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_ucontext, efa_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 42865cf4f149..a9372c9e4b30 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -859,8 +859,6 @@ void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) efa_destroy_cq_idx(dev, cq->cq_idx); dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size, DMA_FROM_DEVICE); - - kfree(cq); } static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, @@ -876,17 +874,20 @@ static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, return 0; } -static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries, - int vector, struct ib_ucontext *ibucontext, - struct ib_udata *udata) +int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct efa_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct efa_ucontext, ibucontext); struct efa_ibv_create_cq_resp resp = {}; struct efa_com_create_cq_params params; struct efa_com_create_cq_result result; + struct ib_device *ibdev = ibcq->device; struct efa_dev *dev = to_edev(ibdev); struct efa_ibv_create_cq cmd = {}; + struct efa_cq *cq = to_ecq(ibcq); bool cq_entry_inserted = false; - struct efa_cq *cq; + int entries = attr->cqe; int err; ibdev_dbg(ibdev, "create_cq entries %d\n", entries); @@ -944,19 +945,13 @@ static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries, goto err_out; } - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) { - err = -ENOMEM; - goto err_out; - } - - cq->ucontext = to_eucontext(ibucontext); + cq->ucontext = ucontext; cq->size = PAGE_ALIGN(cmd.cq_entry_size * entries * cmd.num_sub_cqs); cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size, DMA_FROM_DEVICE); if (!cq->cpu_addr) { err = -ENOMEM; - goto err_free_cq; + goto err_out; } params.uarn = cq->ucontext->uarn; @@ -975,8 +970,8 @@ static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries, err = cq_mmap_entries_setup(dev, cq, &resp); if (err) { - ibdev_dbg(ibdev, - "Could not setup cq[%u] mmap entries\n", cq->cq_idx); + ibdev_dbg(ibdev, "Could not setup cq[%u] mmap entries\n", + cq->cq_idx); goto err_destroy_cq; } @@ -992,11 +987,10 @@ static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries, } } - ibdev_dbg(ibdev, - "Created cq[%d], cq depth[%u]. dma[%pad] virt[0x%p]\n", + ibdev_dbg(ibdev, "Created cq[%d], cq depth[%u]. 
dma[%pad] virt[0x%p]\n", cq->cq_idx, result.actual_depth, &cq->dma_addr, cq->cpu_addr); - return &cq->ibcq; + return 0; err_destroy_cq: efa_destroy_cq_idx(dev, cq->cq_idx); @@ -1005,23 +999,9 @@ err_free_mapped: DMA_FROM_DEVICE); if (!cq_entry_inserted) free_pages_exact(cq->cpu_addr, cq->size); -err_free_cq: - kfree(cq); err_out: atomic64_inc(&dev->stats.sw_stats.create_cq_err); - return ERR_PTR(err); -} - -struct ib_cq *efa_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) -{ - struct efa_ucontext *ucontext = rdma_udata_to_drv_context(udata, - struct efa_ucontext, - ibucontext); - - return do_create_cq(ibdev, attr->cqe, attr->comp_vector, - &ucontext->ibucontext, udata); + return err; } static int umem_to_page_list(struct efa_dev *dev, diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 0eb7c16c007b..7e198c9ffbfe 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -299,15 +299,15 @@ static void hns_roce_ib_free_cq_buf(struct hns_roce_dev *hr_dev, &buf->hr_buf); } -struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int hns_roce_ib_create_cq(struct ib_cq *ib_cq, + const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { - struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); + struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device); struct device *dev = hr_dev->dev; struct hns_roce_ib_create_cq ucmd; struct hns_roce_ib_create_cq_resp resp = {}; - struct hns_roce_cq *hr_cq = NULL; + struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq); struct hns_roce_uar *uar = NULL; int vector = attr->comp_vector; int cq_entries = attr->cqe; @@ -318,13 +318,9 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, if (cq_entries < 1 || cq_entries > hr_dev->caps.max_cqes) { dev_err(dev, "Creat CQ failed. 
entries=%d, max=%d\n", cq_entries, hr_dev->caps.max_cqes); - return ERR_PTR(-EINVAL); + return -EINVAL; } - hr_cq = kzalloc(sizeof(*hr_cq), GFP_KERNEL); - if (!hr_cq) - return ERR_PTR(-ENOMEM); - if (hr_dev->caps.min_cqes) cq_entries = max(cq_entries, hr_dev->caps.min_cqes); @@ -415,7 +411,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, goto err_cqc; } - return &hr_cq->ib_cq; + return 0; err_cqc: hns_roce_free_cq(hr_dev, hr_cq); @@ -438,8 +434,7 @@ err_db: hns_roce_free_db(hr_dev, &hr_cq->db); err_cq: - kfree(hr_cq); - return ERR_PTR(ret); + return ret; } EXPORT_SYMBOL_GPL(hns_roce_ib_create_cq); @@ -471,8 +466,6 @@ void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) hns_roce_free_db(hr_dev, &hr_cq->db); } - - kfree(hr_cq); } EXPORT_SYMBOL_GPL(hns_roce_ib_destroy_cq); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 2f7d9644fd24..303ea7c614a8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1205,9 +1205,9 @@ void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn, __be32 send_ieth(const struct ib_send_wr *wr); int to_hr_qp_type(int qp_type); -struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int hns_roce_ib_create_cq(struct ib_cq *ib_cq, + const struct ib_cq_init_attr *attr, + struct ib_udata *udata); void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata); void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index aa7b67d283af..c899879da222 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -717,7 +717,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) union ib_gid dgid; u64 subnet_prefix; int attr_mask = 0; - int ret = -ENOMEM; + int ret; int i, j; u8 queue_en[HNS_ROCE_V1_RESV_QP] = { 0 }; u8 phy_port; @@ -730,10 +730,16 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) /* Reserved cq for loop qp */ cq_init_attr.cqe = HNS_ROCE_MIN_WQE_NUM * 2; cq_init_attr.comp_vector = 0; - cq = hns_roce_ib_create_cq(&hr_dev->ib_dev, &cq_init_attr, NULL); - if (IS_ERR(cq)) { - dev_err(dev, "Create cq for reserved loop qp failed!"); + + ibdev = &hr_dev->ib_dev; + cq = rdma_zalloc_drv_obj(ibdev, ib_cq); + if (!cq) return -ENOMEM; + + ret = hns_roce_ib_create_cq(cq, &cq_init_attr, NULL); + if (ret) { + dev_err(dev, "Create cq for reserved loop qp failed!"); + goto alloc_cq_failed; } free_mr->mr_free_cq = to_hr_cq(cq); free_mr->mr_free_cq->ib_cq.device = &hr_dev->ib_dev; @@ -743,7 +749,6 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) free_mr->mr_free_cq->ib_cq.cq_context = NULL; atomic_set(&free_mr->mr_free_cq->ib_cq.usecnt, 0); - ibdev = &hr_dev->ib_dev; pd = rdma_zalloc_drv_obj(ibdev, ib_pd); if (!pd) goto alloc_mem_failed; @@ -866,7 +871,8 @@ alloc_pd_failed: alloc_mem_failed: hns_roce_ib_destroy_cq(cq, NULL); - +alloc_cq_failed: + kfree(cq); return ret; } @@ -894,6 +900,7 @@ static void hns_roce_v1_release_lp_qp(struct hns_roce_dev *hr_dev) } hns_roce_ib_destroy_cq(&free_mr->mr_free_cq->ib_cq, NULL); + kfree(&free_mr->mr_free_cq->ib_cq); hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd, NULL); } @@ -3694,8 +3701,6 @@ static void 
hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) cq_buf_size = (ibcq->cqe + 1) * hr_dev->caps.cq_entry_sz; hns_roce_buf_free(hr_dev, cq_buf_size, &hr_cq->hr_buf.hr_buf); } - - kfree(hr_cq); } static void set_eq_cons_index_v1(struct hns_roce_eq *eq, int req_not) diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index f07b2ec86ec2..3e45b119b0eb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -446,6 +446,7 @@ static const struct ib_device_ops hns_roce_dev_ops = { .reg_user_mr = hns_roce_reg_user_mr, INIT_RDMA_OBJ_SIZE(ib_ah, hns_roce_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, hns_roce_cq, ib_cq), INIT_RDMA_OBJ_SIZE(ib_pd, hns_roce_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_ucontext, hns_roce_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 205053cb5f97..3100b0c31b0a 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -1075,27 +1075,27 @@ static void i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) cq = &iwcq->sc_cq; i40iw_cq_wq_destroy(iwdev, cq); cq_free_resources(iwdev, iwcq); - kfree(iwcq); i40iw_rem_devusecount(iwdev); } /** * i40iw_create_cq - create cq - * @ibdev: device pointer from stack + * @ibcq: CQ allocated * @attr: attributes for cq * @udata: user data */ -static struct ib_cq *i40iw_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +static int i40iw_create_cq(struct ib_cq *ibcq, + const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; struct i40iw_device *iwdev = to_iwdev(ibdev); - struct i40iw_cq *iwcq; + struct i40iw_cq *iwcq = to_iwcq(ibcq); struct i40iw_pbl *iwpbl; u32 cq_num = 0; struct i40iw_sc_cq *cq; struct i40iw_sc_dev *dev = &iwdev->sc_dev; - struct i40iw_cq_init_info info; + struct i40iw_cq_init_info info = {}; enum i40iw_status_code status; struct i40iw_cqp_request *cqp_request; struct cqp_commands_info *cqp_info; @@ -1105,22 +1105,16 @@ static struct ib_cq *i40iw_create_cq(struct ib_device *ibdev, int entries = attr->cqe; if (iwdev->closing) - return ERR_PTR(-ENODEV); + return -ENODEV; if (entries > iwdev->max_cqe) - return ERR_PTR(-EINVAL); - - iwcq = kzalloc(sizeof(*iwcq), GFP_KERNEL); - if (!iwcq) - return ERR_PTR(-ENOMEM); - - memset(&info, 0, sizeof(info)); + return -EINVAL; err_code = i40iw_alloc_resource(iwdev, iwdev->allocated_cqs, iwdev->max_cq, &cq_num, &iwdev->next_cq); if (err_code) - goto error; + return err_code; cq = &iwcq->sc_cq; cq->back_cq = (void *)iwcq; @@ -1227,15 +1221,13 @@ static struct ib_cq *i40iw_create_cq(struct ib_device *ibdev, } i40iw_add_devusecount(iwdev); - return (struct ib_cq *)iwcq; + return 0; cq_destroy: i40iw_cq_wq_destroy(iwdev, cq); cq_free_resources: cq_free_resources(iwdev, iwcq); -error: - kfree(iwcq); - return ERR_PTR(err_code); + return err_code; } /** @@ -2693,6 +2685,7 @@ static const struct ib_device_ops i40iw_dev_ops = { .reg_user_mr = i40iw_reg_user_mr, .req_notify_cq = i40iw_req_notify_cq, INIT_RDMA_OBJ_SIZE(ib_pd, i40iw_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_cq, i40iw_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_ucontext, i40iw_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 8eb7490dabb8..72f238ddafb5 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -172,14 +172,14 @@ err_buf: } 
#define CQ_CREATE_FLAGS_SUPPORTED IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION -struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; int vector = attr->comp_vector; struct mlx4_ib_dev *dev = to_mdev(ibdev); - struct mlx4_ib_cq *cq; + struct mlx4_ib_cq *cq = to_mcq(ibcq); struct mlx4_uar *uar; void *buf_addr; int err; @@ -187,14 +187,10 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, udata, struct mlx4_ib_ucontext, ibucontext); if (entries < 1 || entries > dev->dev->caps.max_cqes) - return ERR_PTR(-EINVAL); + return -EINVAL; if (attr->flags & ~CQ_CREATE_FLAGS_SUPPORTED) - return ERR_PTR(-EINVAL); - - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) - return ERR_PTR(-ENOMEM); + return -EINVAL; entries = roundup_pow_of_two(entries + 1); cq->ibcq.cqe = entries - 1; @@ -269,7 +265,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, goto err_cq_free; } - return &cq->ibcq; + return 0; err_cq_free: mlx4_cq_free(dev->dev, &cq->mcq); @@ -289,11 +285,8 @@ err_mtt: err_db: if (!udata) mlx4_db_free(dev->dev, &cq->db); - err_cq: - kfree(cq); - - return ERR_PTR(err); + return err; } static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq, @@ -506,8 +499,6 @@ void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe); mlx4_db_free(dev->dev, &mcq->db); } - - kfree(mcq); } static void dump_cqe(void *cqe) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 5d7a87842291..8790101facb7 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2565,6 +2565,7 @@ static const struct ib_device_ops mlx4_ib_dev_ops = { .resize_cq = mlx4_ib_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, mlx4_ib_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, mlx4_ib_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, mlx4_ib_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_srq, mlx4_ib_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx4_ib_ucontext, ibucontext), diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index af5ee45a9f19..81b3d85e5167 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -743,9 +743,8 @@ int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); -struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index ebd01bd7f8f6..07b73df0e1a3 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -884,14 +884,14 @@ static void notify_soft_wc_handler(struct work_struct *work) cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); } -struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, - const struct 
ib_cq_init_attr *attr, - struct ib_udata *udata) +int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; int vector = attr->comp_vector; struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_ib_cq *cq; + struct mlx5_ib_cq *cq = to_mcq(ibcq); int uninitialized_var(index); int uninitialized_var(inlen); u32 *cqb = NULL; @@ -903,18 +903,14 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, if (entries < 0 || (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))) - return ERR_PTR(-EINVAL); + return -EINVAL; if (check_cq_create_flags(attr->flags)) - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; entries = roundup_pow_of_two(entries + 1); if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) - return ERR_PTR(-EINVAL); - - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) - return ERR_PTR(-ENOMEM); + return -EINVAL; cq->ibcq.cqe = entries - 1; mutex_init(&cq->resize_mutex); @@ -929,13 +925,13 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, err = create_cq_user(dev, udata, cq, entries, &cqb, &cqe_size, &index, &inlen); if (err) - goto err_create; + return err; } else { cqe_size = cache_line_size() == 128 ? 128 : 64; err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb, &index, &inlen); if (err) - goto err_create; + return err; INIT_WORK(&cq->notify_work, notify_soft_wc_handler); } @@ -980,7 +976,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, kvfree(cqb); - return &cq->ibcq; + return 0; err_cmd: mlx5_core_destroy_cq(dev->mdev, &cq->mcq); @@ -991,11 +987,7 @@ err_cqb: destroy_cq_user(cq, udata); else destroy_cq_kernel(dev, cq); - -err_create: - kfree(cq); - - return ERR_PTR(err); + return err; } void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) @@ -1008,8 +1000,6 @@ void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) destroy_cq_user(mcq, udata); else destroy_cq_kernel(dev, mcq); - - kfree(mcq); } static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 1e3d936ed809..99eb4a8b0b0d 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4891,18 +4891,19 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) if (ret) goto error0; - devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL); - if (IS_ERR(devr->c0)) { - ret = PTR_ERR(devr->c0); + devr->c0 = rdma_zalloc_drv_obj(ibdev, ib_cq); + if (!devr->c0) { + ret = -ENOMEM; goto error1; } - devr->c0->device = &dev->ib_dev; - devr->c0->uobject = NULL; - devr->c0->comp_handler = NULL; - devr->c0->event_handler = NULL; - devr->c0->cq_context = NULL; + + devr->c0->device = &dev->ib_dev; atomic_set(&devr->c0->usecnt, 0); + ret = mlx5_ib_create_cq(devr->c0, &cq_attr, NULL); + if (ret) + goto err_create_cq; + devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL); if (IS_ERR(devr->x0)) { ret = PTR_ERR(devr->x0); @@ -4994,6 +4995,8 @@ error3: mlx5_ib_dealloc_xrcd(devr->x0, NULL); error2: mlx5_ib_destroy_cq(devr->c0, NULL); +err_create_cq: + kfree(devr->c0); error1: mlx5_ib_dealloc_pd(devr->p0, NULL); error0: @@ -5012,6 +5015,7 @@ static void destroy_dev_resources(struct mlx5_ib_resources *devr) mlx5_ib_dealloc_xrcd(devr->x0, NULL); mlx5_ib_dealloc_xrcd(devr->x1, NULL); mlx5_ib_destroy_cq(devr->c0, NULL); + kfree(devr->c0); mlx5_ib_dealloc_pd(devr->p0, NULL); kfree(devr->p0); @@ -6182,6 +6186,7 @@ static const struct ib_device_ops 
mlx5_ib_dev_ops = { .resize_cq = mlx5_ib_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext), diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index bf697c5b3e79..f2ad0372d38d 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1115,9 +1115,8 @@ int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, int buflen, size_t *bc); int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer, int buflen, size_t *bc); -struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 81fc04e1c142..efd4e3d13ae2 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -601,10 +601,11 @@ static int mthca_destroy_qp(struct ib_qp *qp, struct ib_udata *udata) return 0; } -static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +static int mthca_create_cq(struct ib_cq *ibcq, + const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; struct mthca_create_cq ucmd; struct mthca_cq *cq; @@ -614,20 +615,20 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, udata, struct mthca_ucontext, ibucontext); if (attr->flags) - return ERR_PTR(-EINVAL); + return -EINVAL; if (entries < 1 || entries > to_mdev(ibdev)->limits.max_cqes) - return ERR_PTR(-EINVAL); + return -EINVAL; if (udata) { - if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) - return ERR_PTR(-EFAULT); + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) + return -EFAULT; err = mthca_map_user_db(to_mdev(ibdev), &context->uar, context->db_tab, ucmd.set_db_index, ucmd.set_db_page); if (err) - return ERR_PTR(err); + return err; err = mthca_map_user_db(to_mdev(ibdev), &context->uar, context->db_tab, ucmd.arm_db_index, @@ -636,11 +637,7 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, goto err_unmap_set; } - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) { - err = -ENOMEM; - goto err_unmap_arm; - } + cq = to_mcq(ibcq); if (udata) { cq->buf.mr.ibmr.lkey = ucmd.lkey; @@ -655,20 +652,17 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, udata ? 
ucmd.pdn : to_mdev(ibdev)->driver_pd.pd_num, cq); if (err) - goto err_free; + goto err_unmap_arm; if (udata && ib_copy_to_udata(udata, &cq->cqn, sizeof(__u32))) { mthca_free_cq(to_mdev(ibdev), cq); err = -EFAULT; - goto err_free; + goto err_unmap_arm; } cq->resize_buf = NULL; - return &cq->ibcq; - -err_free: - kfree(cq); + return 0; err_unmap_arm: if (udata) @@ -680,7 +674,7 @@ err_unmap_set: mthca_unmap_user_db(to_mdev(ibdev), &context->uar, context->db_tab, ucmd.set_db_index); - return ERR_PTR(err); + return err; } static int mthca_alloc_resize_buf(struct mthca_dev *dev, struct mthca_cq *cq, @@ -823,7 +817,6 @@ static void mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) to_mcq(cq)->set_ci_db_index); } mthca_free_cq(to_mdev(cq->device), to_mcq(cq)); - kfree(cq); } static inline u32 convert_access(int acc) @@ -1187,6 +1180,7 @@ static const struct ib_device_ops mthca_dev_ops = { .resize_cq = mthca_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, mthca_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, mthca_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, mthca_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_ucontext, mthca_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index cac3fa624c4d..0420203820f6 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -1374,16 +1374,17 @@ static int nes_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) /** * nes_create_cq */ -static struct ib_cq *nes_create_cq(struct ib_device *ibdev, +static int nes_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; u64 u64temp; struct nes_vnic *nesvnic = to_nesvnic(ibdev); struct nes_device *nesdev = nesvnic->nesdev; struct nes_adapter *nesadapter = nesdev->nesadapter; - struct nes_cq *nescq; + struct nes_cq *nescq = to_nescq(ibcq); struct nes_ucontext *nes_ucontext = NULL; struct nes_cqp_request *cqp_request; void *mem = NULL; @@ -1399,22 +1400,15 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int ret; if (attr->flags) - return ERR_PTR(-EINVAL); + return -EINVAL; if (entries > nesadapter->max_cqe) - return ERR_PTR(-EINVAL); + return -EINVAL; err = nes_alloc_resource(nesadapter, nesadapter->allocated_cqs, nesadapter->max_cq, &cq_num, &nesadapter->next_cq, NES_RESOURCE_CQ); - if (err) { - return ERR_PTR(err); - } - - nescq = kzalloc(sizeof(struct nes_cq), GFP_KERNEL); - if (!nescq) { - nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - return ERR_PTR(-ENOMEM); - } + if (err) + return err; nescq->hw_cq.cq_size = max(entries + 1, 5); nescq->hw_cq.cq_number = cq_num; @@ -1424,10 +1418,10 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, struct nes_ucontext *nes_ucontext = rdma_udata_to_drv_context( udata, struct nes_ucontext, ibucontext); - if (ib_copy_from_udata(&req, udata, sizeof (struct nes_create_cq_req))) { + if (ib_copy_from_udata(&req, udata, + sizeof(struct nes_create_cq_req))) { nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - kfree(nescq); - return ERR_PTR(-EFAULT); + return -EFAULT; } nesvnic->mcrq_ucontext = nes_ucontext; nes_ucontext->mcrqf = req.mcrqf; @@ -1441,8 +1435,6 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, nescq->mcrqf = nes_ucontext->mcrqf; nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); } - nes_debug(NES_DBG_CQ, "CQ Virtual Address = %08lX, size = %u.\n", - (unsigned long)req.user_cq_buffer, entries); err = 1; 
list_for_each_entry(nespbl, &nes_ucontext->cq_reg_mem_list, list) { if (nespbl->user_base == (unsigned long )req.user_cq_buffer) { @@ -1455,8 +1447,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, } if (err) { nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - kfree(nescq); - return ERR_PTR(-EFAULT); + return -EFAULT; } pbl_entries = nespbl->pbl_size >> 3; @@ -1472,15 +1463,11 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, if (!mem) { printk(KERN_ERR PFX "Unable to allocate pci memory for cq\n"); nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - kfree(nescq); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } nescq->hw_cq.cq_vbase = mem; nescq->hw_cq.cq_head = 0; - nes_debug(NES_DBG_CQ, "CQ%u virtual address @ %p, phys = 0x%08X\n", - nescq->hw_cq.cq_number, nescq->hw_cq.cq_vbase, - (u32)nescq->hw_cq.cq_pbase); } nescq->hw_cq.ce_handler = nes_iwarp_ce_handler; @@ -1500,8 +1487,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, } nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - kfree(nescq); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } cqp_request->waiting = 1; cqp_wqe = &cqp_request->cqp_wqe; @@ -1528,8 +1514,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, kfree(nespbl); } nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - kfree(nescq); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } else { opcode |= (NES_CQP_CQ_VIRT | NES_CQP_CQ_4KB_CHUNK); nescq->virtual_cq = 2; @@ -1550,8 +1535,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, kfree(nespbl); } nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - kfree(nescq); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } else { opcode |= NES_CQP_CQ_VIRT; nescq->virtual_cq = 1; @@ -1607,8 +1591,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, kfree(nespbl); } nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - kfree(nescq); - return ERR_PTR(-EIO); + return -EIO; } nes_put_cqp_request(nesdev, cqp_request); @@ -1620,17 +1603,16 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev, resp.cq_id = nescq->hw_cq.cq_number; resp.cq_size = nescq->hw_cq.cq_size; resp.mmap_db_index = 0; - if (ib_copy_to_udata(udata, &resp, sizeof resp - sizeof resp.reserved)) { + if (ib_copy_to_udata(udata, &resp, + sizeof(resp) - sizeof(resp.reserved))) { nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num); - kfree(nescq); - return ERR_PTR(-EFAULT); + return -EFAULT; } } - return &nescq->ibcq; + return 0; } - /** * nes_destroy_cq */ @@ -1700,7 +1682,6 @@ static void nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) if (nescq->cq_mem_size) pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, nescq->hw_cq.cq_vbase, nescq->hw_cq.cq_pbase); - kfree(nescq); } /** @@ -3584,6 +3565,7 @@ static const struct ib_device_ops nes_dev_ops = { .reg_user_mr = nes_reg_user_mr, .req_notify_cq = nes_req_notify_cq, INIT_RDMA_OBJ_SIZE(ib_pd, nes_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_cq, nes_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_ucontext, nes_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index b326313d413f..c15cfc6cef81 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -182,6 +182,7 @@ static const struct ib_device_ops ocrdma_dev_ops = { .resize_cq = ocrdma_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, ocrdma_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, ocrdma_cq, 
ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, ocrdma_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_ucontext, ocrdma_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 94e4f7f9b1f7..10b35edb286b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -977,12 +977,12 @@ err: return status; } -struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int ocrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; - struct ocrdma_cq *cq; + struct ocrdma_cq *cq = get_ocrdma_cq(ibcq); struct ocrdma_dev *dev = get_ocrdma_dev(ibdev); struct ocrdma_ucontext *uctx = rdma_udata_to_drv_context( udata, struct ocrdma_ucontext, ibucontext); @@ -991,16 +991,13 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, struct ocrdma_create_cq_ureq ureq; if (attr->flags) - return ERR_PTR(-EINVAL); + return -EINVAL; if (udata) { if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) - return ERR_PTR(-EFAULT); + return -EFAULT; } else ureq.dpp_cq = 0; - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) - return ERR_PTR(-ENOMEM); spin_lock_init(&cq->cq_lock); spin_lock_init(&cq->comp_handler_lock); @@ -1011,10 +1008,9 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, pd_id = uctx->cntxt_pd->id; status = ocrdma_mbx_create_cq(dev, cq, entries, ureq.dpp_cq, pd_id); - if (status) { - kfree(cq); - return ERR_PTR(status); - } + if (status) + return status; + if (udata) { status = ocrdma_copy_cq_uresp(dev, cq, udata); if (status) @@ -1022,12 +1018,11 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, } cq->phase = OCRDMA_CQE_VALID; dev->cq_tbl[cq->id] = cq; - return &cq->ibcq; + return 0; ctx_err: ocrdma_mbx_destroy_cq(dev, cq); - kfree(cq); - return ERR_PTR(status); + return status; } int ocrdma_resize_cq(struct ib_cq *ibcq, int new_cnt, @@ -1095,8 +1090,6 @@ void ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) ocrdma_get_db_addr(dev, pdid), dev->nic_info.db_page_size); } - - kfree(cq); } static int ocrdma_add_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index 89cebe05669e..32488da1b752 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -71,9 +71,8 @@ int ocrdma_mmap(struct ib_ucontext *, struct vm_area_struct *vma); int ocrdma_alloc_pd(struct ib_pd *pd, struct ib_udata *udata); void ocrdma_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata); -struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int ocrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); int ocrdma_resize_cq(struct ib_cq *, int cqe, struct ib_udata *); void ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index a0bb07ba0f3c..a0a7ba0a5af4 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -224,6 +224,7 @@ static const struct ib_device_ops qedr_dev_ops = { .resize_cq = qedr_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, qedr_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, qedr_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, qedr_pd, ibpd), 
INIT_RDMA_OBJ_SIZE(ib_srq, qedr_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, qedr_ucontext, ibucontext), diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index be29bbbc4b14..3fc7a4e901c3 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -806,20 +806,20 @@ int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) return 0; } -struct ib_cq *qedr_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; struct qedr_ucontext *ctx = rdma_udata_to_drv_context( udata, struct qedr_ucontext, ibucontext); struct qed_rdma_destroy_cq_out_params destroy_oparams; struct qed_rdma_destroy_cq_in_params destroy_iparams; struct qedr_dev *dev = get_qedr_dev(ibdev); struct qed_rdma_create_cq_in_params params; - struct qedr_create_cq_ureq ureq; + struct qedr_create_cq_ureq ureq = {}; int vector = attr->comp_vector; int entries = attr->cqe; - struct qedr_cq *cq; + struct qedr_cq *cq = get_qedr_cq(ibcq); int chain_entries; int page_cnt; u64 pbl_ptr; @@ -834,18 +834,13 @@ struct ib_cq *qedr_create_cq(struct ib_device *ibdev, DP_ERR(dev, "create cq: the number of entries %d is too high. Must be equal or below %d.\n", entries, QEDR_MAX_CQES); - return ERR_PTR(-EINVAL); + return -EINVAL; } chain_entries = qedr_align_cq_entries(entries); chain_entries = min_t(int, chain_entries, QEDR_MAX_CQES); - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) - return ERR_PTR(-ENOMEM); - if (udata) { - memset(&ureq, 0, sizeof(ureq)); if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) { DP_ERR(dev, "create cq: problem copying data from user space\n"); @@ -923,7 +918,7 @@ struct ib_cq *qedr_create_cq(struct ib_device *ibdev, "create cq: icid=0x%0x, addr=%p, size(entries)=0x%0x\n", cq->icid, cq, params.cq_size); - return &cq->ibcq; + return 0; err3: destroy_iparams.icid = cq->icid; @@ -938,8 +933,7 @@ err1: if (udata) ib_umem_release(cq->q.umem); err0: - kfree(cq); - return ERR_PTR(-EINVAL); + return -EINVAL; } int qedr_resize_cq(struct ib_cq *ibcq, int new_cnt, struct ib_udata *udata) @@ -969,7 +963,7 @@ void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) /* GSIs CQs are handled by driver, so they don't exist in the FW */ if (cq->cq_type == QEDR_CQ_TYPE_GSI) - goto done; + return; iparams.icid = cq->icid; dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams); @@ -1008,10 +1002,6 @@ void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) * Since the destroy CQ ramrod has also been received on the EQ we can * be certain that there's no event handler in process. 
*/ -done: - cq->sig = ~cq->sig; - - kfree(cq); } static inline int get_gid_info_from_table(struct ib_qp *ibqp, diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h index 32d7ce77e339..9aaa90283d6e 100644 --- a/drivers/infiniband/hw/qedr/verbs.h +++ b/drivers/infiniband/hw/qedr/verbs.h @@ -50,9 +50,8 @@ int qedr_mmap(struct ib_ucontext *, struct vm_area_struct *vma); int qedr_alloc_pd(struct ib_pd *pd, struct ib_udata *udata); void qedr_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata); -struct ib_cq *qedr_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); int qedr_resize_cq(struct ib_cq *, int cqe, struct ib_udata *); void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); diff --git a/drivers/infiniband/hw/usnic/usnic_ib.h b/drivers/infiniband/hw/usnic/usnic_ib.h index 525bf272671e..84dd682d2334 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib.h +++ b/drivers/infiniband/hw/usnic/usnic_ib.h @@ -61,6 +61,10 @@ struct usnic_ib_pd { struct usnic_uiom_pd *umem_pd; }; +struct usnic_ib_cq { + struct ib_cq ibcq; +}; + struct usnic_ib_mr { struct ib_mr ibmr; struct usnic_uiom_reg *umem; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index e701322dc740..6ae5ce007fed 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -354,6 +354,7 @@ static const struct ib_device_ops usnic_dev_ops = { .query_qp = usnic_ib_query_qp, .reg_user_mr = usnic_ib_reg_mr, INIT_RDMA_OBJ_SIZE(ib_pd, usnic_ib_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_cq, usnic_ib_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_ucontext, usnic_ib_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 5686d14b86fe..eeb07b245ef9 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -587,26 +587,18 @@ out_unlock: return status; } -struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { - struct ib_cq *cq; - - usnic_dbg("\n"); if (attr->flags) - return ERR_PTR(-EINVAL); - - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) - return ERR_PTR(-EBUSY); + return -EINVAL; - return cq; + return 0; } void usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) { - kfree(cq); + return; } struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h index 0b9d993433a7..2aedf78c13cf 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -58,9 +58,8 @@ struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, int usnic_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata); int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); -struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); void 
usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index 0682781f6555..38573fc0a9bf 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -92,20 +92,19 @@ int pvrdma_req_notify_cq(struct ib_cq *ibcq, /** * pvrdma_create_cq - create completion queue - * @ibdev: the device + * @ibcq: Allocated CQ * @attr: completion queue attributes * @udata: user data * - * @return: ib_cq completion queue pointer on success, - * otherwise returns negative errno. + * @return: 0 on success */ -struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; int entries = attr->cqe; struct pvrdma_dev *dev = to_vdev(ibdev); - struct pvrdma_cq *cq; + struct pvrdma_cq *cq = to_vcq(ibcq); int ret; int npages; unsigned long flags; @@ -113,7 +112,7 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, union pvrdma_cmd_resp rsp; struct pvrdma_cmd_create_cq *cmd = &req.create_cq; struct pvrdma_cmd_create_cq_resp *resp = &rsp.create_cq_resp; - struct pvrdma_create_cq_resp cq_resp = {0}; + struct pvrdma_create_cq_resp cq_resp = {}; struct pvrdma_create_cq ucmd; struct pvrdma_ucontext *context = rdma_udata_to_drv_context( udata, struct pvrdma_ucontext, ibucontext); @@ -122,16 +121,10 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, entries = roundup_pow_of_two(entries); if (entries < 1 || entries > dev->dsr->caps.max_cqe) - return ERR_PTR(-EINVAL); + return -EINVAL; if (!atomic_add_unless(&dev->num_cqs, 1, dev->dsr->caps.max_cq)) - return ERR_PTR(-ENOMEM); - - cq = kzalloc(sizeof(*cq), GFP_KERNEL); - if (!cq) { - atomic_dec(&dev->num_cqs); - return ERR_PTR(-ENOMEM); - } + return -ENOMEM; cq->ibcq.cqe = entries; cq->is_kernel = !udata; @@ -211,11 +204,11 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, dev_warn(&dev->pdev->dev, "failed to copy back udata\n"); pvrdma_destroy_cq(&cq->ibcq, udata); - return ERR_PTR(-EINVAL); + return -EINVAL; } } - return &cq->ibcq; + return 0; err_page_dir: pvrdma_page_dir_cleanup(dev, &cq->pdir); @@ -224,9 +217,7 @@ err_umem: ib_umem_release(cq->umem); err_cq: atomic_dec(&dev->num_cqs); - kfree(cq); - - return ERR_PTR(ret); + return ret; } static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq) @@ -239,7 +230,6 @@ static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq) ib_umem_release(cq->umem); pvrdma_page_dir_cleanup(dev, &cq->pdir); - kfree(cq); } /** diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index 0c48464ffff1..e580ae9cc55a 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -182,6 +182,7 @@ static const struct ib_device_ops pvrdma_dev_ops = { .req_notify_cq = pvrdma_req_notify_cq, INIT_RDMA_OBJ_SIZE(ib_ah, pvrdma_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, pvrdma_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, pvrdma_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_ucontext, pvrdma_ucontext, ibucontext), }; diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h index f0dd6e4d058b..e4a48f5c0c85 100644 
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h @@ -409,9 +409,8 @@ struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg, struct ib_udata *udata); int pvrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); -struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); void pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int pvrdma_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 8e76036fad4a..b46714a92b7a 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -166,43 +166,37 @@ static void send_complete(struct work_struct *work) /** * rvt_create_cq - create a completion queue - * @ibdev: the device this completion queue is attached to + * @ibcq: Allocated CQ * @attr: creation attributes * @udata: user data for libibverbs.so * * Called by ib_create_cq() in the generic verbs code. * - * Return: pointer to the completion queue or negative errno values - * for failure. + * Return: 0 on success */ -struct ib_cq *rvt_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; struct rvt_dev_info *rdi = ib_to_rvt(ibdev); - struct rvt_cq *cq; + struct rvt_cq *cq = container_of(ibcq, struct rvt_cq, ibcq); struct rvt_cq_wc *wc; - struct ib_cq *ret; u32 sz; unsigned int entries = attr->cqe; int comp_vector = attr->comp_vector; + int err; if (attr->flags) - return ERR_PTR(-EINVAL); + return -EINVAL; if (entries < 1 || entries > rdi->dparms.props.max_cqe) - return ERR_PTR(-EINVAL); + return -EINVAL; if (comp_vector < 0) comp_vector = 0; comp_vector = comp_vector % rdi->ibdev.num_comp_vectors; - /* Allocate the completion queue structure. */ - cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, rdi->dparms.node); - if (!cq) - return ERR_PTR(-ENOMEM); - /* * Allocate the completion queue entries and head/tail pointers. * This is allocated separately so that it can be resized and @@ -218,36 +212,29 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, wc = udata ? vmalloc_user(sz) : vzalloc_node(sz, rdi->dparms.node); - if (!wc) { - ret = ERR_PTR(-ENOMEM); - goto bail_cq; - } - + if (!wc) + return -ENOMEM; /* * Return the address of the WC as the offset to mmap. * See rvt_mmap() for details. 
*/ if (udata && udata->outlen >= sizeof(__u64)) { - int err; - cq->ip = rvt_create_mmap_info(rdi, sz, udata, wc); if (!cq->ip) { - ret = ERR_PTR(-ENOMEM); + err = -ENOMEM; goto bail_wc; } err = ib_copy_to_udata(udata, &cq->ip->offset, sizeof(cq->ip->offset)); - if (err) { - ret = ERR_PTR(err); + if (err) goto bail_ip; - } } spin_lock_irq(&rdi->n_cqs_lock); if (rdi->n_cqs_allocated == rdi->dparms.props.max_cq) { spin_unlock_irq(&rdi->n_cqs_lock); - ret = ERR_PTR(-ENOMEM); + err = -ENOMEM; goto bail_ip; } @@ -279,19 +266,14 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, INIT_WORK(&cq->comptask, send_complete); cq->queue = wc; - ret = &cq->ibcq; - trace_rvt_create_cq(cq, attr); - goto done; + return 0; bail_ip: kfree(cq->ip); bail_wc: vfree(wc); -bail_cq: - kfree(cq); -done: - return ret; + return err; } /** @@ -314,7 +296,6 @@ void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) kref_put(&cq->ip->ref, rvt_release_mmap_info); else vfree(cq->queue); - kfree(cq); } /** diff --git a/drivers/infiniband/sw/rdmavt/cq.h b/drivers/infiniband/sw/rdmavt/cq.h index 495d8c3e6580..5e26a2eb19a4 100644 --- a/drivers/infiniband/sw/rdmavt/cq.h +++ b/drivers/infiniband/sw/rdmavt/cq.h @@ -51,9 +51,8 @@ #include #include -struct ib_cq *rvt_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); +int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 639ef8ac5400..18da1e1ea979 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -429,6 +429,7 @@ static const struct ib_device_ops rvt_dev_ops = { .unmap_fmr = rvt_unmap_fmr, INIT_RDMA_OBJ_SIZE(ib_ah, rvt_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, rvt_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, rvt_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_srq, rvt_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, rvt_ucontext, ibucontext), diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 56cf18af016a..fbcbac52290b 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -72,6 +72,7 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = { [RXE_TYPE_CQ] = { .name = "rxe-cq", .size = sizeof(struct rxe_cq), + .flags = RXE_POOL_NO_ALLOC, .cleanup = rxe_cq_cleanup, }, [RXE_TYPE_MR] = { diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index b14881decbee..4ebdfcf4d33e 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -778,45 +778,34 @@ err1: return err; } -static struct ib_cq *rxe_create_cq(struct ib_device *dev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { int err; + struct ib_device *dev = ibcq->device; struct rxe_dev *rxe = to_rdev(dev); - struct rxe_cq *cq; + struct rxe_cq *cq = to_rcq(ibcq); struct rxe_create_cq_resp __user *uresp = NULL; if (udata) { if (udata->outlen < sizeof(*uresp)) - return ERR_PTR(-EINVAL); + return -EINVAL; uresp = udata->outbuf; } if (attr->flags) - return ERR_PTR(-EINVAL); + return -EINVAL; err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, 
attr->comp_vector); if (err) - goto err1; - - cq = rxe_alloc(&rxe->cq_pool); - if (!cq) { - err = -ENOMEM; - goto err1; - } + return err; err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, udata, uresp); if (err) - goto err2; - - return &cq->ibcq; + return err; -err2: - rxe_drop_ref(cq); -err1: - return ERR_PTR(err); + return rxe_add_to_pool(&rxe->cq_pool, &cq->pelem); } static void rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) @@ -1160,6 +1149,7 @@ static const struct ib_device_ops rxe_dev_ops = { .resize_cq = rxe_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, rxe_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, rxe_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc), diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index e8be7f44e3be..6c997d39a418 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -85,8 +85,8 @@ struct rxe_cqe { }; struct rxe_cq { - struct rxe_pool_entry pelem; struct ib_cq ibcq; + struct rxe_pool_entry pelem; struct rxe_queue *queue; spinlock_t cq_lock; u8 notify; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index bc1d94c9c9ba..f357e03a85a6 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2458,9 +2458,8 @@ struct ib_device_ops { int (*query_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int (*destroy_qp)(struct ib_qp *qp, struct ib_udata *udata); - struct ib_cq *(*create_cq)(struct ib_device *device, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); + int (*create_cq)(struct ib_cq *cq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); void (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); @@ -2601,6 +2600,7 @@ struct ib_device_ops { int (*iw_destroy_listen)(struct iw_cm_id *cm_id); DECLARE_RDMA_OBJ_SIZE(ib_ah); + DECLARE_RDMA_OBJ_SIZE(ib_cq); DECLARE_RDMA_OBJ_SIZE(ib_pd); DECLARE_RDMA_OBJ_SIZE(ib_srq); DECLARE_RDMA_OBJ_SIZE(ib_ucontext); -- cgit v1.2.3 From 836a0fbb3e76f704ad65ddfb57f00725245e509b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 16 Jun 2019 15:05:20 +0300 Subject: RDMA: Check umem pointer validity prior to release Update ib_umem_release() to behave similarly to kfree() and allow submitting NULL pointer as safe input to this function. 
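
As an illustration of the pattern (a minimal userspace sketch with hypothetical names such as fake_umem/fake_umem_release, not code taken from the kernel tree), a NULL-tolerant release helper lets every teardown path call it unconditionally, which is what allows the per-driver "if (umem)" guards in the hunks below to be dropped:

	#include <stdlib.h>

	struct fake_umem {
		void *pages;
	};

	/* Stand-in for ib_umem_release(): a NULL argument is a no-op,
	 * mirroring kfree(), so callers need no guard of their own. */
	static void fake_umem_release(struct fake_umem *umem)
	{
		if (!umem)
			return;
		free(umem->pages);
		free(umem);
	}

	int main(void)
	{
		struct fake_umem *umem = NULL;	/* e.g. the allocation path never ran */

		fake_umem_release(umem);	/* safe: no NULL check at the call site */
		return 0;
	}

This mirrors the long-standing kfree(NULL) convention: error unwinding and destroy paths can release resources in a fixed order without tracking which of them were actually set up.
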
Fixes: a52c8e2469c3 ("RDMA: Clean destroy CQ in drivers do not return errors") Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem.c | 3 +++ drivers/infiniband/hw/bnxt_re/ib_verbs.c | 29 +++++++++------------------- drivers/infiniband/hw/cxgb3/iwch_provider.c | 3 +-- drivers/infiniband/hw/cxgb4/mem.c | 3 +-- drivers/infiniband/hw/efa/efa_verbs.c | 2 +- drivers/infiniband/hw/hns/hns_roce_cq.c | 8 +++----- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 13 +++++-------- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +- drivers/infiniband/hw/hns/hns_roce_mr.c | 4 +--- drivers/infiniband/hw/hns/hns_roce_qp.c | 5 ++--- drivers/infiniband/hw/hns/hns_roce_srq.c | 14 ++++++-------- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 3 +-- drivers/infiniband/hw/mlx4/cq.c | 14 +++++--------- drivers/infiniband/hw/mlx4/qp.c | 7 +++---- drivers/infiniband/hw/mlx4/srq.c | 7 +++---- drivers/infiniband/hw/mlx5/cq.c | 20 +++++-------------- drivers/infiniband/hw/mlx5/mr.c | 13 ++++++------- drivers/infiniband/hw/mlx5/qp.c | 9 +++------ drivers/infiniband/hw/mthca/mthca_provider.c | 3 +-- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 3 +-- drivers/infiniband/hw/qedr/verbs.c | 9 +++------ drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 6 ++---- drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c | 3 +-- drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c | 16 ++++----------- drivers/infiniband/sw/rdmavt/mr.c | 3 +-- drivers/infiniband/sw/rxe/rxe_mr.c | 3 +-- 26 files changed, 73 insertions(+), 132 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 54628ef879f0..08da840ed7ee 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -361,6 +361,9 @@ static void __ib_umem_release_tail(struct ib_umem *umem) */ void ib_umem_release(struct ib_umem *umem) { + if (!umem) + return; + if (umem->is_odp) { ib_umem_odp_release(to_ib_umem_odp(umem)); __ib_umem_release_tail(umem); diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 44cc5f19df3b..a91653aabf38 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -805,10 +805,8 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata) rdev->sqp_ah = NULL; } - if (!IS_ERR_OR_NULL(qp->rumem)) - ib_umem_release(qp->rumem); - if (!IS_ERR_OR_NULL(qp->sumem)) - ib_umem_release(qp->sumem); + ib_umem_release(qp->rumem); + ib_umem_release(qp->sumem); mutex_lock(&rdev->qp_lock); list_del(&qp->list); @@ -1201,12 +1199,8 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd, qp_destroy: bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp); free_umem: - if (udata) { - if (qp->rumem) - ib_umem_release(qp->rumem); - if (qp->sumem) - ib_umem_release(qp->sumem); - } + ib_umem_release(qp->rumem); + ib_umem_release(qp->sumem); fail: kfree(qp); return ERR_PTR(rc); @@ -1302,8 +1296,7 @@ void bnxt_re_destroy_srq(struct ib_srq *ib_srq, struct ib_udata *udata) if (qplib_srq->cq) nq = qplib_srq->cq->nq; bnxt_qplib_destroy_srq(&rdev->qplib_res, qplib_srq); - if (srq->umem) - ib_umem_release(srq->umem); + ib_umem_release(srq->umem); atomic_dec(&rdev->srq_count); if (nq) nq->budget--; @@ -1412,8 +1405,7 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq, return 0; fail: - if (srq->umem) - ib_umem_release(srq->umem); + ib_umem_release(srq->umem); exit: return rc; } @@ -2528,8 +2520,7 @@ void bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) nq = 
cq->qplib_cq.nq; bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq); - if (!cq->umem) - ib_umem_release(cq->umem); + ib_umem_release(cq->umem); atomic_dec(&rdev->cq_count); nq->budget--; @@ -2632,8 +2623,7 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, return 0; c2fail: - if (udata) - ib_umem_release(cq->umem); + ib_umem_release(cq->umem); fail: kfree(cq->cql); return rc; @@ -3340,8 +3330,7 @@ int bnxt_re_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata) mr->npages = 0; mr->pages = NULL; } - if (!IS_ERR_OR_NULL(mr->ib_umem)) - ib_umem_release(mr->ib_umem); + ib_umem_release(mr->ib_umem); kfree(mr); atomic_dec(&rdev->mr_count); diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 810fa96af2e9..e775c1a1a450 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -346,8 +346,7 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata) xa_erase_irq(&rhp->mrs, mmid); if (mhp->kva) kfree((void *) (unsigned long) mhp->kva); - if (mhp->umem) - ib_umem_release(mhp->umem); + ib_umem_release(mhp->umem); pr_debug("%s mmid 0x%x ptr %p\n", __func__, mmid, mhp); kfree(mhp); return 0; diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index fe3733c4026d..aa772ee0706f 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -808,8 +808,7 @@ int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata) mhp->attr.pbl_size << 3); if (mhp->kva) kfree((void *) (unsigned long) mhp->kva); - if (mhp->umem) - ib_umem_release(mhp->umem); + ib_umem_release(mhp->umem); pr_debug("mmid 0x%x ptr %p\n", mmid, mhp); c4iw_put_wr_wait(mhp->wr_waitp); kfree(mhp); diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index a9372c9e4b30..5e6e5eb65cff 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1513,8 +1513,8 @@ int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) err = efa_com_dereg_mr(&dev->edev, ¶ms); if (err) return err; - ib_umem_release(mr->umem); } + ib_umem_release(mr->umem); kfree(mr); diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 7e198c9ffbfe..6b4d8e50aabe 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -423,9 +423,8 @@ err_dbmap: err_mtt: hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt); - if (udata) - ib_umem_release(hr_cq->umem); - else + ib_umem_release(hr_cq->umem); + if (!udata) hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf, hr_cq->ib_cq.cqe); @@ -451,9 +450,8 @@ void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) hns_roce_free_cq(hr_dev, hr_cq); hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt); + ib_umem_release(hr_cq->umem); if (udata) { - ib_umem_release(hr_cq->umem); - if (hr_cq->db_en == 1) hns_roce_db_unmap_user(rdma_udata_to_drv_context( udata, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index c899879da222..cb004190ccba 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -1163,8 +1163,7 @@ free_mr: hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap, key_to_hw_index(mr->key), 0); - if (mr->umem) - ib_umem_release(mr->umem); + ib_umem_release(mr->umem); kfree(mr); @@ -3641,9 +3640,8 @@ int hns_roce_v1_destroy_qp(struct ib_qp *ibqp, 
struct ib_udata *udata) hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt); - if (udata) - ib_umem_release(hr_qp->umem); - else { + ib_umem_release(hr_qp->umem); + if (!udata) { kfree(hr_qp->sq.wrid); kfree(hr_qp->rq.wrid); @@ -3694,9 +3692,8 @@ static void hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt); - if (ibcq->uobject) - ib_umem_release(hr_cq->umem); - else { + ib_umem_release(hr_cq->umem); + if (!udata) { /* Free the buff of stored cq */ cq_buf_size = (ibcq->cqe + 1) * hr_dev->caps.cq_entry_sz; hns_roce_buf_free(hr_dev, cq_buf_size, &hr_cq->hr_buf.hr_buf); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 5c8551b54444..edd62b4dc0a0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4582,7 +4582,6 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, if (hr_qp->rq.wqe_cnt && (hr_qp->rdb_en == 1)) hns_roce_db_unmap_user(context, &hr_qp->rdb); - ib_umem_release(hr_qp->umem); } else { kfree(hr_qp->sq.wrid); kfree(hr_qp->rq.wrid); @@ -4590,6 +4589,7 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, if (hr_qp->rq.wqe_cnt) hns_roce_free_db(hr_dev, &hr_qp->rdb); } + ib_umem_release(hr_qp->umem); if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) && hr_qp->rq.wqe_cnt) { diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 6db0dae18ab7..adf075183dfb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -1298,9 +1298,7 @@ int hns_roce_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) } else { hns_roce_mr_free(hr_dev, mr); - if (mr->umem) - ib_umem_release(mr->umem); - + ib_umem_release(mr->umem); kfree(mr); } diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 99ec5d43b99b..7e9db8236072 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -938,10 +938,9 @@ err_get_bufs: hns_roce_free_buf_list(buf_list, hr_qp->region_cnt); err_alloc_list: - if (hr_qp->umem) - ib_umem_release(hr_qp->umem); - else + if (!hr_qp->umem) hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf); + ib_umem_release(hr_qp->umem); err_db: if (!udata && hns_roce_qp_has_rq(init_attr) && diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index c222f243953a..de645be8aa48 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -380,8 +380,7 @@ err_idx_buf: hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt); err_idx_mtt: - if (udata) - ib_umem_release(srq->idx_que.umem); + ib_umem_release(srq->idx_que.umem); err_create_idx: hns_roce_buf_free(hr_dev, srq->idx_que.buf_size, @@ -392,9 +391,8 @@ err_srq_mtt: hns_roce_mtt_cleanup(hr_dev, &srq->mtt); err_buf: - if (udata) - ib_umem_release(srq->umem); - else + ib_umem_release(srq->umem); + if (!udata) hns_roce_buf_free(hr_dev, srq_buf_size, &srq->buf); return ret; @@ -408,15 +406,15 @@ void hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) hns_roce_srq_free(hr_dev, srq); hns_roce_mtt_cleanup(hr_dev, &srq->mtt); - if (ibsrq->uobject) { + if (udata) { hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt); - ib_umem_release(srq->idx_que.umem); - ib_umem_release(srq->umem); } else { kvfree(srq->wrid); hns_roce_buf_free(hr_dev, srq->max << srq->wqe_shift, &srq->buf); } + 
ib_umem_release(srq->idx_que.umem); + ib_umem_release(srq->umem); } int hns_roce_init_srq_table(struct hns_roce_dev *hr_dev) diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 3100b0c31b0a..3c0c6aabc64e 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2004,8 +2004,7 @@ static int i40iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata) struct cqp_commands_info *cqp_info; u32 stag_idx; - if (iwmr->region) - ib_umem_release(iwmr->region); + ib_umem_release(iwmr->region); if (iwmr->type != IW_MEMREG_TYPE_MEM) { /* region is released. only test for userness. */ diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 72f238ddafb5..a7d238d312f0 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -277,9 +277,8 @@ err_dbmap: err_mtt: mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt); - if (udata) - ib_umem_release(cq->umem); - else + ib_umem_release(cq->umem); + if (!udata) mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe); err_db: @@ -468,11 +467,8 @@ err_buf: kfree(cq->resize_buf); cq->resize_buf = NULL; - if (cq->resize_umem) { - ib_umem_release(cq->resize_umem); - cq->resize_umem = NULL; - } - + ib_umem_release(cq->resize_umem); + cq->resize_umem = NULL; out: mutex_unlock(&cq->resize_mutex); @@ -494,11 +490,11 @@ void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) struct mlx4_ib_ucontext, ibucontext), &mcq->db); - ib_umem_release(mcq->umem); } else { mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe); mlx4_db_free(dev->dev, &mcq->db); } + ib_umem_release(mcq->umem); } static void dump_cqe(void *cqe) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 520364defa28..82aff2f2fdc2 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1207,10 +1207,9 @@ err_mtt: mlx4_mtt_cleanup(dev->dev, &qp->mtt); err_buf: - if (qp->umem) - ib_umem_release(qp->umem); - else + if (!qp->umem) mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); + ib_umem_release(qp->umem); err_db: if (!udata && qp_has_rq(init_attr)) @@ -1421,7 +1420,6 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, mlx4_ib_db_unmap_user(mcontext, &qp->db); } - ib_umem_release(qp->umem); } else { kvfree(qp->sq.wrid); kvfree(qp->rq.wrid); @@ -1432,6 +1430,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, if (qp->rq.wqe_cnt) mlx4_db_free(dev->dev, &qp->db); } + ib_umem_release(qp->umem); del_gid_entries(qp); } diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index c9f555e04c9f..848db7264cc9 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -204,10 +204,9 @@ err_mtt: mlx4_mtt_cleanup(dev->dev, &srq->mtt); err_buf: - if (srq->umem) - ib_umem_release(srq->umem); - else + if (!srq->umem) mlx4_buf_free(dev->dev, buf_size, &srq->buf); + ib_umem_release(srq->umem); err_db: if (!udata) @@ -275,13 +274,13 @@ void mlx4_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata) struct mlx4_ib_ucontext, ibucontext), &msrq->db); - ib_umem_release(msrq->umem); } else { kvfree(msrq->wrid); mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift, &msrq->buf); mlx4_db_free(dev->dev, &msrq->db); } + ib_umem_release(msrq->umem); } void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 
07b73df0e1a3..22230fd7d741 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -1125,11 +1125,6 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, return 0; } -static void un_resize_user(struct mlx5_ib_cq *cq) -{ - ib_umem_release(cq->resize_umem); -} - static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, int entries, int cqe_size) { @@ -1152,12 +1147,6 @@ ex: return err; } -static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) -{ - free_cq_buf(dev, cq->resize_buf); - cq->resize_buf = NULL; -} - static int copy_resize_cqes(struct mlx5_ib_cq *cq) { struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); @@ -1338,10 +1327,11 @@ ex_alloc: kvfree(in); ex_resize: - if (udata) - un_resize_user(cq); - else - un_resize_kernel(dev, cq); + ib_umem_release(cq->resize_umem); + if (!udata) { + free_cq_buf(dev, cq->resize_buf); + cq->resize_buf = NULL; + } ex: mutex_unlock(&cq->resize_mutex); return err; diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 4d033796dcfc..994abcebb057 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1507,10 +1507,9 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, return 0; err: - if (mr->umem) { - ib_umem_release(mr->umem); - mr->umem = NULL; - } + ib_umem_release(mr->umem); + mr->umem = NULL; + clean_mr(dev, mr); return err; } @@ -1630,10 +1629,10 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) * remove the DMA mapping. */ mlx5_mr_cache_free(dev, mr); - if (umem) { - ib_umem_release(umem); + ib_umem_release(umem); + if (umem) atomic_sub(npages, &dev->mdev->priv.reg_pages); - } + if (!mr->allocated_from_cache) kfree(mr); } diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index ae847709b3d3..12ccbd584d2a 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -790,8 +790,7 @@ static void destroy_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, atomic_dec(&dev->delay_drop.rqs_cnt); mlx5_ib_db_unmap_user(context, &rwq->db); - if (rwq->umem) - ib_umem_release(rwq->umem); + ib_umem_release(rwq->umem); } static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, @@ -977,8 +976,7 @@ err_free: kvfree(*in); err_umem: - if (ubuffer->umem) - ib_umem_release(ubuffer->umem); + ib_umem_release(ubuffer->umem); err_bfreg: if (bfregn != MLX5_IB_INVALID_BFREG) @@ -997,8 +995,7 @@ static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd, ibucontext); mlx5_ib_db_unmap_user(context, &qp->db); - if (base->ubuffer.umem) - ib_umem_release(base->ubuffer.umem); + ib_umem_release(base->ubuffer.umem); /* * Free only the BFREGs which are handled by the kernel. 
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index d97124bee703..23554d8bf241 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -953,8 +953,7 @@ static int mthca_dereg_mr(struct ib_mr *mr, struct ib_udata *udata) struct mthca_mr *mmr = to_mmr(mr); mthca_free_mr(to_mdev(mr->device), mmr); - if (mmr->umem) - ib_umem_release(mmr->umem); + ib_umem_release(mmr->umem); kfree(mmr); return 0; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 10b35edb286b..bccc11378109 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -925,8 +925,7 @@ int ocrdma_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata) ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); /* it could be user registered memory. */ - if (mr->umem) - ib_umem_release(mr->umem); + ib_umem_release(mr->umem); kfree(mr); /* Don't stop cleanup, in case FW is unresponsive */ diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 3fc7a4e901c3..27d90a84ea01 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -1572,12 +1572,10 @@ qedr_iwarp_populate_user_qp(struct qedr_dev *dev, static void qedr_cleanup_user(struct qedr_dev *dev, struct qedr_qp *qp) { - if (qp->usq.umem) - ib_umem_release(qp->usq.umem); + ib_umem_release(qp->usq.umem); qp->usq.umem = NULL; - if (qp->urq.umem) - ib_umem_release(qp->urq.umem); + ib_umem_release(qp->urq.umem); qp->urq.umem = NULL; } @@ -2680,8 +2678,7 @@ int qedr_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata) qedr_free_pbl(dev, &mr->info.pbl_info, mr->info.pbl_table); /* it could be user registered memory. 
*/ - if (mr->umem) - ib_umem_release(mr->umem); + ib_umem_release(mr->umem); kfree(mr); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index 38573fc0a9bf..7800e6930502 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -213,8 +213,7 @@ int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, err_page_dir: pvrdma_page_dir_cleanup(dev, &cq->pdir); err_umem: - if (!cq->is_kernel) - ib_umem_release(cq->umem); + ib_umem_release(cq->umem); err_cq: atomic_dec(&dev->num_cqs); return ret; @@ -226,8 +225,7 @@ static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq) complete(&cq->free); wait_for_completion(&cq->free); - if (!cq->is_kernel) - ib_umem_release(cq->umem); + ib_umem_release(cq->umem); pvrdma_page_dir_cleanup(dev, &cq->pdir); } diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c index 65dc47ffb8f3..f3a3d22ee8d7 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c @@ -290,8 +290,7 @@ int pvrdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) "could not deregister mem region, error: %d\n", ret); pvrdma_page_dir_cleanup(dev, &mr->pdir); - if (mr->umem) - ib_umem_release(mr->umem); + ib_umem_release(mr->umem); kfree(mr->pages); kfree(mr); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c index 0eaaead5baec..bca6a58a442e 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c @@ -391,12 +391,8 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd, err_pdir: pvrdma_page_dir_cleanup(dev, &qp->pdir); err_umem: - if (!qp->is_kernel) { - if (qp->rumem) - ib_umem_release(qp->rumem); - if (qp->sumem) - ib_umem_release(qp->sumem); - } + ib_umem_release(qp->rumem); + ib_umem_release(qp->sumem); err_qp: kfree(qp); atomic_dec(&dev->num_qps); @@ -429,12 +425,8 @@ static void pvrdma_free_qp(struct pvrdma_qp *qp) complete(&qp->free); wait_for_completion(&qp->free); - if (!qp->is_kernel) { - if (qp->rumem) - ib_umem_release(qp->rumem); - if (qp->sumem) - ib_umem_release(qp->sumem); - } + ib_umem_release(qp->rumem); + ib_umem_release(qp->sumem); pvrdma_page_dir_cleanup(dev, &qp->pdir); diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 54f3f9c27552..db800eb2b1f5 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -560,8 +560,7 @@ int rvt_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) if (ret) goto out; rvt_deinit_mregion(&mr->mr); - if (mr->umem) - ib_umem_release(mr->umem); + ib_umem_release(mr->umem); kfree(mr); out: return ret; diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index f501f72489d8..ea6a819b7167 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -96,8 +96,7 @@ void rxe_mem_cleanup(struct rxe_pool_entry *arg) struct rxe_mem *mem = container_of(arg, typeof(*mem), pelem); int i; - if (mem->umem) - ib_umem_release(mem->umem); + ib_umem_release(mem->umem); if (mem->map) { for (i = 0; i < mem->num_map; i++) -- cgit v1.2.3 From 4a9ceb7dbadf9e1435644b1f49720ee87431ce26 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Thu, 13 Jun 2019 08:30:52 -0400 Subject: IB/{rdmavt, qib, hfi1}: Convert to new completion API Convert all completions to use the new completion routine that 
fixes a race between post send and completion where fields from a SWQE can be read after SWQE has been freed. This patch also addresses issues reported in https://marc.info/?l=linux-kernel&m=155656897409107&w=2. The reserved operation path has no need for any barrier. The barrier for the other path is addressed by the smp_load_acquire() barrier. Cc: Andrea Parri Reviewed-by: Michael J. Ruhl Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 26 ++++---------------------- drivers/infiniband/hw/qib/qib_rc.c | 26 ++++---------------------- drivers/infiniband/sw/rdmavt/qp.c | 31 +++++++++---------------------- include/rdma/rdmavt_qp.h | 36 ------------------------------------ 4 files changed, 17 insertions(+), 102 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index a922edcf23d6..84b51cc36dbd 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -1819,23 +1819,14 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah) } while (qp->s_last != qp->s_acked) { - u32 s_last; - wqe = rvt_get_swqe_ptr(qp, qp->s_last); if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 && cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) break; trdma_clean_swqe(qp, wqe); rvt_qp_wqe_unreserve(qp, wqe); - s_last = qp->s_last; - trace_hfi1_qp_send_completion(qp, wqe, s_last); - if (++s_last >= qp->s_size) - s_last = 0; - qp->s_last = s_last; - /* see post_send() */ - barrier(); - rvt_put_qp_swqe(qp, wqe); - rvt_qp_swqe_complete(qp, + trace_hfi1_qp_send_completion(qp, wqe, qp->s_last); + rvt_qp_complete_swqe(qp, wqe, ib_hfi1_wc_opcode[wqe->wr.opcode], IB_WC_SUCCESS); @@ -1879,19 +1870,10 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, trace_hfi1_rc_completion(qp, wqe->lpsn); if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 || cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { - u32 s_last; - trdma_clean_swqe(qp, wqe); - rvt_put_qp_swqe(qp, wqe); rvt_qp_wqe_unreserve(qp, wqe); - s_last = qp->s_last; - trace_hfi1_qp_send_completion(qp, wqe, s_last); - if (++s_last >= qp->s_size) - s_last = 0; - qp->s_last = s_last; - /* see post_send() */ - barrier(); - rvt_qp_swqe_complete(qp, + trace_hfi1_qp_send_completion(qp, wqe, qp->s_last); + rvt_qp_complete_swqe(qp, wqe, ib_hfi1_wc_opcode[wqe->wr.opcode], IB_WC_SUCCESS); diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 2ac4c67f5ba1..8d9a94d6f685 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -921,20 +921,11 @@ void qib_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr) rvt_add_retry_timer(qp); while (qp->s_last != qp->s_acked) { - u32 s_last; - wqe = rvt_get_swqe_ptr(qp, qp->s_last); if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 && qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) break; - s_last = qp->s_last; - if (++s_last >= qp->s_size) - s_last = 0; - qp->s_last = s_last; - /* see post_send() */ - barrier(); - rvt_put_qp_swqe(qp, wqe); - rvt_qp_swqe_complete(qp, + rvt_qp_complete_swqe(qp, wqe, ib_qib_wc_opcode[wqe->wr.opcode], IB_WC_SUCCESS); @@ -972,21 +963,12 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, * is finished. 
*/ if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 || - qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { - u32 s_last; - - rvt_put_qp_swqe(qp, wqe); - s_last = qp->s_last; - if (++s_last >= qp->s_size) - s_last = 0; - qp->s_last = s_last; - /* see post_send() */ - barrier(); - rvt_qp_swqe_complete(qp, + qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) + rvt_qp_complete_swqe(qp, wqe, ib_qib_wc_opcode[wqe->wr.opcode], IB_WC_SUCCESS); - } else + else this_cpu_inc(*ibp->rvp.rc_delayed_comp); qp->s_retry = qp->s_retry_cnt; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index a60f5faea198..dfbc7d8640fb 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1853,10 +1853,9 @@ static inline int rvt_qp_is_avail( /* see rvt_qp_wqe_unreserve() */ smp_mb__before_atomic(); - reserved_used = atomic_read(&qp->s_reserved_used); if (unlikely(reserved_op)) { /* see rvt_qp_wqe_unreserve() */ - smp_mb__before_atomic(); + reserved_used = atomic_read(&qp->s_reserved_used); if (reserved_used >= rdi->dparms.reserved_operations) return -ENOMEM; return 0; @@ -1864,14 +1863,13 @@ static inline int rvt_qp_is_avail( /* non-reserved operations */ if (likely(qp->s_avail)) return 0; - slast = READ_ONCE(qp->s_last); + /* See rvt_qp_complete_swqe() */ + slast = smp_load_acquire(&qp->s_last); if (qp->s_head >= slast) avail = qp->s_size - (qp->s_head - slast); else avail = slast - qp->s_head; - /* see rvt_qp_wqe_unreserve() */ - smp_mb__before_atomic(); reserved_used = atomic_read(&qp->s_reserved_used); avail = avail - 1 - (rdi->dparms.reserved_operations - reserved_used); @@ -2664,27 +2662,16 @@ void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, enum ib_wc_status status) { u32 old_last, last; - struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + struct rvt_dev_info *rdi; if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) return; + rdi = ib_to_rvt(qp->ibqp.device); - last = qp->s_last; - old_last = last; - trace_rvt_qp_send_completion(qp, wqe, last); - if (++last >= qp->s_size) - last = 0; - trace_rvt_qp_send_completion(qp, wqe, last); - qp->s_last = last; - /* See post_send() */ - barrier(); - rvt_put_qp_swqe(qp, wqe); - - rvt_qp_swqe_complete(qp, - wqe, - rdi->wc_opcode[wqe->wr.opcode], - status); - + old_last = qp->s_last; + trace_rvt_qp_send_completion(qp, wqe, old_last); + last = rvt_qp_complete_swqe(qp, wqe, rdi->wc_opcode[wqe->wr.opcode], + status); if (qp->s_acked == old_last) qp->s_acked = last; if (qp->s_cur == old_last) diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 6014f1766907..84d0f36afc2f 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -565,42 +565,6 @@ static inline void rvt_qp_wqe_unreserve( extern const enum ib_wc_opcode ib_rvt_wc_opcode[]; -/** - * rvt_qp_swqe_complete() - insert send completion - * @qp - the qp - * @wqe - the send wqe - * @status - completion status - * - * Insert a send completion into the completion - * queue if the qp indicates it should be done. - * - * See IBTA 10.7.3.1 for info on completion - * control. 
- */ -static inline void rvt_qp_swqe_complete( - struct rvt_qp *qp, - struct rvt_swqe *wqe, - enum ib_wc_opcode opcode, - enum ib_wc_status status) -{ - if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED)) - return; - if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) || - (wqe->wr.send_flags & IB_SEND_SIGNALED) || - status != IB_WC_SUCCESS) { - struct ib_wc wc; - - memset(&wc, 0, sizeof(wc)); - wc.wr_id = wqe->wr.wr_id; - wc.status = status; - wc.opcode = opcode; - wc.qp = &qp->ibqp; - wc.byte_len = wqe->length; - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, - status != IB_WC_SUCCESS); - } -} - /* * Compare the lower 24 bits of the msn values. * Returns an integer <, ==, or > than zero. -- cgit v1.2.3 From 239b0e52d8aa64d2559c672fd8c29cf1fffc3ec7 Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Fri, 28 Jun 2019 14:04:17 -0400 Subject: IB/hfi1: Move rvt_cq_wc struct into uapi directory The rvt_cq_wc struct elements are shared between rdmavt and the providers but not in uapi directory. As per the comment in https://marc.info/?l=linux-rdma&m=152296522708522&w=2 The hfi1 driver and the rdma core driver are not using shared structures in the uapi directory. In that case, move rvt_cq_wc struct into the rvt-abi.h header file and create a rvt_k_cq_w for the kernel completion queue. Signed-off-by: Kamenee Arumugam Reviewed-by: Mike Marciniszyn Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/qp.c | 4 +- drivers/infiniband/sw/rdmavt/cq.c | 192 ++++++++++++++++++++++++-------------- include/rdma/rdmavt_cq.h | 22 +++-- include/rdma/rdmavt_qp.h | 32 +++++++ include/uapi/rdma/rvt-abi.h | 32 +++++++ 5 files changed, 205 insertions(+), 77 deletions(-) create mode 100644 include/uapi/rdma/rvt-abi.h (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 4e0e9fc0a777..41261e72c429 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -702,8 +702,8 @@ void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter) sde ? sde->this_idx : 0, send_context, send_context ? send_context->sw_index : 0, - ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->head, - ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->tail, + ib_cq_head(qp->ibqp.send_cq), + ib_cq_tail(qp->ibqp.send_cq), qp->pid, qp->s_state, qp->s_ack_state, diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index b46714a92b7a..2602ad8b8cb0 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -63,19 +63,33 @@ static struct workqueue_struct *comp_vector_wq; */ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) { - struct rvt_cq_wc *wc; + struct ib_uverbs_wc *uqueue = NULL; + struct ib_wc *kqueue = NULL; + struct rvt_cq_wc *u_wc = NULL; + struct rvt_k_cq_wc *k_wc = NULL; unsigned long flags; u32 head; u32 next; + u32 tail; spin_lock_irqsave(&cq->lock, flags); + if (cq->ip) { + u_wc = cq->queue; + uqueue = &u_wc->uqueue[0]; + head = RDMA_READ_UAPI_ATOMIC(u_wc->head); + tail = RDMA_READ_UAPI_ATOMIC(u_wc->tail); + } else { + k_wc = cq->kqueue; + kqueue = &k_wc->kqueue[0]; + head = k_wc->head; + tail = k_wc->tail; + } + /* - * Note that the head pointer might be writable by user processes. - * Take care to verify it is a sane value. + * Note that the head pointer might be writable by + * user processes.Take care to verify it is a sane value. 
*/ - wc = cq->queue; - head = wc->head; if (head >= (unsigned)cq->ibcq.cqe) { head = cq->ibcq.cqe; next = 0; @@ -83,7 +97,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) next = head + 1; } - if (unlikely(next == wc->tail)) { + if (unlikely(next == tail)) { spin_unlock_irqrestore(&cq->lock, flags); if (cq->ibcq.event_handler) { struct ib_event ev; @@ -96,27 +110,27 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) return; } trace_rvt_cq_enter(cq, entry, head); - if (cq->ip) { - wc->uqueue[head].wr_id = entry->wr_id; - wc->uqueue[head].status = entry->status; - wc->uqueue[head].opcode = entry->opcode; - wc->uqueue[head].vendor_err = entry->vendor_err; - wc->uqueue[head].byte_len = entry->byte_len; - wc->uqueue[head].ex.imm_data = entry->ex.imm_data; - wc->uqueue[head].qp_num = entry->qp->qp_num; - wc->uqueue[head].src_qp = entry->src_qp; - wc->uqueue[head].wc_flags = entry->wc_flags; - wc->uqueue[head].pkey_index = entry->pkey_index; - wc->uqueue[head].slid = ib_lid_cpu16(entry->slid); - wc->uqueue[head].sl = entry->sl; - wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; - wc->uqueue[head].port_num = entry->port_num; + if (uqueue) { + uqueue[head].wr_id = entry->wr_id; + uqueue[head].status = entry->status; + uqueue[head].opcode = entry->opcode; + uqueue[head].vendor_err = entry->vendor_err; + uqueue[head].byte_len = entry->byte_len; + uqueue[head].ex.imm_data = entry->ex.imm_data; + uqueue[head].qp_num = entry->qp->qp_num; + uqueue[head].src_qp = entry->src_qp; + uqueue[head].wc_flags = entry->wc_flags; + uqueue[head].pkey_index = entry->pkey_index; + uqueue[head].slid = ib_lid_cpu16(entry->slid); + uqueue[head].sl = entry->sl; + uqueue[head].dlid_path_bits = entry->dlid_path_bits; + uqueue[head].port_num = entry->port_num; /* Make sure entry is written before the head index. */ - smp_wmb(); + RDMA_WRITE_UAPI_ATOMIC(u_wc->head, next); } else { - wc->kqueue[head] = *entry; + kqueue[head] = *entry; + k_wc->head = next; } - wc->head = next; if (cq->notify == IB_CQ_NEXT_COMP || (cq->notify == IB_CQ_SOLICITED && @@ -179,8 +193,9 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, { struct ib_device *ibdev = ibcq->device; struct rvt_dev_info *rdi = ib_to_rvt(ibdev); - struct rvt_cq *cq = container_of(ibcq, struct rvt_cq, ibcq); - struct rvt_cq_wc *wc; + struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); + struct rvt_cq_wc *u_wc = NULL; + struct rvt_k_cq_wc *k_wc = NULL; u32 sz; unsigned int entries = attr->cqe; int comp_vector = attr->comp_vector; @@ -204,22 +219,28 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, * We need to use vmalloc() in order to support mmap and large * numbers of entries. */ - sz = sizeof(*wc); - if (udata && udata->outlen >= sizeof(__u64)) - sz += sizeof(struct ib_uverbs_wc) * (entries + 1); - else - sz += sizeof(struct ib_wc) * (entries + 1); - wc = udata ? - vmalloc_user(sz) : - vzalloc_node(sz, rdi->dparms.node); - if (!wc) - return -ENOMEM; + if (udata && udata->outlen >= sizeof(__u64)) { + sz = sizeof(struct ib_uverbs_wc) * (entries + 1); + sz += sizeof(*u_wc); + u_wc = vmalloc_user(sz); + if (!u_wc) + return -ENOMEM; + } else { + sz = sizeof(struct ib_wc) * (entries + 1); + sz += sizeof(*k_wc); + k_wc = vzalloc_node(sz, rdi->dparms.node); + if (!k_wc) + return -ENOMEM; + } + /* * Return the address of the WC as the offset to mmap. * See rvt_mmap() for details. 
*/ if (udata && udata->outlen >= sizeof(__u64)) { - cq->ip = rvt_create_mmap_info(rdi, sz, udata, wc); + int err; + + cq->ip = rvt_create_mmap_info(rdi, sz, udata, u_wc); if (!cq->ip) { err = -ENOMEM; goto bail_wc; @@ -264,7 +285,10 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, cq->notify = RVT_CQ_NONE; spin_lock_init(&cq->lock); INIT_WORK(&cq->comptask, send_complete); - cq->queue = wc; + if (u_wc) + cq->queue = u_wc; + else + cq->kqueue = k_wc; trace_rvt_create_cq(cq, attr); return 0; @@ -272,7 +296,8 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, bail_ip: kfree(cq->ip); bail_wc: - vfree(wc); + vfree(u_wc); + vfree(k_wc); return err; } @@ -322,9 +347,16 @@ int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) if (cq->notify != IB_CQ_NEXT_COMP) cq->notify = notify_flags & IB_CQ_SOLICITED_MASK; - if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && - cq->queue->head != cq->queue->tail) - ret = 1; + if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) { + if (cq->queue) { + if (RDMA_READ_UAPI_ATOMIC(cq->queue->head) != + RDMA_READ_UAPI_ATOMIC(cq->queue->tail)) + ret = 1; + } else { + if (cq->kqueue->head != cq->kqueue->tail) + ret = 1; + } + } spin_unlock_irqrestore(&cq->lock, flags); @@ -340,12 +372,14 @@ int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) { struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); - struct rvt_cq_wc *old_wc; - struct rvt_cq_wc *wc; u32 head, tail, n; int ret; u32 sz; struct rvt_dev_info *rdi = cq->rdi; + struct rvt_cq_wc *u_wc = NULL; + struct rvt_cq_wc *old_u_wc = NULL; + struct rvt_k_cq_wc *k_wc = NULL; + struct rvt_k_cq_wc *old_k_wc = NULL; if (cqe < 1 || cqe > rdi->dparms.props.max_cqe) return -EINVAL; @@ -353,17 +387,19 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) /* * Need to use vmalloc() if we want to support large #s of entries. */ - sz = sizeof(*wc); - if (udata && udata->outlen >= sizeof(__u64)) - sz += sizeof(struct ib_uverbs_wc) * (cqe + 1); - else - sz += sizeof(struct ib_wc) * (cqe + 1); - wc = udata ? - vmalloc_user(sz) : - vzalloc_node(sz, rdi->dparms.node); - if (!wc) - return -ENOMEM; - + if (udata && udata->outlen >= sizeof(__u64)) { + sz = sizeof(struct ib_uverbs_wc) * (cqe + 1); + sz += sizeof(*u_wc); + u_wc = vmalloc_user(sz); + if (!u_wc) + return -ENOMEM; + } else { + sz = sizeof(struct ib_wc) * (cqe + 1); + sz += sizeof(*k_wc); + k_wc = vzalloc_node(sz, rdi->dparms.node); + if (!k_wc) + return -ENOMEM; + } /* Check that we can write the offset to mmap. */ if (udata && udata->outlen >= sizeof(__u64)) { __u64 offset = 0; @@ -378,11 +414,18 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) * Make sure head and tail are sane since they * might be user writable. 
*/ - old_wc = cq->queue; - head = old_wc->head; + if (u_wc) { + old_u_wc = cq->queue; + head = RDMA_READ_UAPI_ATOMIC(old_u_wc->head); + tail = RDMA_READ_UAPI_ATOMIC(old_u_wc->tail); + } else { + old_k_wc = cq->kqueue; + head = old_k_wc->head; + tail = old_k_wc->tail; + } + if (head > (u32)cq->ibcq.cqe) head = (u32)cq->ibcq.cqe; - tail = old_wc->tail; if (tail > (u32)cq->ibcq.cqe) tail = (u32)cq->ibcq.cqe; if (head < tail) @@ -394,27 +437,36 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) goto bail_unlock; } for (n = 0; tail != head; n++) { - if (cq->ip) - wc->uqueue[n] = old_wc->uqueue[tail]; + if (u_wc) + u_wc->uqueue[n] = old_u_wc->uqueue[tail]; else - wc->kqueue[n] = old_wc->kqueue[tail]; + k_wc->kqueue[n] = old_k_wc->kqueue[tail]; if (tail == (u32)cq->ibcq.cqe) tail = 0; else tail++; } cq->ibcq.cqe = cqe; - wc->head = n; - wc->tail = 0; - cq->queue = wc; + if (u_wc) { + RDMA_WRITE_UAPI_ATOMIC(u_wc->head, n); + RDMA_WRITE_UAPI_ATOMIC(u_wc->tail, 0); + cq->queue = u_wc; + } else { + k_wc->head = n; + k_wc->tail = 0; + cq->kqueue = k_wc; + } spin_unlock_irq(&cq->lock); - vfree(old_wc); + if (u_wc) + vfree(old_u_wc); + else + vfree(old_k_wc); if (cq->ip) { struct rvt_mmap_info *ip = cq->ip; - rvt_update_mmap_info(rdi, ip, sz, wc); + rvt_update_mmap_info(rdi, ip, sz, u_wc); /* * Return the offset to mmap. @@ -438,7 +490,9 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) bail_unlock: spin_unlock_irq(&cq->lock); bail_free: - vfree(wc); + vfree(u_wc); + vfree(k_wc); + return ret; } @@ -456,7 +510,7 @@ bail_free: int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) { struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); - struct rvt_cq_wc *wc; + struct rvt_k_cq_wc *wc; unsigned long flags; int npolled; u32 tail; @@ -467,7 +521,7 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) spin_lock_irqsave(&cq->lock, flags); - wc = cq->queue; + wc = cq->kqueue; tail = wc->tail; if (tail > (u32)cq->ibcq.cqe) tail = (u32)cq->ibcq.cqe; diff --git a/include/rdma/rdmavt_cq.h b/include/rdma/rdmavt_cq.h index 75dc65c0bfb8..ab22860a63e2 100644 --- a/include/rdma/rdmavt_cq.h +++ b/include/rdma/rdmavt_cq.h @@ -60,19 +60,28 @@ */ #define RVT_CQ_NONE (IB_CQ_NEXT_COMP + 1) +/* + * Define read macro that apply smp_load_acquire memory barrier + * when reading indice of circular buffer that mmaped to user space. + */ +#define RDMA_READ_UAPI_ATOMIC(member) smp_load_acquire(&(member).val) + +/* + * Define write macro that uses smp_store_release memory barrier + * when writing indice of circular buffer that mmaped to user space. + */ +#define RDMA_WRITE_UAPI_ATOMIC(member, x) smp_store_release(&(member).val, x) +#include + /* * This structure is used to contain the head pointer, tail pointer, * and completion queue entries as a single memory allocation so * it can be mmap'ed into user space. 
*/ -struct rvt_cq_wc { +struct rvt_k_cq_wc { u32 head; /* index of next entry to fill */ u32 tail; /* index of next ib_poll_cq() entry */ - union { - /* these are actually size ibcq.cqe + 1 */ - struct ib_uverbs_wc uqueue[0]; - struct ib_wc kqueue[0]; - }; + struct ib_wc kqueue[]; }; /* @@ -88,6 +97,7 @@ struct rvt_cq { struct rvt_dev_info *rdi; struct rvt_cq_wc *queue; struct rvt_mmap_info *ip; + struct rvt_k_cq_wc *kqueue; }; static inline struct rvt_cq *ibcq_to_rvtcq(struct ib_cq *ibcq) diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 84d0f36afc2f..7fcd687af278 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -820,6 +820,38 @@ struct rvt_qp_iter { int n; }; +/** + * ib_cq_tail - Return tail index of cq buffer + * @send_cq - The cq for send + * + * This is called in qp_iter_print to get tail + * of cq buffer. + */ +static inline u32 ib_cq_tail(struct ib_cq *send_cq) +{ + struct rvt_cq *cq = ibcq_to_rvtcq(send_cq); + + return ibcq_to_rvtcq(send_cq)->ip ? + RDMA_READ_UAPI_ATOMIC(cq->queue->tail) : + ibcq_to_rvtcq(send_cq)->kqueue->tail; +} + +/** + * ib_cq_head - Return head index of cq buffer + * @send_cq - The cq for send + * + * This is called in qp_iter_print to get head + * of cq buffer. + */ +static inline u32 ib_cq_head(struct ib_cq *send_cq) +{ + struct rvt_cq *cq = ibcq_to_rvtcq(send_cq); + + return ibcq_to_rvtcq(send_cq)->ip ? + RDMA_READ_UAPI_ATOMIC(cq->queue->head) : + ibcq_to_rvtcq(send_cq)->kqueue->head; +} + struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi, u64 v, void (*cb)(struct rvt_qp *qp, u64 v)); diff --git a/include/uapi/rdma/rvt-abi.h b/include/uapi/rdma/rvt-abi.h new file mode 100644 index 000000000000..8e5f7e0c15fe --- /dev/null +++ b/include/uapi/rdma/rvt-abi.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ + +/* + * This file contains defines, structures, etc. that are used + * to communicate between kernel and user code. + */ + +#ifndef RVT_ABI_USER_H +#define RVT_ABI_USER_H + +#include +#include +#ifndef RDMA_ATOMIC_UAPI +#define RDMA_ATOMIC_UAPI(_type, _name) struct{ _type val; } _name +#endif + +/* + * This structure is used to contain the head pointer, tail pointer, + * and completion queue entries as a single memory allocation so + * it can be mmap'ed into user space. + */ +struct rvt_cq_wc { + /* index of next entry to fill */ + RDMA_ATOMIC_UAPI(__u32, head); + /* index of next ib_poll_cq() entry */ + RDMA_ATOMIC_UAPI(__u32, tail); + + /* these are actually size ibcq.cqe + 1 */ + struct ib_uverbs_wc uqueue[]; +}; + +#endif /* RVT_ABI_USER_H */ -- cgit v1.2.3 From dabac6e460ce8473f1e685432a8ab7818d81a1f1 Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Fri, 28 Jun 2019 14:04:24 -0400 Subject: IB/hfi1: Move receive work queue struct into uapi directory The rvt_rwqe and rvt_rwq struct elements are shared between rdmavt and the providers but are not in uapi directory. As per the comment in https://marc.info/?l=linux-rdma&m=152296522708522&w=2, The hfi1 driver and the rdma core driver are not using shared structures in the uapi directory. Move rvt_rwqe and rvt_rwq struct into rvt-abi.h header in uapi directory. Reviewed-by: Mike Marciniszyn Reviewed-by: Michael J. 
Ruhl Signed-off-by: Kamenee Arumugam Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/qp.c | 152 +++++++++++++++++++++++++++---------- drivers/infiniband/sw/rdmavt/qp.h | 2 + drivers/infiniband/sw/rdmavt/rc.c | 10 ++- drivers/infiniband/sw/rdmavt/srq.c | 59 +++++++------- include/rdma/rdmavt_qp.h | 52 ++++++++----- include/uapi/rdma/rvt-abi.h | 29 +++++++ 6 files changed, 212 insertions(+), 92 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 0d804a58f954..1384060f175d 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -802,6 +802,46 @@ static void rvt_remove_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp) } } +/** + * rvt_alloc_rq - allocate memory for user or kernel buffer + * @rq: receive queue data structure + * @size: number of request queue entries + * @node: The NUMA node + * @udata: True if user data is available or not false + * + * Return: If memory allocation failed, return -ENONEM + * This function is used by both shared receive + * queues and non-shared receive queues to allocate + * memory. + */ +int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node, + struct ib_udata *udata) +{ + if (udata) { + rq->wq = vmalloc_user(sizeof(struct rvt_rwq) + size); + if (!rq->wq) + goto bail; + /* need kwq with no buffers */ + rq->kwq = kzalloc_node(sizeof(*rq->kwq), GFP_KERNEL, node); + if (!rq->kwq) + goto bail; + rq->kwq->curr_wq = rq->wq->wq; + } else { + /* need kwq with buffers */ + rq->kwq = + vzalloc_node(sizeof(struct rvt_krwq) + size, node); + if (!rq->kwq) + goto bail; + rq->kwq->curr_wq = rq->kwq->wq; + } + + spin_lock_init(&rq->lock); + return 0; +bail: + rvt_free_rq(rq); + return -ENOMEM; +} + /** * rvt_init_qp - initialize the QP state to the reset state * @qp: the QP to init or reinit @@ -852,10 +892,6 @@ static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, qp->s_tail_ack_queue = 0; qp->s_acked_ack_queue = 0; qp->s_num_rd_atomic = 0; - if (qp->r_rq.wq) { - qp->r_rq.wq->head = 0; - qp->r_rq.wq->tail = 0; - } qp->r_sge.num_sge = 0; atomic_set(&qp->s_reserved_used, 0); } @@ -1046,17 +1082,12 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, qp->r_rq.max_sge = init_attr->cap.max_recv_sge; sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) + sizeof(struct rvt_rwqe); - if (udata) - qp->r_rq.wq = vmalloc_user( - sizeof(struct rvt_rwq) + - qp->r_rq.size * sz); - else - qp->r_rq.wq = vzalloc_node( - sizeof(struct rvt_rwq) + - qp->r_rq.size * sz, - rdi->dparms.node); - if (!qp->r_rq.wq) + err = rvt_alloc_rq(&qp->r_rq, qp->r_rq.size * sz, + rdi->dparms.node, udata); + if (err) { + ret = ERR_PTR(err); goto bail_driver_priv; + } } /* @@ -1202,8 +1233,7 @@ bail_qpn: rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num); bail_rq_wq: - if (!qp->ip) - vfree(qp->r_rq.wq); + rvt_free_rq(&qp->r_rq); bail_driver_priv: rdi->driver_f.qp_priv_free(rdi, qp); @@ -1269,19 +1299,26 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) } wc.status = IB_WC_WR_FLUSH_ERR; - if (qp->r_rq.wq) { - struct rvt_rwq *wq; + if (qp->r_rq.kwq) { u32 head; u32 tail; + struct rvt_rwq *wq = NULL; + struct rvt_krwq *kwq = NULL; spin_lock(&qp->r_rq.lock); - + /* qp->ip used to validate if there is a user buffer mmaped */ + if (qp->ip) { + wq = qp->r_rq.wq; + head = RDMA_READ_UAPI_ATOMIC(wq->head); + tail = RDMA_READ_UAPI_ATOMIC(wq->tail); + } else { + kwq = qp->r_rq.kwq; + head = kwq->head; + tail = kwq->tail; + } /* sanity 
check pointers before trusting them */ - wq = qp->r_rq.wq; - head = wq->head; if (head >= qp->r_rq.size) head = 0; - tail = wq->tail; if (tail >= qp->r_rq.size) tail = 0; while (tail != head) { @@ -1290,8 +1327,10 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) tail = 0; rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); } - wq->tail = tail; - + if (qp->ip) + RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail); + else + kwq->tail = tail; spin_unlock(&qp->r_rq.lock); } else if (qp->ibqp.event_handler) { ret = 1; @@ -1634,8 +1673,7 @@ int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) if (qp->ip) kref_put(&qp->ip->ref, rvt_release_mmap_info); - else - vfree(qp->r_rq.wq); + kvfree(qp->r_rq.kwq); rdi->driver_f.qp_priv_free(rdi, qp); kfree(qp->s_ack_queue); rdma_destroy_ah_attr(&qp->remote_ah_attr); @@ -1721,7 +1759,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); - struct rvt_rwq *wq = qp->r_rq.wq; + struct rvt_krwq *wq = qp->r_rq.kwq; unsigned long flags; int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) && !qp->ibqp.srq; @@ -1746,7 +1784,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, next = wq->head + 1; if (next >= qp->r_rq.size) next = 0; - if (next == wq->tail) { + if (next == READ_ONCE(wq->tail)) { spin_unlock_irqrestore(&qp->r_rq.lock, flags); *bad_wr = wr; return -ENOMEM; @@ -1770,8 +1808,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, * Make sure queue entry is written * before the head index. */ - smp_wmb(); - wq->head = next; + smp_store_release(&wq->head, next); } spin_unlock_irqrestore(&qp->r_rq.lock, flags); } @@ -2141,7 +2178,7 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq); - struct rvt_rwq *wq; + struct rvt_krwq *wq; unsigned long flags; for (; wr; wr = wr->next) { @@ -2155,11 +2192,11 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, } spin_lock_irqsave(&srq->rq.lock, flags); - wq = srq->rq.wq; + wq = srq->rq.kwq; next = wq->head + 1; if (next >= srq->rq.size) next = 0; - if (next == wq->tail) { + if (next == READ_ONCE(wq->tail)) { spin_unlock_irqrestore(&srq->rq.lock, flags); *bad_wr = wr; return -ENOMEM; @@ -2171,8 +2208,7 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, for (i = 0; i < wr->num_sge; i++) wqe->sg_list[i] = wr->sg_list[i]; /* Make sure queue entry is written before the head index. 
*/ - smp_wmb(); - wq->head = next; + smp_store_release(&wq->head, next); spin_unlock_irqrestore(&srq->rq.lock, flags); } return 0; @@ -2229,6 +2265,25 @@ bad_lkey: return 0; } +/** + * get_rvt_head - get head indices of the circular buffer + * @rq: data structure for request queue entry + * @ip: the QP + * + * Return - head index value + */ +static inline u32 get_rvt_head(struct rvt_rq *rq, void *ip) +{ + u32 head; + + if (ip) + head = RDMA_READ_UAPI_ATOMIC(rq->wq->head); + else + head = rq->kwq->head; + + return head; +} + /** * rvt_get_rwqe - copy the next RWQE into the QP's RWQE * @qp: the QP @@ -2243,21 +2298,26 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) { unsigned long flags; struct rvt_rq *rq; + struct rvt_krwq *kwq; struct rvt_rwq *wq; struct rvt_srq *srq; struct rvt_rwqe *wqe; void (*handler)(struct ib_event *, void *); u32 tail; + u32 head; int ret; + void *ip = NULL; if (qp->ibqp.srq) { srq = ibsrq_to_rvtsrq(qp->ibqp.srq); handler = srq->ibsrq.event_handler; rq = &srq->rq; + ip = srq->ip; } else { srq = NULL; handler = NULL; rq = &qp->r_rq; + ip = qp->ip; } spin_lock_irqsave(&rq->lock, flags); @@ -2265,17 +2325,24 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) ret = 0; goto unlock; } + if (ip) { + wq = rq->wq; + tail = RDMA_READ_UAPI_ATOMIC(wq->tail); + } else { + kwq = rq->kwq; + tail = kwq->tail; + } - wq = rq->wq; - tail = wq->tail; /* Validate tail before using it since it is user writable. */ if (tail >= rq->size) tail = 0; - if (unlikely(tail == wq->head)) { + + head = get_rvt_head(rq, ip); + if (unlikely(tail == head)) { ret = 0; goto unlock; } - /* Make sure entry is read after head index is read. */ + /* Make sure entry is read after the count is read. */ smp_rmb(); wqe = rvt_get_rwqe_ptr(rq, tail); /* @@ -2285,7 +2352,10 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) */ if (++tail >= rq->size) tail = 0; - wq->tail = tail; + if (ip) + RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail); + else + kwq->tail = tail; if (!wr_id_only && !init_sge(qp, wqe)) { ret = -1; goto unlock; @@ -2301,7 +2371,7 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) * Validate head pointer value and compute * the number of remaining WQEs. 
*/ - n = wq->head; + n = get_rvt_head(rq, ip); if (n >= rq->size) n = 0; if (n < tail) diff --git a/drivers/infiniband/sw/rdmavt/qp.h b/drivers/infiniband/sw/rdmavt/qp.h index 6db1619389b0..2cdba1283bf6 100644 --- a/drivers/infiniband/sw/rdmavt/qp.h +++ b/drivers/infiniband/sw/rdmavt/qp.h @@ -68,4 +68,6 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr); int rvt_wss_init(struct rvt_dev_info *rdi); void rvt_wss_exit(struct rvt_dev_info *rdi); +int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node, + struct ib_udata *udata); #endif /* DEF_RVTQP_H */ diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c index 09f0cf538be6..44cc7ee1b321 100644 --- a/drivers/infiniband/sw/rdmavt/rc.c +++ b/drivers/infiniband/sw/rdmavt/rc.c @@ -104,15 +104,19 @@ __be32 rvt_compute_aeth(struct rvt_qp *qp) } else { u32 min, max, x; u32 credits; - struct rvt_rwq *wq = qp->r_rq.wq; u32 head; u32 tail; /* sanity check pointers before trusting them */ - head = wq->head; + if (qp->ip) { + head = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->head); + tail = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->tail); + } else { + head = READ_ONCE(qp->r_rq.kwq->head); + tail = READ_ONCE(qp->r_rq.kwq->tail); + } if (head >= qp->r_rq.size) head = 0; - tail = wq->tail; if (tail >= qp->r_rq.size) tail = 0; /* diff --git a/drivers/infiniband/sw/rdmavt/srq.c b/drivers/infiniband/sw/rdmavt/srq.c index 8d6b3e764255..d306f6547cba 100644 --- a/drivers/infiniband/sw/rdmavt/srq.c +++ b/drivers/infiniband/sw/rdmavt/srq.c @@ -52,7 +52,7 @@ #include "srq.h" #include "vt.h" - +#include "qp.h" /** * rvt_driver_srq_init - init srq resources on a per driver basis * @rdi: rvt dev structure @@ -97,11 +97,8 @@ int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr, srq->rq.max_sge = srq_init_attr->attr.max_sge; sz = sizeof(struct ib_sge) * srq->rq.max_sge + sizeof(struct rvt_rwqe); - srq->rq.wq = udata ? - vmalloc_user(sizeof(struct rvt_rwq) + srq->rq.size * sz) : - vzalloc_node(sizeof(struct rvt_rwq) + srq->rq.size * sz, - dev->dparms.node); - if (!srq->rq.wq) { + if (rvt_alloc_rq(&srq->rq, srq->rq.size * sz, + dev->dparms.node, udata)) { ret = -ENOMEM; goto bail_srq; } @@ -152,7 +149,7 @@ int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr, bail_ip: kfree(srq->ip); bail_wq: - vfree(srq->rq.wq); + rvt_free_rq(&srq->rq); bail_srq: return ret; } @@ -172,11 +169,12 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, { struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq); struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device); - struct rvt_rwq *wq; + struct rvt_rq tmp_rq = {}; int ret = 0; if (attr_mask & IB_SRQ_MAX_WR) { - struct rvt_rwq *owq; + struct rvt_krwq *okwq = NULL; + struct rvt_rwq *owq = NULL; struct rvt_rwqe *p; u32 sz, size, n, head, tail; @@ -185,17 +183,12 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, ((attr_mask & IB_SRQ_LIMIT) ? attr->srq_limit : srq->limit) > attr->max_wr) return -EINVAL; - sz = sizeof(struct rvt_rwqe) + srq->rq.max_sge * sizeof(struct ib_sge); size = attr->max_wr + 1; - wq = udata ? - vmalloc_user(sizeof(struct rvt_rwq) + size * sz) : - vzalloc_node(sizeof(struct rvt_rwq) + size * sz, - dev->dparms.node); - if (!wq) + if (rvt_alloc_rq(&tmp_rq, size * sz, dev->dparms.node, + udata)) return -ENOMEM; - /* Check that we can write the offset to mmap. 
*/ if (udata && udata->inlen >= sizeof(__u64)) { __u64 offset_addr; @@ -218,9 +211,15 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, * validate head and tail pointer values and compute * the number of remaining WQEs. */ - owq = srq->rq.wq; - head = owq->head; - tail = owq->tail; + if (udata) { + owq = srq->rq.wq; + head = RDMA_READ_UAPI_ATOMIC(owq->head); + tail = RDMA_READ_UAPI_ATOMIC(owq->tail); + } else { + okwq = srq->rq.kwq; + head = okwq->head; + tail = okwq->tail; + } if (head >= srq->rq.size || tail >= srq->rq.size) { ret = -EINVAL; goto bail_unlock; @@ -235,7 +234,7 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, goto bail_unlock; } n = 0; - p = wq->wq; + p = tmp_rq.kwq->curr_wq; while (tail != head) { struct rvt_rwqe *wqe; int i; @@ -250,22 +249,29 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, if (++tail >= srq->rq.size) tail = 0; } - srq->rq.wq = wq; + srq->rq.kwq = tmp_rq.kwq; + if (udata) { + srq->rq.wq = tmp_rq.wq; + RDMA_WRITE_UAPI_ATOMIC(tmp_rq.wq->head, n); + RDMA_WRITE_UAPI_ATOMIC(tmp_rq.wq->tail, 0); + } else { + tmp_rq.kwq->head = n; + tmp_rq.kwq->tail = 0; + } srq->rq.size = size; - wq->head = n; - wq->tail = 0; if (attr_mask & IB_SRQ_LIMIT) srq->limit = attr->srq_limit; spin_unlock_irq(&srq->rq.lock); vfree(owq); + kvfree(okwq); if (srq->ip) { struct rvt_mmap_info *ip = srq->ip; struct rvt_dev_info *dev = ib_to_rvt(srq->ibsrq.device); u32 s = sizeof(struct rvt_rwq) + size * sz; - rvt_update_mmap_info(dev, ip, s, wq); + rvt_update_mmap_info(dev, ip, s, tmp_rq.wq); /* * Return the offset to mmap. @@ -301,7 +307,7 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, bail_unlock: spin_unlock_irq(&srq->rq.lock); bail_free: - vfree(wq); + rvt_free_rq(&tmp_rq); return ret; } @@ -336,6 +342,5 @@ void rvt_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) spin_unlock(&dev->n_srqs_lock); if (srq->ip) kref_put(&srq->ip->ref, rvt_release_mmap_info); - else - vfree(srq->rq.wq); + kvfree(srq->rq.kwq); } diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 7fcd687af278..ee55fd04f6da 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -52,6 +52,7 @@ #include #include #include +#include /* * Atomic bit definitions for r_aflags. */ @@ -177,33 +178,27 @@ struct rvt_swqe { struct rvt_sge sg_list[0]; }; -/* - * Receive work request queue entry. - * The size of the sg_list is determined when the QP (or SRQ) is created - * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). +/** + * struct rvt_krwq - kernel struct receive work request + * @head: index of next entry to fill + * @tail: index of next entry to pull + * @count: count is aproximate of total receive enteries posted + * @rvt_rwqe: struct of receive work request queue entry + * + * This structure is used to contain the head pointer, + * tail pointer and receive work queue entries for kernel + * mode user. */ -struct rvt_rwqe { - u64 wr_id; - u8 num_sge; - struct ib_sge sg_list[0]; -}; - -/* - * This structure is used to contain the head pointer, tail pointer, - * and receive work queue entries as a single memory allocation so - * it can be mmap'ed into user space. - * Note that the wq array elements are variable size so you can't - * just index into the array to get the N'th element; - * use get_rwqe_ptr() instead. - */ -struct rvt_rwq { +struct rvt_krwq { u32 head; /* new work requests posted to the head */ u32 tail; /* receives pull requests from here. 
*/ - struct rvt_rwqe wq[0]; + struct rvt_rwqe *curr_wq; + struct rvt_rwqe wq[]; }; struct rvt_rq { struct rvt_rwq *wq; + struct rvt_krwq *kwq; u32 size; /* size of RWQE array */ u8 max_sge; /* protect changes in this struct */ @@ -472,7 +467,7 @@ static inline struct rvt_swqe *rvt_get_swqe_ptr(struct rvt_qp *qp, static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n) { return (struct rvt_rwqe *) - ((char *)rq->wq->wq + + ((char *)rq->kwq->curr_wq + (sizeof(struct rvt_rwqe) + rq->max_sge * sizeof(struct ib_sge)) * n); } @@ -852,6 +847,21 @@ static inline u32 ib_cq_head(struct ib_cq *send_cq) ibcq_to_rvtcq(send_cq)->kqueue->head; } +/** + * rvt_free_rq - free memory allocated for rvt_rq struct + * @rvt_rq: request queue data structure + * + * This function should only be called if the rvt_mmap_info() + * has not succeeded. + */ +static inline void rvt_free_rq(struct rvt_rq *rq) +{ + kvfree(rq->kwq); + rq->kwq = NULL; + vfree(rq->wq); + rq->wq = NULL; +} + struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi, u64 v, void (*cb)(struct rvt_qp *qp, u64 v)); diff --git a/include/uapi/rdma/rvt-abi.h b/include/uapi/rdma/rvt-abi.h index 8e5f7e0c15fe..d2e35d24f1a9 100644 --- a/include/uapi/rdma/rvt-abi.h +++ b/include/uapi/rdma/rvt-abi.h @@ -10,6 +10,7 @@ #include #include +#include #ifndef RDMA_ATOMIC_UAPI #define RDMA_ATOMIC_UAPI(_type, _name) struct{ _type val; } _name #endif @@ -29,4 +30,32 @@ struct rvt_cq_wc { struct ib_uverbs_wc uqueue[]; }; +/* + * Receive work request queue entry. + * The size of the sg_list is determined when the QP (or SRQ) is created + * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). + */ +struct rvt_rwqe { + __u64 wr_id; + __u8 num_sge; + __u8 padding[7]; + struct ib_sge sg_list[]; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and receive work queue entries as a single memory allocation so + * it can be mmap'ed into user space. + * Note that the wq array elements are variable size so you can't + * just index into the array to get the N'th element; + * use get_rwqe_ptr() for user space and rvt_get_rwqe_ptr() + * for kernel space. + */ +struct rvt_rwq { + /* new work requests posted to the head */ + RDMA_ATOMIC_UAPI(__u32, head); + /* receives pull requests from here. */ + RDMA_ATOMIC_UAPI(__u32, tail); + struct rvt_rwqe wq[]; +}; #endif /* RVT_ABI_USER_H */ -- cgit v1.2.3 From f592ae3c999fbe4faeeb90dfde8ff7da49ee4ae6 Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Fri, 28 Jun 2019 14:04:30 -0400 Subject: IB/rdmavt: Fracture single lock used for posting and processing RWQEs Usage of single lock prevents fetching posted and processing receive work queue entries from progressing simultaneously and impacts overall performance. Fracture the single lock used for posting and processing Receive Work Queue Entries (RWQEs) to allow the circular buffer to be filled and emptied at the same time. Two new spinlocks - one for the producers and one for the consumers used for posting and processing RWQEs simultaneously and the two indices are define on two different cache lines. The threshold count is used to avoid reading other index in different cache line every time. 
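The layout described above can be pictured with a minimal, hypothetical sketch (this is not the rdmavt code itself; struct demo_ring, demo_post() and demo_consume() are invented names, and the constant 16 only mirrors the RVT_RWQ_COUNT_THRESHOLD value introduced by this patch): the producer lock and head index sit on one cache line, the consumer lock, tail index and cached count sit on another, and the consumer re-reads the producer's cache line only when its cached count drops below the threshold.

#include <linux/spinlock.h>
#include <linux/compiler.h>
#include <linux/cache.h>
#include <linux/types.h>
#include <linux/errno.h>

struct demo_ring {
	spinlock_t p_lock;	/* serializes producers (post side) */
	u32 head;		/* next slot to fill */

	/* consumer state kept on its own cache line */
	spinlock_t c_lock ____cacheline_aligned_in_smp;
	u32 tail;		/* next slot to drain */
	u32 count;		/* consumer's cached estimate of posted entries */
	u32 size;		/* number of slots in the ring */
};

/* producer: touches only p_lock/head in the common case */
static int demo_post(struct demo_ring *r)
{
	unsigned long flags;
	u32 next;

	spin_lock_irqsave(&r->p_lock, flags);
	next = r->head + 1;
	if (next >= r->size)
		next = 0;
	if (next == READ_ONCE(r->tail)) {	/* ring full: rare cross-line read */
		spin_unlock_irqrestore(&r->p_lock, flags);
		return -ENOMEM;
	}
	/* ... fill the entry at index r->head here ... */
	smp_store_release(&r->head, next);	/* order entry writes before index update */
	spin_unlock_irqrestore(&r->p_lock, flags);
	return 0;
}

/* consumer: touches only c_lock/tail, refreshing count when it runs low */
static int demo_consume(struct demo_ring *r)
{
	unsigned long flags;
	u32 head;

	spin_lock_irqsave(&r->c_lock, flags);
	if (r->count < 16) {			/* cf. RVT_RWQ_COUNT_THRESHOLD */
		head = smp_load_acquire(&r->head);
		r->count = (head >= r->tail) ? head - r->tail
					     : r->size - r->tail + head;
	}
	if (!r->count) {
		spin_unlock_irqrestore(&r->c_lock, flags);
		return -EAGAIN;
	}
	/* ... process the entry at index r->tail here ... */
	if (++r->tail >= r->size)
		r->tail = 0;
	r->count--;
	spin_unlock_irqrestore(&r->c_lock, flags);
	return 0;
}

In the common case demo_consume() never reads or dirties the producer's cache line, so posting and draining can proceed on different CPUs without bouncing a shared line between them, which is the performance effect the commit message describes.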
Signed-off-by: Harish Chegondi Signed-off-by: Kamenee Arumugam Reviewed-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/qp.c | 97 ++++++++++++++++++++++++-------------- drivers/infiniband/sw/rdmavt/rc.c | 43 +++++++++-------- drivers/infiniband/sw/rdmavt/srq.c | 10 ++-- include/rdma/rdmavt_qp.h | 7 +++ 4 files changed, 97 insertions(+), 60 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 1384060f175d..200b292be63e 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -58,6 +58,8 @@ #include "vt.h" #include "trace.h" +#define RVT_RWQ_COUNT_THRESHOLD 16 + static void rvt_rc_timeout(struct timer_list *t); /* @@ -835,7 +837,8 @@ int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node, rq->kwq->curr_wq = rq->kwq->wq; } - spin_lock_init(&rq->lock); + spin_lock_init(&rq->kwq->p_lock); + spin_lock_init(&rq->kwq->c_lock); return 0; bail: rvt_free_rq(rq); @@ -892,6 +895,8 @@ static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, qp->s_tail_ack_queue = 0; qp->s_acked_ack_queue = 0; qp->s_num_rd_atomic = 0; + if (qp->r_rq.kwq) + qp->r_rq.kwq->count = qp->r_rq.size; qp->r_sge.num_sge = 0; atomic_set(&qp->s_reserved_used, 0); } @@ -1097,7 +1102,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, spin_lock_init(&qp->r_lock); spin_lock_init(&qp->s_hlock); spin_lock_init(&qp->s_lock); - spin_lock_init(&qp->r_rq.lock); atomic_set(&qp->refcount, 0); atomic_set(&qp->local_ops_pending, 0); init_waitqueue_head(&qp->wait); @@ -1305,7 +1309,7 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) struct rvt_rwq *wq = NULL; struct rvt_krwq *kwq = NULL; - spin_lock(&qp->r_rq.lock); + spin_lock(&qp->r_rq.kwq->c_lock); /* qp->ip used to validate if there is a user buffer mmaped */ if (qp->ip) { wq = qp->r_rq.wq; @@ -1331,7 +1335,7 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail); else kwq->tail = tail; - spin_unlock(&qp->r_rq.lock); + spin_unlock(&qp->r_rq.kwq->c_lock); } else if (qp->ibqp.event_handler) { ret = 1; } @@ -1780,12 +1784,12 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, return -EINVAL; } - spin_lock_irqsave(&qp->r_rq.lock, flags); + spin_lock_irqsave(&qp->r_rq.kwq->p_lock, flags); next = wq->head + 1; if (next >= qp->r_rq.size) next = 0; if (next == READ_ONCE(wq->tail)) { - spin_unlock_irqrestore(&qp->r_rq.lock, flags); + spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags); *bad_wr = wr; return -ENOMEM; } @@ -1810,7 +1814,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, */ smp_store_release(&wq->head, next); } - spin_unlock_irqrestore(&qp->r_rq.lock, flags); + spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags); } return 0; } @@ -2191,13 +2195,13 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, return -EINVAL; } - spin_lock_irqsave(&srq->rq.lock, flags); + spin_lock_irqsave(&srq->rq.kwq->p_lock, flags); wq = srq->rq.kwq; next = wq->head + 1; if (next >= srq->rq.size) next = 0; if (next == READ_ONCE(wq->tail)) { - spin_unlock_irqrestore(&srq->rq.lock, flags); + spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags); *bad_wr = wr; return -ENOMEM; } @@ -2209,7 +2213,7 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, wqe->sg_list[i] = wr->sg_list[i]; /* Make sure queue entry is written before the head index. 
*/ smp_store_release(&wq->head, next); - spin_unlock_irqrestore(&srq->rq.lock, flags); + spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags); } return 0; } @@ -2265,6 +2269,31 @@ bad_lkey: return 0; } +/** + * get_count - count numbers of request work queue entries + * in circular buffer + * @rq: data structure for request queue entry + * @tail: tail indices of the circular buffer + * @head: head indices of the circular buffer + * + * Return - total number of entries in the circular buffer + */ +static u32 get_count(struct rvt_rq *rq, u32 tail, u32 head) +{ + u32 count; + + count = head; + + if (count >= rq->size) + count = 0; + if (count < tail) + count += rq->size - tail; + else + count -= tail; + + return count; +} + /** * get_rvt_head - get head indices of the circular buffer * @rq: data structure for request queue entry @@ -2298,7 +2327,7 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) { unsigned long flags; struct rvt_rq *rq; - struct rvt_krwq *kwq; + struct rvt_krwq *kwq = NULL; struct rvt_rwq *wq; struct rvt_srq *srq; struct rvt_rwqe *wqe; @@ -2320,16 +2349,16 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) ip = qp->ip; } - spin_lock_irqsave(&rq->lock, flags); + spin_lock_irqsave(&rq->kwq->c_lock, flags); if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { ret = 0; goto unlock; } + kwq = rq->kwq; if (ip) { wq = rq->wq; tail = RDMA_READ_UAPI_ATOMIC(wq->tail); } else { - kwq = rq->kwq; tail = kwq->tail; } @@ -2337,8 +2366,11 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) if (tail >= rq->size) tail = 0; - head = get_rvt_head(rq, ip); - if (unlikely(tail == head)) { + if (kwq->count < RVT_RWQ_COUNT_THRESHOLD) { + head = get_rvt_head(rq, ip); + kwq->count = get_count(rq, tail, head); + } + if (unlikely(kwq->count == 0)) { ret = 0; goto unlock; } @@ -2362,36 +2394,31 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) } qp->r_wr_id = wqe->wr_id; + kwq->count--; ret = 1; set_bit(RVT_R_WRID_VALID, &qp->r_aflags); if (handler) { - u32 n; - /* * Validate head pointer value and compute * the number of remaining WQEs. 
*/ - n = get_rvt_head(rq, ip); - if (n >= rq->size) - n = 0; - if (n < tail) - n += rq->size - tail; - else - n -= tail; - if (n < srq->limit) { - struct ib_event ev; - - srq->limit = 0; - spin_unlock_irqrestore(&rq->lock, flags); - ev.device = qp->ibqp.device; - ev.element.srq = qp->ibqp.srq; - ev.event = IB_EVENT_SRQ_LIMIT_REACHED; - handler(&ev, srq->ibsrq.srq_context); - goto bail; + if (kwq->count < srq->limit) { + kwq->count = get_count(rq, tail, get_rvt_head(rq, ip)); + if (kwq->count < srq->limit) { + struct ib_event ev; + + srq->limit = 0; + spin_unlock_irqrestore(&rq->kwq->c_lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + handler(&ev, srq->ibsrq.srq_context); + goto bail; + } } } unlock: - spin_unlock_irqrestore(&rq->lock, flags); + spin_unlock_irqrestore(&rq->kwq->c_lock, flags); bail: return ret; } diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c index 44cc7ee1b321..890d7b760d2e 100644 --- a/drivers/infiniband/sw/rdmavt/rc.c +++ b/drivers/infiniband/sw/rdmavt/rc.c @@ -107,27 +107,30 @@ __be32 rvt_compute_aeth(struct rvt_qp *qp) u32 head; u32 tail; - /* sanity check pointers before trusting them */ - if (qp->ip) { - head = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->head); - tail = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->tail); - } else { - head = READ_ONCE(qp->r_rq.kwq->head); - tail = READ_ONCE(qp->r_rq.kwq->tail); + credits = READ_ONCE(qp->r_rq.kwq->count); + if (credits == 0) { + /* sanity check pointers before trusting them */ + if (qp->ip) { + head = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->head); + tail = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->tail); + } else { + head = READ_ONCE(qp->r_rq.kwq->head); + tail = READ_ONCE(qp->r_rq.kwq->tail); + } + if (head >= qp->r_rq.size) + head = 0; + if (tail >= qp->r_rq.size) + tail = 0; + /* + * Compute the number of credits available (RWQEs). + * There is a small chance that the pair of reads are + * not atomic, which is OK, since the fuzziness is + * resolved as further ACKs go out. + */ + credits = head - tail; + if ((int)credits < 0) + credits += qp->r_rq.size; } - if (head >= qp->r_rq.size) - head = 0; - if (tail >= qp->r_rq.size) - tail = 0; - /* - * Compute the number of credits available (RWQEs). - * There is a small chance that the pair of reads are - * not atomic, which is OK, since the fuzziness is - * resolved as further ACKs go out. - */ - credits = head - tail; - if ((int)credits < 0) - credits += qp->r_rq.size; /* * Binary search the credit table to find the code to * use. diff --git a/drivers/infiniband/sw/rdmavt/srq.c b/drivers/infiniband/sw/rdmavt/srq.c index d306f6547cba..24fef021d51d 100644 --- a/drivers/infiniband/sw/rdmavt/srq.c +++ b/drivers/infiniband/sw/rdmavt/srq.c @@ -206,7 +206,7 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, goto bail_free; } - spin_lock_irq(&srq->rq.lock); + spin_lock_irq(&srq->rq.kwq->c_lock); /* * validate head and tail pointer values and compute * the number of remaining WQEs. 
@@ -261,7 +261,7 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, srq->rq.size = size; if (attr_mask & IB_SRQ_LIMIT) srq->limit = attr->srq_limit; - spin_unlock_irq(&srq->rq.lock); + spin_unlock_irq(&srq->rq.kwq->c_lock); vfree(owq); kvfree(okwq); @@ -295,17 +295,17 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, spin_unlock_irq(&dev->pending_lock); } } else if (attr_mask & IB_SRQ_LIMIT) { - spin_lock_irq(&srq->rq.lock); + spin_lock_irq(&srq->rq.kwq->c_lock); if (attr->srq_limit >= srq->rq.size) ret = -EINVAL; else srq->limit = attr->srq_limit; - spin_unlock_irq(&srq->rq.lock); + spin_unlock_irq(&srq->rq.kwq->c_lock); } return ret; bail_unlock: - spin_unlock_irq(&srq->rq.lock); + spin_unlock_irq(&srq->rq.kwq->c_lock); bail_free: rvt_free_rq(&tmp_rq); return ret; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index ee55fd04f6da..de5915b244be 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -180,7 +180,9 @@ struct rvt_swqe { /** * struct rvt_krwq - kernel struct receive work request + * @p_lock: lock to protect producer of the kernel buffer * @head: index of next entry to fill + * @c_lock:lock to protect consumer of the kernel buffer * @tail: index of next entry to pull * @count: count is aproximate of total receive enteries posted * @rvt_rwqe: struct of receive work request queue entry @@ -190,8 +192,13 @@ struct rvt_swqe { * mode user. */ struct rvt_krwq { + spinlock_t p_lock; /* protect producer */ u32 head; /* new work requests posted to the head */ + + /* protect consumer */ + spinlock_t c_lock ____cacheline_aligned_in_smp; u32 tail; /* receives pull requests from here. */ + u32 count; /* approx count of receive entries posted */ struct rvt_rwqe *curr_wq; struct rvt_rwqe wq[]; }; -- cgit v1.2.3 From 5136bfea7e79b333af77594fac5bc70282a95313 Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Fri, 28 Jun 2019 14:21:52 -0400 Subject: IB/{hfi1, qib, rdmavt}: Put qp in error state when cq is full When a completion queue is full, the associated queue pairs are not put into the error state. According to the IBTA specification, this is a violation. Quote from IBTA spec: C9-218: A Requester Class F error occurs when the CQ is inaccessible or full and an attempt is made to complete a WQE. The Affected QP shall be moved to the error state and affiliated asynchronous errors generated as described in 11.6.3.1 Affiliated Asynchronous Events on page 678. The current WQE and any subsequent WQEs are left in an unknown state. C11-37: The CI shall generate a CQ Error when a CQ overrun is detected. This condition will result in an Affiliated Asynchronous Error for any associated Work Queues when they attempt to use that CQ. Completions can no longer be added to the CQ. It is not guaranteed that completions present in the CQ at the time the error occurred can be retrieved. Possible causes include a CQ overrun or a CQ protection error. Put the qp in error state when cq is full. Implement a state called full to continue to put other associated QPs in error state. Reviewed-by: Mike Marciniszyn Reviewed-by: Michael J. 
Ruhl Signed-off-by: Kamenee Arumugam Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/rc.c | 3 +-- drivers/infiniband/hw/hfi1/uc.c | 3 +-- drivers/infiniband/hw/hfi1/ud.c | 5 ++-- drivers/infiniband/hw/qib/qib_rc.c | 3 +-- drivers/infiniband/hw/qib/qib_uc.c | 3 +-- drivers/infiniband/hw/qib/qib_ud.c | 6 ++--- drivers/infiniband/sw/rdmavt/cq.c | 15 +++++++++--- drivers/infiniband/sw/rdmavt/qp.c | 3 +-- drivers/infiniband/sw/rdmavt/vt.h | 9 ++++++++ include/rdma/rdmavt_cq.h | 3 ++- include/rdma/rdmavt_qp.h | 47 ++++++++++++++++++++++++++++++++++---- 11 files changed, 75 insertions(+), 25 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 235bdbc706ac..0477c14633ab 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -3008,8 +3008,7 @@ send_last: wc.dlid_path_bits = 0; wc.port_num = 0; /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - ib_bth_is_solicited(ohdr)); + rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr)); break; case OP(RDMA_WRITE_ONLY): diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index 4ed4fcfabd6c..0c77f18120ed 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -476,8 +476,7 @@ last_imm: wc.dlid_path_bits = 0; wc.port_num = 0; /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - ib_bth_is_solicited(ohdr)); + rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr)); break; case OP(RDMA_WRITE_FIRST): diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 4cb0fce5c096..e16d499cfd1e 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -255,8 +255,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) wc.dlid_path_bits = rdma_ah_get_dlid(ah_attr) & ((1 << ppd->lmc) - 1); wc.port_num = qp->port_num; /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - swqe->wr.send_flags & IB_SEND_SOLICITED); + rvt_recv_cq(qp, &wc, swqe->wr.send_flags & IB_SEND_SOLICITED); ibp->rvp.n_loop_pkts++; bail_unlock: spin_unlock_irqrestore(&qp->r_lock, flags); @@ -1061,7 +1060,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1); wc.port_num = qp->port_num; /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, solicited); + rvt_recv_cq(qp, &wc, solicited); return; drop: diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 8d9a94d6f685..1d5e2d4ee257 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -1891,8 +1891,7 @@ send_last: wc.dlid_path_bits = 0; wc.port_num = 0; /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - ib_bth_is_solicited(ohdr)); + rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr)); break; case OP(RDMA_WRITE_FIRST): diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 30c70ad0f4bf..e17b91e2c22a 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -400,8 +400,7 @@ last_imm: wc.dlid_path_bits = 0; wc.port_num = 0; /* Signal completion event if the solicited bit is set. 
*/ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - ib_bth_is_solicited(ohdr)); + rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr)); break; case OP(RDMA_WRITE_FIRST): diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index 5cdedba2d164..32ad0b635fc6 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -210,8 +210,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) wc.dlid_path_bits = rdma_ah_get_dlid(ah_attr) & ((1 << ppd->lmc) - 1); wc.port_num = qp->port_num; /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - swqe->wr.send_flags & IB_SEND_SOLICITED); + rvt_recv_cq(qp, &wc, swqe->wr.send_flags & IB_SEND_SOLICITED); ibp->rvp.n_loop_pkts++; bail_unlock: spin_unlock_irqrestore(&qp->r_lock, flags); @@ -573,8 +572,7 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr, dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1); wc.port_num = qp->port_num; /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - ib_bth_is_solicited(ohdr)); + rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr)); return; drop: diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 2602ad8b8cb0..fac87b13329d 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -60,8 +60,11 @@ static struct workqueue_struct *comp_vector_wq; * @solicited: true if @entry is solicited * * This may be called with qp->s_lock held. + * + * Return: return true on success, else return + * false if cq is full. */ -void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) +bool rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) { struct ib_uverbs_wc *uqueue = NULL; struct ib_wc *kqueue = NULL; @@ -97,7 +100,12 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) next = head + 1; } - if (unlikely(next == tail)) { + if (unlikely(next == tail || cq->cq_full)) { + struct rvt_dev_info *rdi = cq->rdi; + + if (!cq->cq_full) + rvt_pr_err_ratelimited(rdi, "CQ is full!\n"); + cq->cq_full = true; spin_unlock_irqrestore(&cq->lock, flags); if (cq->ibcq.event_handler) { struct ib_event ev; @@ -107,7 +115,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) ev.event = IB_EVENT_CQ_ERR; cq->ibcq.event_handler(&ev, cq->ibcq.cq_context); } - return; + return false; } trace_rvt_cq_enter(cq, entry, head); if (uqueue) { @@ -146,6 +154,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) } spin_unlock_irqrestore(&cq->lock, flags); + return true; } EXPORT_SYMBOL(rvt_cq_enter); diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 200b292be63e..17e192a2c8b6 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -3103,8 +3103,7 @@ do_write: wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); wc.port_num = 1; /* Signal completion event if the solicited bit is set. 
*/ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - wqe->wr.send_flags & IB_SEND_SOLICITED); + rvt_recv_cq(qp, &wc, wqe->wr.send_flags & IB_SEND_SOLICITED); send_comp: spin_unlock_irqrestore(&qp->r_lock, flags); diff --git a/drivers/infiniband/sw/rdmavt/vt.h b/drivers/infiniband/sw/rdmavt/vt.h index 0675ea6c3872..d19ff817c2c7 100644 --- a/drivers/infiniband/sw/rdmavt/vt.h +++ b/drivers/infiniband/sw/rdmavt/vt.h @@ -78,6 +78,12 @@ fmt, \ ##__VA_ARGS__) +#define rvt_pr_err_ratelimited(rdi, fmt, ...) \ + __rvt_pr_err_ratelimited((rdi)->driver_f.get_pci_dev(rdi), \ + rvt_get_ibdev_name(rdi), \ + fmt, \ + ##__VA_ARGS__) + #define __rvt_pr_info(pdev, name, fmt, ...) \ dev_info(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__) @@ -87,6 +93,9 @@ #define __rvt_pr_err(pdev, name, fmt, ...) \ dev_err(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__) +#define __rvt_pr_err_ratelimited(pdev, name, fmt, ...) \ + dev_err_ratelimited(&(pdev)->dev, "%s: " fmt, name, ##__VA_ARGS__) + static inline int ibport_num_to_idx(struct ib_device *ibdev, u8 port_num) { struct rvt_dev_info *rdi = ib_to_rvt(ibdev); diff --git a/include/rdma/rdmavt_cq.h b/include/rdma/rdmavt_cq.h index ab22860a63e2..04c519ef6d71 100644 --- a/include/rdma/rdmavt_cq.h +++ b/include/rdma/rdmavt_cq.h @@ -93,6 +93,7 @@ struct rvt_cq { spinlock_t lock; /* protect changes in this struct */ u8 notify; u8 triggered; + u8 cq_full; int comp_vector_cpu; struct rvt_dev_info *rdi; struct rvt_cq_wc *queue; @@ -105,6 +106,6 @@ static inline struct rvt_cq *ibcq_to_rvtcq(struct ib_cq *ibcq) return container_of(ibcq, struct rvt_cq, ibcq); } -void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited); +bool rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited); #endif /* DEF_RDMAVT_INCCQH */ diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index de5915b244be..e4be869c4f21 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -718,6 +718,48 @@ rvt_qp_swqe_incr(struct rvt_qp *qp, u32 val) return val; } +int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err); + +/** + * rvt_recv_cq - add a new entry to completion queue + * by receive queue + * @qp: receive queue + * @wc: work completion entry to add + * @solicited: true if @entry is solicited + * + * This is wrapper function for rvt_enter_cq function call by + * receive queue. If rvt_cq_enter return false, it means cq is + * full and the qp is put into error state. + */ +static inline void rvt_recv_cq(struct rvt_qp *qp, struct ib_wc *wc, + bool solicited) +{ + struct rvt_cq *cq = ibcq_to_rvtcq(qp->ibqp.recv_cq); + + if (unlikely(!rvt_cq_enter(cq, wc, solicited))) + rvt_error_qp(qp, IB_WC_LOC_QP_OP_ERR); +} + +/** + * rvt_send_cq - add a new entry to completion queue + * by send queue + * @qp: send queue + * @wc: work completion entry to add + * @solicited: true if @entry is solicited + * + * This is wrapper function for rvt_enter_cq function call by + * send queue. If rvt_cq_enter return false, it means cq is + * full and the qp is put into error state. 
+ */ +static inline void rvt_send_cq(struct rvt_qp *qp, struct ib_wc *wc, + bool solicited) +{ + struct rvt_cq *cq = ibcq_to_rvtcq(qp->ibqp.send_cq); + + if (unlikely(!rvt_cq_enter(cq, wc, solicited))) + rvt_error_qp(qp, IB_WC_LOC_QP_OP_ERR); +} + /** * rvt_qp_complete_swqe - insert send completion * @qp - the qp @@ -768,9 +810,7 @@ rvt_qp_complete_swqe(struct rvt_qp *qp, .qp = &qp->ibqp, .byte_len = byte_len, }; - - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &w, - status != IB_WC_SUCCESS); + rvt_send_cq(qp, &w, status != IB_WC_SUCCESS); } return last; } @@ -780,7 +820,6 @@ extern const int ib_rvt_state_ops[]; struct rvt_dev_info; int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only); void rvt_comm_est(struct rvt_qp *qp); -int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err); void rvt_rc_error(struct rvt_qp *qp, enum ib_wc_status err); unsigned long rvt_rnr_tbl_to_usec(u32 index); enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t); -- cgit v1.2.3 From fe2ac04712cdc6e93d32e9c82c73bfb225554309 Mon Sep 17 00:00:00 2001 From: Michael J. Ruhl Date: Fri, 28 Jun 2019 14:21:58 -0400 Subject: IB/rdmavt: Set QP allowed opcodes after QP allocation Currently QP allowed_ops is set after the QP is completely initialized. This curtails the use of this optimization for any initialization before allowed_ops is set. Fix by adding a helper to determine the correct allowed_ops and moving the setting of the allowed_ops to just after QP allocation. Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/qp.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 17e192a2c8b6..b9035d969057 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 - 2018 Intel Corporation. + * Copyright(c) 2016 - 2019 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -968,6 +968,16 @@ static void rvt_free_qpn(struct rvt_qpn_table *qpt, u32 qpn) clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page); } +/** + * get_allowed_ops - Given a QP type return the appropriate allowed OP + * @type: valid, supported, QP type + */ +static u8 get_allowed_ops(enum ib_qp_type type) +{ + return type == IB_QPT_RC ? IB_OPCODE_RC : type == IB_QPT_UC ? + IB_OPCODE_UC : IB_OPCODE_UD; +} + /** * rvt_create_qp - create a queue pair for a device * @ibpd: the protection domain who's device we create the queue pair for @@ -1050,6 +1060,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, rdi->dparms.node); if (!qp) goto bail_swq; + qp->allowed_ops = get_allowed_ops(init_attr->qp_type); RCU_INIT_POINTER(qp->next, NULL); if (init_attr->qp_type == IB_QPT_RC) { @@ -1205,28 +1216,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, ret = &qp->ibqp; - /* - * We have our QP and its good, now keep track of what types of opcodes - * can be processed on this QP. We do this by keeping track of what the - * 3 high order bits of the opcode are. 
- */ - switch (init_attr->qp_type) { - case IB_QPT_SMI: - case IB_QPT_GSI: - case IB_QPT_UD: - qp->allowed_ops = IB_OPCODE_UD; - break; - case IB_QPT_RC: - qp->allowed_ops = IB_OPCODE_RC; - break; - case IB_QPT_UC: - qp->allowed_ops = IB_OPCODE_UC; - break; - default: - ret = ERR_PTR(-EINVAL); - goto bail_ip; - } - return ret; bail_ip: -- cgit v1.2.3 From d310c4bf8aeacc0256091feb6a0337b8fef763ac Mon Sep 17 00:00:00 2001 From: Michael J. Ruhl Date: Fri, 28 Jun 2019 14:22:04 -0400 Subject: IB/{rdmavt, hfi1, qib}: Remove AH refcount for UD QPs Historically rdmavt destroy_ah() has returned an -EBUSY when the AH has a non-zero reference count. IBTA 11.2.2 notes no such return value or error case: Output Modifiers: - Verb results: - Operation completed successfully. - Invalid HCA handle. - Invalid address handle. ULPs never test for this error and this will leak memory. The reference count exists to allow for driver independent progress mechanisms to process UD SWQEs in parallel with post sends. The SWQE will hold a reference count until the UD SWQE completes and then drops the reference. Fix by removing need to reference count the AH. Add a UD specific allocation to each SWQE entry to cache the necessary information for independent progress. Copy the information during the post send processing. Reviewed-by: Mike Marciniszyn Signed-off-by: Mike Marciniszyn Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/qp.c | 4 +-- drivers/infiniband/hw/hfi1/ud.c | 30 ++++++++++---------- drivers/infiniband/hw/qib/qib_qp.c | 4 +-- drivers/infiniband/hw/qib/qib_ud.c | 21 +++++++------- drivers/infiniband/sw/rdmavt/ah.c | 6 +--- drivers/infiniband/sw/rdmavt/qp.c | 58 ++++++++++++++++++++++++++++++++++++-- include/rdma/rdma_vt.h | 3 +- include/rdma/rdmavt_qp.h | 22 +++++++++++++-- 8 files changed, 106 insertions(+), 42 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 41261e72c429..a84b44af7b97 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2018 Intel Corporation. + * Copyright(c) 2015 - 2019 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -348,7 +348,7 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send) break; case IB_QPT_GSI: case IB_QPT_UD: - ah = ibah_to_rvtah(wqe->ud_wr.ah); + ah = ibah_to_rvtah(wqe->ud_wr.wr.ah); if (wqe->length > (1 << ah->log_pmtu)) return -EINVAL; if (ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)] == 0xf) diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index e16d499cfd1e..f8e796e45517 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2018 Intel Corporation. + * Copyright(c) 2015 - 2019 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -87,7 +87,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) rcu_read_lock(); qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp, - swqe->ud_wr.remote_qpn); + swqe->ud_wr.wr.remote_qpn); if (!qp) { ibp->rvp.n_pkt_drops++; rcu_read_unlock(); @@ -105,7 +105,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) goto drop; } - ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr; + ah_attr = swqe->ud_wr.attr; ppd = ppd_from_ibp(ibp); if (qp->ibqp.qp_num > 1) { @@ -135,8 +135,8 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (qp->ibqp.qp_num) { u32 qkey; - qkey = (int)swqe->ud_wr.remote_qkey < 0 ? - sqp->qkey : swqe->ud_wr.remote_qkey; + qkey = (int)swqe->ud_wr.wr.remote_qkey < 0 ? + sqp->qkey : swqe->ud_wr.wr.remote_qkey; if (unlikely(qkey != qp->qkey)) goto drop; /* silently drop per IBTA spec */ } @@ -240,7 +240,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) { if (sqp->ibqp.qp_type == IB_QPT_GSI || sqp->ibqp.qp_type == IB_QPT_SMI) - wc.pkey_index = swqe->ud_wr.pkey_index; + wc.pkey_index = swqe->ud_wr.wr.pkey_index; else wc.pkey_index = sqp->s_pkey_index; } else { @@ -282,20 +282,20 @@ static void hfi1_make_bth_deth(struct rvt_qp *qp, struct rvt_swqe *wqe, bth0 |= IB_BTH_SOLICITED; bth0 |= extra_bytes << 20; if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) - *pkey = hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index); + *pkey = hfi1_get_pkey(ibp, wqe->ud_wr.wr.pkey_index); else *pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); if (!bypass) bth0 |= *pkey; ohdr->bth[0] = cpu_to_be32(bth0); - ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn); + ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.wr.remote_qpn); ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn)); /* * Qkeys with the high order bit set mean use the * qkey from the QP context instead of the WR (see 10.2.5). */ - ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ? - qp->qkey : wqe->ud_wr.remote_qkey); + ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.wr.remote_qkey < 0 ? 
+ qp->qkey : wqe->ud_wr.wr.remote_qkey); ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); } @@ -315,7 +315,7 @@ void hfi1_make_ud_req_9B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); - ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + ah_attr = wqe->ud_wr.attr; extra_bytes = -wqe->length & 3; nwords = ((wqe->length + extra_bytes) >> 2) + SIZE_OF_CRC; @@ -379,7 +379,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, struct hfi1_pportdata *ppd; struct hfi1_ibport *ibp; u32 dlid, slid, nwords, extra_bytes; - u32 dest_qp = wqe->ud_wr.remote_qpn; + u32 dest_qp = wqe->ud_wr.wr.remote_qpn; u32 src_qp = qp->ibqp.qp_num; u16 len, pkey; u8 l4, sc5; @@ -387,7 +387,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); - ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + ah_attr = wqe->ud_wr.attr; /* * Build 16B Management Packet if either the destination @@ -449,7 +449,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, if (is_mgmt) { l4 = OPA_16B_L4_FM; - pkey = hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index); + pkey = hfi1_get_pkey(ibp, wqe->ud_wr.wr.pkey_index); hfi1_16B_set_qpn(&ps->s_txreq->phdr.hdr.opah.u.mgmt, dest_qp, src_qp); } else { @@ -514,7 +514,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) /* Construct the header. */ ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); - ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + ah_attr = wqe->ud_wr.attr; priv->hdr_type = hfi1_get_hdr_type(ppd->lid, ah_attr); if ((!hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) || (rdma_ah_get_dlid(ah_attr) == be32_to_cpu(OPA_LID_PERMISSIVE))) { diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index a81905df2d0f..0e1d0d692891 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 - 2017 Intel Corporation. All rights reserved. + * Copyright (c) 2012 - 2019 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. * All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * @@ -398,7 +398,7 @@ int qib_check_send_wqe(struct rvt_qp *qp, case IB_QPT_SMI: case IB_QPT_GSI: case IB_QPT_UD: - ah = ibah_to_rvtah(wqe->ud_wr.ah); + ah = ibah_to_rvtah(wqe->ud_wr.wr.ah); if (wqe->length > (1 << ah->log_pmtu)) return -EINVAL; /* progress hint */ diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index 32ad0b635fc6..d8c2c968909f 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2012 - 2019 Intel Corporation. All rights reserved. * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. 
* @@ -63,7 +64,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) enum ib_qp_type sqptype, dqptype; rcu_read_lock(); - qp = rvt_lookup_qpn(rdi, &ibp->rvp, swqe->ud_wr.remote_qpn); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, swqe->ud_wr.wr.remote_qpn); if (!qp) { ibp->rvp.n_pkt_drops++; goto drop; @@ -80,7 +81,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) goto drop; } - ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr; + ah_attr = swqe->ud_wr.attr; ppd = ppd_from_ibp(ibp); if (qp->ibqp.qp_num > 1) { @@ -110,8 +111,8 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (qp->ibqp.qp_num) { u32 qkey; - qkey = (int)swqe->ud_wr.remote_qkey < 0 ? - sqp->qkey : swqe->ud_wr.remote_qkey; + qkey = (int)swqe->ud_wr.wr.remote_qkey < 0 ? + sqp->qkey : swqe->ud_wr.wr.remote_qkey; if (unlikely(qkey != qp->qkey)) goto drop; } @@ -203,7 +204,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) wc.qp = &qp->ibqp; wc.src_qp = sqp->ibqp.qp_num; wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ? - swqe->ud_wr.pkey_index : 0; + swqe->ud_wr.wr.pkey_index : 0; wc.slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & ((1 << ppd->lmc) - 1)); wc.sl = rdma_ah_get_sl(ah_attr); @@ -270,7 +271,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) /* Construct the header. */ ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); - ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + ah_attr = wqe->ud_wr.attr; if (rdma_ah_get_dlid(ah_attr) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { if (rdma_ah_get_dlid(ah_attr) != be16_to_cpu(IB_LID_PERMISSIVE)) @@ -362,7 +363,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) bth0 |= extra_bytes << 20; bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? QIB_DEFAULT_P_KEY : qib_get_pkey(ibp, qp->ibqp.qp_type == IB_QPT_GSI ? - wqe->ud_wr.pkey_index : qp->s_pkey_index); + wqe->ud_wr.wr.pkey_index : qp->s_pkey_index); ohdr->bth[0] = cpu_to_be32(bth0); /* * Use the multicast QP if the destination LID is a multicast LID. @@ -371,14 +372,14 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) be16_to_cpu(IB_MULTICAST_LID_BASE) && rdma_ah_get_dlid(ah_attr) != be16_to_cpu(IB_LID_PERMISSIVE) ? cpu_to_be32(QIB_MULTICAST_QPN) : - cpu_to_be32(wqe->ud_wr.remote_qpn); + cpu_to_be32(wqe->ud_wr.wr.remote_qpn); ohdr->bth[2] = cpu_to_be32(wqe->psn & QIB_PSN_MASK); /* * Qkeys with the high order bit set mean use the * qkey from the QP context instead of the WR (see 10.2.5). */ - ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ? - qp->qkey : wqe->ud_wr.remote_qkey); + ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.wr.remote_qkey < 0 ? + qp->qkey : wqe->ud_wr.wr.remote_qkey); ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); done: diff --git a/drivers/infiniband/sw/rdmavt/ah.c b/drivers/infiniband/sw/rdmavt/ah.c index 0e147b32cbe9..fe99da0ff060 100644 --- a/drivers/infiniband/sw/rdmavt/ah.c +++ b/drivers/infiniband/sw/rdmavt/ah.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2019 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -119,8 +119,6 @@ int rvt_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr, rdma_copy_ah_attr(&ah->attr, ah_attr); - atomic_set(&ah->refcount, 0); - if (dev->driver_f.notify_new_ah) dev->driver_f.notify_new_ah(ibah->device, ah_attr, ah); @@ -141,8 +139,6 @@ void rvt_destroy_ah(struct ib_ah *ibah, u32 destroy_flags) struct rvt_ah *ah = ibah_to_rvtah(ibah); unsigned long flags; - WARN_ON_ONCE(atomic_read(&ah->refcount)); - spin_lock_irqsave(&dev->n_ahs_lock, flags); dev->n_ahs_allocated--; spin_unlock_irqrestore(&dev->n_ahs_lock, flags); diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index b9035d969057..de7d2edb9781 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -978,6 +978,51 @@ static u8 get_allowed_ops(enum ib_qp_type type) IB_OPCODE_UC : IB_OPCODE_UD; } +/** + * free_ud_wq_attr - Clean up AH attribute cache for UD QPs + * @qp: Valid QP with allowed_ops set + * + * The rvt_swqe data structure being used is a union, so this is + * only valid for UD QPs. + */ +static void free_ud_wq_attr(struct rvt_qp *qp) +{ + struct rvt_swqe *wqe; + int i; + + for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) { + wqe = rvt_get_swqe_ptr(qp, i); + kfree(wqe->ud_wr.attr); + wqe->ud_wr.attr = NULL; + } +} + +/** + * alloc_ud_wq_attr - AH attribute cache for UD QPs + * @qp: Valid QP with allowed_ops set + * @node: Numa node for allocation + * + * The rvt_swqe data structure being used is a union, so this is + * only valid for UD QPs. + */ +static int alloc_ud_wq_attr(struct rvt_qp *qp, int node) +{ + struct rvt_swqe *wqe; + int i; + + for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) { + wqe = rvt_get_swqe_ptr(qp, i); + wqe->ud_wr.attr = kzalloc_node(sizeof(*wqe->ud_wr.attr), + GFP_KERNEL, node); + if (!wqe->ud_wr.attr) { + free_ud_wq_attr(qp); + return -ENOMEM; + } + } + + return 0; +} + /** * rvt_create_qp - create a queue pair for a device * @ibpd: the protection domain who's device we create the queue pair for @@ -1124,6 +1169,11 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, qp->s_max_sge = init_attr->cap.max_send_sge; if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) qp->s_flags = RVT_S_SIGNAL_REQ_WR; + err = alloc_ud_wq_attr(qp, rdi->dparms.node); + if (err) { + ret = (ERR_PTR(err)); + goto bail_driver_priv; + } err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table, init_attr->qp_type, @@ -1227,6 +1277,7 @@ bail_qpn: bail_rq_wq: rvt_free_rq(&qp->r_rq); + free_ud_wq_attr(qp); bail_driver_priv: rdi->driver_f.qp_priv_free(rdi, qp); @@ -1671,6 +1722,7 @@ int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) kfree(qp->s_ack_queue); rdma_destroy_ah_attr(&qp->remote_ah_attr); rdma_destroy_ah_attr(&qp->alt_ah_attr); + free_ud_wq_attr(qp); vfree(qp->s_wq); kfree(qp); return 0; @@ -2037,10 +2089,10 @@ static int rvt_post_one_wr(struct rvt_qp *qp, */ log_pmtu = qp->log_pmtu; if (qp->allowed_ops == IB_OPCODE_UD) { - struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah); + struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.wr.ah); log_pmtu = ah->log_pmtu; - atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount); + rdma_copy_ah_attr(wqe->ud_wr.attr, &ah->attr); } if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) { @@ -2085,7 +2137,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp, bail_inval_free_ref: if (qp->allowed_ops == IB_OPCODE_UD) - atomic_dec(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount); + rdma_destroy_ah_attr(wqe->ud_wr.attr); bail_inval_free: /* release mr holds */ while (j) { diff --git 
a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 997f42678806..525848e227dc 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -2,7 +2,7 @@ #define DEF_RDMA_VT_H /* - * Copyright(c) 2016 - 2018 Intel Corporation. + * Copyright(c) 2016 - 2019 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -202,7 +202,6 @@ struct rvt_pd { struct rvt_ah { struct ib_ah ibah; struct rdma_ah_attr attr; - atomic_t refcount; u8 vl; u8 log_pmtu; }; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index e4be869c4f21..9531de2fabe2 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -2,7 +2,7 @@ #define DEF_RDMAVT_INCQP_H /* - * Copyright(c) 2016 - 2018 Intel Corporation. + * Copyright(c) 2016 - 2019 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -157,6 +157,22 @@ #define RVT_SEND_RESERVE_USED IB_SEND_RESERVED_START #define RVT_SEND_COMPLETION_ONLY (IB_SEND_RESERVED_START << 1) +/** + * rvt_ud_wr - IB UD work plus AH cache + * @wr: valid IB work request + * @attr: pointer to an allocated AH attribute + * + * Special case the UD WR so we can keep track of the AH attributes. + * + * NOTE: This data structure is stricly ordered wr then attr. I.e the attr + * MUST come after wr. The ib_ud_wr is sized and copied in rvt_post_one_wr. + * The copy assumes that wr is first. + */ +struct rvt_ud_wr { + struct ib_ud_wr wr; + struct rdma_ah_attr *attr; +}; + /* * Send work request queue entry. * The size of the sg_list is determined when the QP is created and stored @@ -165,7 +181,7 @@ struct rvt_swqe { union { struct ib_send_wr wr; /* don't use wr.sg_list */ - struct ib_ud_wr ud_wr; + struct rvt_ud_wr ud_wr; struct ib_reg_wr reg_wr; struct ib_rdma_wr rdma_wr; struct ib_atomic_wr atomic_wr; @@ -700,7 +716,7 @@ static inline void rvt_put_qp_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) { rvt_put_swqe(wqe); if (qp->allowed_ops == IB_OPCODE_UD) - atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); + rdma_destroy_ah_attr(wqe->ud_wr.attr); } /** -- cgit v1.2.3 From 2b0ad2da8fd4c32f63d9142f2de43a4d34fdd679 Mon Sep 17 00:00:00 2001 From: Michael J. Ruhl Date: Fri, 28 Jun 2019 14:22:11 -0400 Subject: IB/{rdmavt, hfi1, qib}: Add helpers to hide SWQE WR details Add some helper functions to hide struct rvt_swqe details. Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. 
Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/qp.c | 2 +- drivers/infiniband/hw/hfi1/ud.c | 29 +++++++++++----------- drivers/infiniband/hw/qib/qib_qp.c | 2 +- drivers/infiniband/hw/qib/qib_ud.c | 21 ++++++++-------- drivers/infiniband/sw/rdmavt/qp.c | 2 +- include/rdma/rdmavt_qp.h | 50 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 79 insertions(+), 27 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index a84b44af7b97..f8e733aa3bb8 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -348,7 +348,7 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send) break; case IB_QPT_GSI: case IB_QPT_UD: - ah = ibah_to_rvtah(wqe->ud_wr.wr.ah); + ah = rvt_get_swqe_ah(wqe); if (wqe->length > (1 << ah->log_pmtu)) return -EINVAL; if (ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)] == 0xf) diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index f8e796e45517..e804af71b629 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -87,7 +87,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) rcu_read_lock(); qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp, - swqe->ud_wr.wr.remote_qpn); + rvt_get_swqe_remote_qpn(swqe)); if (!qp) { ibp->rvp.n_pkt_drops++; rcu_read_unlock(); @@ -105,7 +105,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) goto drop; } - ah_attr = swqe->ud_wr.attr; + ah_attr = rvt_get_swqe_ah_attr(swqe); ppd = ppd_from_ibp(ibp); if (qp->ibqp.qp_num > 1) { @@ -135,8 +135,8 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (qp->ibqp.qp_num) { u32 qkey; - qkey = (int)swqe->ud_wr.wr.remote_qkey < 0 ? - sqp->qkey : swqe->ud_wr.wr.remote_qkey; + qkey = (int)rvt_get_swqe_remote_qkey(swqe) < 0 ? + sqp->qkey : rvt_get_swqe_remote_qkey(swqe); if (unlikely(qkey != qp->qkey)) goto drop; /* silently drop per IBTA spec */ } @@ -240,7 +240,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) { if (sqp->ibqp.qp_type == IB_QPT_GSI || sqp->ibqp.qp_type == IB_QPT_SMI) - wc.pkey_index = swqe->ud_wr.wr.pkey_index; + wc.pkey_index = rvt_get_swqe_pkey_index(swqe); else wc.pkey_index = sqp->s_pkey_index; } else { @@ -282,20 +282,21 @@ static void hfi1_make_bth_deth(struct rvt_qp *qp, struct rvt_swqe *wqe, bth0 |= IB_BTH_SOLICITED; bth0 |= extra_bytes << 20; if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) - *pkey = hfi1_get_pkey(ibp, wqe->ud_wr.wr.pkey_index); + *pkey = hfi1_get_pkey(ibp, rvt_get_swqe_pkey_index(wqe)); else *pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); if (!bypass) bth0 |= *pkey; ohdr->bth[0] = cpu_to_be32(bth0); - ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.wr.remote_qpn); + ohdr->bth[1] = cpu_to_be32(rvt_get_swqe_remote_qpn(wqe)); ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn)); /* * Qkeys with the high order bit set mean use the * qkey from the QP context instead of the WR (see 10.2.5). */ - ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.wr.remote_qkey < 0 ? - qp->qkey : wqe->ud_wr.wr.remote_qkey); + ohdr->u.ud.deth[0] = + cpu_to_be32((int)rvt_get_swqe_remote_qkey(wqe) < 0 ? 
qp->qkey : + rvt_get_swqe_remote_qkey(wqe)); ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); } @@ -315,7 +316,7 @@ void hfi1_make_ud_req_9B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); - ah_attr = wqe->ud_wr.attr; + ah_attr = rvt_get_swqe_ah_attr(wqe); extra_bytes = -wqe->length & 3; nwords = ((wqe->length + extra_bytes) >> 2) + SIZE_OF_CRC; @@ -379,7 +380,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, struct hfi1_pportdata *ppd; struct hfi1_ibport *ibp; u32 dlid, slid, nwords, extra_bytes; - u32 dest_qp = wqe->ud_wr.wr.remote_qpn; + u32 dest_qp = rvt_get_swqe_remote_qpn(wqe); u32 src_qp = qp->ibqp.qp_num; u16 len, pkey; u8 l4, sc5; @@ -387,7 +388,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); - ah_attr = wqe->ud_wr.attr; + ah_attr = rvt_get_swqe_ah_attr(wqe); /* * Build 16B Management Packet if either the destination @@ -449,7 +450,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, if (is_mgmt) { l4 = OPA_16B_L4_FM; - pkey = hfi1_get_pkey(ibp, wqe->ud_wr.wr.pkey_index); + pkey = hfi1_get_pkey(ibp, rvt_get_swqe_pkey_index(wqe)); hfi1_16B_set_qpn(&ps->s_txreq->phdr.hdr.opah.u.mgmt, dest_qp, src_qp); } else { @@ -514,7 +515,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) /* Construct the header. */ ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); - ah_attr = wqe->ud_wr.attr; + ah_attr = rvt_get_swqe_ah_attr(wqe); priv->hdr_type = hfi1_get_hdr_type(ppd->lid, ah_attr); if ((!hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) || (rdma_ah_get_dlid(ah_attr) == be32_to_cpu(OPA_LID_PERMISSIVE))) { diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 0e1d0d692891..8d0563ef5be1 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -398,7 +398,7 @@ int qib_check_send_wqe(struct rvt_qp *qp, case IB_QPT_SMI: case IB_QPT_GSI: case IB_QPT_UD: - ah = ibah_to_rvtah(wqe->ud_wr.wr.ah); + ah = rvt_get_swqe_ah(wqe); if (wqe->length > (1 << ah->log_pmtu)) return -EINVAL; /* progress hint */ diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index d8c2c968909f..93ca21347959 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -64,7 +64,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) enum ib_qp_type sqptype, dqptype; rcu_read_lock(); - qp = rvt_lookup_qpn(rdi, &ibp->rvp, swqe->ud_wr.wr.remote_qpn); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, rvt_get_swqe_remote_qpn(swqe)); if (!qp) { ibp->rvp.n_pkt_drops++; goto drop; @@ -81,7 +81,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) goto drop; } - ah_attr = swqe->ud_wr.attr; + ah_attr = rvt_get_swqe_ah_attr(swqe); ppd = ppd_from_ibp(ibp); if (qp->ibqp.qp_num > 1) { @@ -111,8 +111,8 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (qp->ibqp.qp_num) { u32 qkey; - qkey = (int)swqe->ud_wr.wr.remote_qkey < 0 ? - sqp->qkey : swqe->ud_wr.wr.remote_qkey; + qkey = (int)rvt_get_swqe_remote_qkey(swqe) < 0 ? + sqp->qkey : rvt_get_swqe_remote_qkey(swqe); if (unlikely(qkey != qp->qkey)) goto drop; } @@ -204,7 +204,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) wc.qp = &qp->ibqp; wc.src_qp = sqp->ibqp.qp_num; wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ? 
- swqe->ud_wr.wr.pkey_index : 0; + rvt_get_swqe_pkey_index(swqe) : 0; wc.slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & ((1 << ppd->lmc) - 1)); wc.sl = rdma_ah_get_sl(ah_attr); @@ -271,7 +271,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) /* Construct the header. */ ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); - ah_attr = wqe->ud_wr.attr; + ah_attr = rvt_get_swqe_ah_attr(wqe); if (rdma_ah_get_dlid(ah_attr) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { if (rdma_ah_get_dlid(ah_attr) != be16_to_cpu(IB_LID_PERMISSIVE)) @@ -363,7 +363,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) bth0 |= extra_bytes << 20; bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? QIB_DEFAULT_P_KEY : qib_get_pkey(ibp, qp->ibqp.qp_type == IB_QPT_GSI ? - wqe->ud_wr.wr.pkey_index : qp->s_pkey_index); + rvt_get_swqe_pkey_index(wqe) : qp->s_pkey_index); ohdr->bth[0] = cpu_to_be32(bth0); /* * Use the multicast QP if the destination LID is a multicast LID. @@ -372,14 +372,15 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) be16_to_cpu(IB_MULTICAST_LID_BASE) && rdma_ah_get_dlid(ah_attr) != be16_to_cpu(IB_LID_PERMISSIVE) ? cpu_to_be32(QIB_MULTICAST_QPN) : - cpu_to_be32(wqe->ud_wr.wr.remote_qpn); + cpu_to_be32(rvt_get_swqe_remote_qpn(wqe)); ohdr->bth[2] = cpu_to_be32(wqe->psn & QIB_PSN_MASK); /* * Qkeys with the high order bit set mean use the * qkey from the QP context instead of the WR (see 10.2.5). */ - ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.wr.remote_qkey < 0 ? - qp->qkey : wqe->ud_wr.wr.remote_qkey); + ohdr->u.ud.deth[0] = + cpu_to_be32((int)rvt_get_swqe_remote_qkey(wqe) < 0 ? qp->qkey : + rvt_get_swqe_remote_qkey(wqe)); ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); done: diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index de7d2edb9781..11b4d3c1efd4 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -2089,7 +2089,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp, */ log_pmtu = qp->log_pmtu; if (qp->allowed_ops == IB_OPCODE_UD) { - struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.wr.ah); + struct rvt_ah *ah = rvt_get_swqe_ah(wqe); log_pmtu = ah->log_pmtu; rdma_copy_ah_attr(wqe->ud_wr.attr, &ah->attr); diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 9531de2fabe2..0eeea520a853 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -219,6 +219,56 @@ struct rvt_krwq { struct rvt_rwqe wq[]; }; +/* + * rvt_get_swqe_ah - Return the pointer to the struct rvt_ah + * @swqe: valid Send WQE + * + */ +static inline struct rvt_ah *rvt_get_swqe_ah(struct rvt_swqe *swqe) +{ + return ibah_to_rvtah(swqe->ud_wr.wr.ah); +} + +/** + * rvt_get_swqe_ah_attr - Return the cached ah attribute information + * @swqe: valid Send WQE + * + */ +static inline struct rdma_ah_attr *rvt_get_swqe_ah_attr(struct rvt_swqe *swqe) +{ + return swqe->ud_wr.attr; +} + +/** + * rvt_get_swqe_remote_qpn - Access the remote QPN value + * @swqe: valid Send WQE + * + */ +static inline u32 rvt_get_swqe_remote_qpn(struct rvt_swqe *swqe) +{ + return swqe->ud_wr.wr.remote_qpn; +} + +/** + * rvt_get_swqe_remote_qkey - Acces the remote qkey value + * @swqe: valid Send WQE + * + */ +static inline u32 rvt_get_swqe_remote_qkey(struct rvt_swqe *swqe) +{ + return swqe->ud_wr.wr.remote_qkey; +} + +/** + * rvt_get_swqe_pkey_index - Access the pkey index + * @swqe: valid Send WQE + * + */ +static inline u16 rvt_get_swqe_pkey_index(struct rvt_swqe *swqe) +{ + return swqe->ud_wr.wr.pkey_index; 
+} + struct rvt_rq { struct rvt_rwq *wq; struct rvt_krwq *kwq; -- cgit v1.2.3 From 315aed110c16ac806f25fd85eff8b34579ed101d Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 28 Jun 2019 14:22:33 -0400 Subject: IB/rdmavt: Enhance trace information for FRWR debug This patch enhances the MR trace information to enable more focused debug of MR issues. Reviewed-by: Kaike Wan Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/mr.c | 2 +- drivers/infiniband/sw/rdmavt/trace_mr.h | 20 +++++++++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 0867a11d074e..23ddc63cae61 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -612,8 +612,8 @@ static int rvt_set_page(struct ib_mr *ibmr, u64 addr) n = mapped_segs % RVT_SEGSZ; mr->mr.map[m]->segs[n].vaddr = (void *)addr; mr->mr.map[m]->segs[n].length = ps; - trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps); mr->mr.length += ps; + trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps); return 0; } diff --git a/drivers/infiniband/sw/rdmavt/trace_mr.h b/drivers/infiniband/sw/rdmavt/trace_mr.h index 976e482930a3..f43e477a8c91 100644 --- a/drivers/infiniband/sw/rdmavt/trace_mr.h +++ b/drivers/infiniband/sw/rdmavt/trace_mr.h @@ -64,8 +64,12 @@ DECLARE_EVENT_CLASS( RDI_DEV_ENTRY(ib_to_rvt(mr->pd->device)) __field(void *, vaddr) __field(struct page *, page) + __field(u64, iova) + __field(u64, user_base) __field(size_t, len) + __field(size_t, length) __field(u32, lkey) + __field(u32, offset) __field(u16, m) __field(u16, n) ), @@ -73,18 +77,28 @@ DECLARE_EVENT_CLASS( RDI_DEV_ASSIGN(ib_to_rvt(mr->pd->device)); __entry->vaddr = v; __entry->page = virt_to_page(v); + __entry->iova = mr->iova; + __entry->user_base = mr->user_base; + __entry->lkey = mr->lkey; __entry->m = m; __entry->n = n; __entry->len = len; + __entry->length = mr->length; + __entry->offset = mr->offset; ), TP_printk( - "[%s] vaddr %p page %p m %u n %u len %ld", + "[%s] lkey %x iova %llx user_base %llx mr_len %lu vaddr %llx page %p m %u n %u len %lu off %u", __get_str(dev), - __entry->vaddr, + __entry->lkey, + __entry->iova, + __entry->user_base, + __entry->length, + (unsigned long long)__entry->vaddr, __entry->page, __entry->m, __entry->n, - __entry->len + __entry->len, + __entry->offset ) ); -- cgit v1.2.3 From 8bd516bd0d53f71594340dc644f6fbc4278a8ab1 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 28 Jun 2019 14:22:39 -0400 Subject: IB/rdmavt: Add trace for map_mr_sg Add trace to debug map_mr_sg handling. 
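As a usage note (not part of this patch), the new rvt_map_mr_sg event lives in the rvt_mr trace system shown above and can be switched on at runtime through tracefs. A minimal user-space sketch, assuming tracefs is mounted at the conventional /sys/kernel/debug/tracing location (the exact mount point can differ per system):

/* Sketch only: enable the rvt_map_mr_sg tracepoint from user space.
 * The path below assumes tracefs under /sys/kernel/debug/tracing.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/tracing/events/rvt_mr/rvt_map_mr_sg/enable", "w");

	if (!f)
		return 1;
	fputs("1", f);
	return fclose(f) ? 1 : 0;
}

The recorded events then show up in the trace buffer with the iova, user_base, length and sg_nents fields defined by the TP_printk() format above.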
Reviewed-by: Kaike Wan Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/mr.c | 1 + drivers/infiniband/sw/rdmavt/trace_mr.h | 36 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 23ddc63cae61..a6a39f01dca3 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -642,6 +642,7 @@ int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, mr->mr.iova = ibmr->iova; mr->mr.offset = ibmr->iova - (u64)mr->mr.map[0]->segs[0].vaddr; mr->mr.length = (size_t)ibmr->length; + trace_rvt_map_mr_sg(ibmr, sg_nents, sg_offset); return ret; } diff --git a/drivers/infiniband/sw/rdmavt/trace_mr.h b/drivers/infiniband/sw/rdmavt/trace_mr.h index f43e477a8c91..95b8a0e3b8bd 100644 --- a/drivers/infiniband/sw/rdmavt/trace_mr.h +++ b/drivers/infiniband/sw/rdmavt/trace_mr.h @@ -54,6 +54,8 @@ #include #include +#include "mr.h" + #undef TRACE_SYSTEM #define TRACE_SYSTEM rvt_mr DECLARE_EVENT_CLASS( @@ -179,6 +181,40 @@ DEFINE_EVENT( TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge), TP_ARGS(sge, isge)); +TRACE_EVENT( + rvt_map_mr_sg, + TP_PROTO(struct ib_mr *ibmr, int sg_nents, unsigned int *sg_offset), + TP_ARGS(ibmr, sg_nents, sg_offset), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(to_imr(ibmr)->mr.pd->device)) + __field(u64, iova) + __field(u64, ibmr_iova) + __field(u64, user_base) + __field(u64, ibmr_length) + __field(int, sg_nents) + __field(uint, sg_offset) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(to_imr(ibmr)->mr.pd->device)) + __entry->ibmr_iova = ibmr->iova; + __entry->iova = to_imr(ibmr)->mr.iova; + __entry->user_base = to_imr(ibmr)->mr.user_base; + __entry->ibmr_length = to_imr(ibmr)->mr.length; + __entry->sg_nents = sg_nents; + __entry->sg_offset = sg_offset ? *sg_offset : 0; + ), + TP_printk( + "[%s] ibmr_iova %llx iova %llx user_base %llx length %llx sg_nents %d sg_offset %u", + __get_str(dev), + __entry->ibmr_iova, + __entry->iova, + __entry->user_base, + __entry->ibmr_length, + __entry->sg_nents, + __entry->sg_offset + ) +); + #endif /* __RVT_TRACE_MR_H */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 0e935ae6afcdbe6f0c0aa457ae57feccc63bb9be Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:23 +0200 Subject: rdma/siw: iWarp wire packet format Broken up commit to add the Soft iWarp RDMA driver. 
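The header added below defines the iWARP wire formats (MPA, DDP, RDMAP) together with small helpers for the bit-packed control fields. As a minimal sketch of how a caller might fill in an MPA request frame with these definitions (illustrative only, not taken from the driver; example_init_mpa_req is a made-up name):

/* Illustrative sketch: build an MPA request header using the helpers
 * declared in iwarp.h. Error handling and the socket transmit path
 * are omitted.
 */
static void example_init_mpa_req(struct mpa_rr *rr, u16 pd_len)
{
	memset(rr, 0, sizeof(*rr));
	memcpy(rr->key, MPA_KEY_REQ, 16);

	/* Advertise MPA revision 2 and request CRC protection. */
	__mpa_rr_set_revision(&rr->params.bits, MPA_REVISION_2);
	rr->params.bits |= MPA_RR_FLAG_CRC;

	/* Length of private data that follows this header, if any. */
	rr->params.pd_len = cpu_to_be16(pd_len);
}

The receiving side would use __mpa_rr_revision() and the MPA_RR_FLAG_* masks on params.bits to validate what the peer advertised.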
Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/iwarp.h | 380 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 380 insertions(+) create mode 100644 drivers/infiniband/sw/siw/iwarp.h (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/iwarp.h b/drivers/infiniband/sw/siw/iwarp.h new file mode 100644 index 000000000000..e8a04d9c89cb --- /dev/null +++ b/drivers/infiniband/sw/siw/iwarp.h @@ -0,0 +1,380 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _IWARP_H +#define _IWARP_H + +#include /* RDMA_MAX_PRIVATE_DATA */ +#include +#include + +#define RDMAP_VERSION 1 +#define DDP_VERSION 1 +#define MPA_REVISION_1 1 +#define MPA_REVISION_2 2 +#define MPA_MAX_PRIVDATA RDMA_MAX_PRIVATE_DATA +#define MPA_KEY_REQ "MPA ID Req Frame" +#define MPA_KEY_REP "MPA ID Rep Frame" +#define MPA_IRD_ORD_MASK 0x3fff + +struct mpa_rr_params { + __be16 bits; + __be16 pd_len; +}; + +/* + * MPA request/response header bits & fields + */ +enum { + MPA_RR_FLAG_MARKERS = cpu_to_be16(0x8000), + MPA_RR_FLAG_CRC = cpu_to_be16(0x4000), + MPA_RR_FLAG_REJECT = cpu_to_be16(0x2000), + MPA_RR_FLAG_ENHANCED = cpu_to_be16(0x1000), + MPA_RR_FLAG_GSO_EXP = cpu_to_be16(0x0800), + MPA_RR_MASK_REVISION = cpu_to_be16(0x00ff) +}; + +/* + * MPA request/reply header + */ +struct mpa_rr { + __u8 key[16]; + struct mpa_rr_params params; +}; + +static inline void __mpa_rr_set_revision(__be16 *bits, u8 rev) +{ + *bits = (*bits & ~MPA_RR_MASK_REVISION) | + (cpu_to_be16(rev) & MPA_RR_MASK_REVISION); +} + +static inline u8 __mpa_rr_revision(__be16 mpa_rr_bits) +{ + __be16 rev = mpa_rr_bits & MPA_RR_MASK_REVISION; + + return be16_to_cpu(rev); +} + +enum mpa_v2_ctrl { + MPA_V2_PEER_TO_PEER = cpu_to_be16(0x8000), + MPA_V2_ZERO_LENGTH_RTR = cpu_to_be16(0x4000), + MPA_V2_RDMA_WRITE_RTR = cpu_to_be16(0x8000), + MPA_V2_RDMA_READ_RTR = cpu_to_be16(0x4000), + MPA_V2_RDMA_NO_RTR = cpu_to_be16(0x0000), + MPA_V2_MASK_IRD_ORD = cpu_to_be16(0x3fff) +}; + +struct mpa_v2_data { + __be16 ird; + __be16 ord; +}; + +struct mpa_marker { + __be16 rsvd; + __be16 fpdu_hmd; /* FPDU header-marker distance (= MPA's FPDUPTR) */ +}; + +/* + * maximum MPA trailer + */ +struct mpa_trailer { + __u8 pad[4]; + __be32 crc; +}; + +#define MPA_HDR_SIZE 2 +#define MPA_CRC_SIZE 4 + +/* + * Common portion of iWARP headers (MPA, DDP, RDMAP) + * for any FPDU + */ +struct iwarp_ctrl { + __be16 mpa_len; + __be16 ddp_rdmap_ctrl; +}; + +/* + * DDP/RDMAP Hdr bits & fields + */ +enum { + DDP_FLAG_TAGGED = cpu_to_be16(0x8000), + DDP_FLAG_LAST = cpu_to_be16(0x4000), + DDP_MASK_RESERVED = cpu_to_be16(0x3C00), + DDP_MASK_VERSION = cpu_to_be16(0x0300), + RDMAP_MASK_VERSION = cpu_to_be16(0x00C0), + RDMAP_MASK_RESERVED = cpu_to_be16(0x0030), + RDMAP_MASK_OPCODE = cpu_to_be16(0x000f) +}; + +static inline u8 __ddp_get_version(struct iwarp_ctrl *ctrl) +{ + return be16_to_cpu(ctrl->ddp_rdmap_ctrl & DDP_MASK_VERSION) >> 8; +} + +static inline void __ddp_set_version(struct iwarp_ctrl *ctrl, u8 version) +{ + ctrl->ddp_rdmap_ctrl = + (ctrl->ddp_rdmap_ctrl & ~DDP_MASK_VERSION) | + (cpu_to_be16((u16)version << 8) & DDP_MASK_VERSION); +} + +static inline u8 __rdmap_get_version(struct iwarp_ctrl *ctrl) +{ + __be16 ver = ctrl->ddp_rdmap_ctrl & RDMAP_MASK_VERSION; + + return be16_to_cpu(ver) >> 6; +} + +static inline void __rdmap_set_version(struct iwarp_ctrl *ctrl, u8 version) +{ + ctrl->ddp_rdmap_ctrl = (ctrl->ddp_rdmap_ctrl & 
~RDMAP_MASK_VERSION) | + (cpu_to_be16(version << 6) & RDMAP_MASK_VERSION); +} + +static inline u8 __rdmap_get_opcode(struct iwarp_ctrl *ctrl) +{ + return be16_to_cpu(ctrl->ddp_rdmap_ctrl & RDMAP_MASK_OPCODE); +} + +static inline void __rdmap_set_opcode(struct iwarp_ctrl *ctrl, u8 opcode) +{ + ctrl->ddp_rdmap_ctrl = (ctrl->ddp_rdmap_ctrl & ~RDMAP_MASK_OPCODE) | + (cpu_to_be16(opcode) & RDMAP_MASK_OPCODE); +} + +struct iwarp_rdma_write { + struct iwarp_ctrl ctrl; + __be32 sink_stag; + __be64 sink_to; +}; + +struct iwarp_rdma_rreq { + struct iwarp_ctrl ctrl; + __be32 rsvd; + __be32 ddp_qn; + __be32 ddp_msn; + __be32 ddp_mo; + __be32 sink_stag; + __be64 sink_to; + __be32 read_size; + __be32 source_stag; + __be64 source_to; +}; + +struct iwarp_rdma_rresp { + struct iwarp_ctrl ctrl; + __be32 sink_stag; + __be64 sink_to; +}; + +struct iwarp_send { + struct iwarp_ctrl ctrl; + __be32 rsvd; + __be32 ddp_qn; + __be32 ddp_msn; + __be32 ddp_mo; +}; + +struct iwarp_send_inv { + struct iwarp_ctrl ctrl; + __be32 inval_stag; + __be32 ddp_qn; + __be32 ddp_msn; + __be32 ddp_mo; +}; + +struct iwarp_terminate { + struct iwarp_ctrl ctrl; + __be32 rsvd; + __be32 ddp_qn; + __be32 ddp_msn; + __be32 ddp_mo; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __be32 layer : 4; + __be32 etype : 4; + __be32 ecode : 8; + __be32 flag_m : 1; + __be32 flag_d : 1; + __be32 flag_r : 1; + __be32 reserved : 13; +#elif defined(__BIG_ENDIAN_BITFIELD) + __be32 reserved : 13; + __be32 flag_r : 1; + __be32 flag_d : 1; + __be32 flag_m : 1; + __be32 ecode : 8; + __be32 etype : 4; + __be32 layer : 4; +#else +#error "undefined byte order" +#endif +}; + +/* + * Terminate Hdr bits & fields + */ +enum { + TERM_MASK_LAYER = cpu_to_be32(0xf0000000), + TERM_MASK_ETYPE = cpu_to_be32(0x0f000000), + TERM_MASK_ECODE = cpu_to_be32(0x00ff0000), + TERM_FLAG_M = cpu_to_be32(0x00008000), + TERM_FLAG_D = cpu_to_be32(0x00004000), + TERM_FLAG_R = cpu_to_be32(0x00002000), + TERM_MASK_RESVD = cpu_to_be32(0x00001fff) +}; + +static inline u8 __rdmap_term_layer(struct iwarp_terminate *term) +{ + return term->layer; +} + +static inline void __rdmap_term_set_layer(struct iwarp_terminate *term, + u8 layer) +{ + term->layer = layer & 0xf; +} + +static inline u8 __rdmap_term_etype(struct iwarp_terminate *term) +{ + return term->etype; +} + +static inline void __rdmap_term_set_etype(struct iwarp_terminate *term, + u8 etype) +{ + term->etype = etype & 0xf; +} + +static inline u8 __rdmap_term_ecode(struct iwarp_terminate *term) +{ + return term->ecode; +} + +static inline void __rdmap_term_set_ecode(struct iwarp_terminate *term, + u8 ecode) +{ + term->ecode = ecode; +} + +/* + * Common portion of iWARP headers (MPA, DDP, RDMAP) + * for an FPDU carrying an untagged DDP segment + */ +struct iwarp_ctrl_untagged { + struct iwarp_ctrl ctrl; + __be32 rsvd; + __be32 ddp_qn; + __be32 ddp_msn; + __be32 ddp_mo; +}; + +/* + * Common portion of iWARP headers (MPA, DDP, RDMAP) + * for an FPDU carrying a tagged DDP segment + */ +struct iwarp_ctrl_tagged { + struct iwarp_ctrl ctrl; + __be32 ddp_stag; + __be64 ddp_to; +}; + +union iwarp_hdr { + struct iwarp_ctrl ctrl; + struct iwarp_ctrl_untagged c_untagged; + struct iwarp_ctrl_tagged c_tagged; + struct iwarp_rdma_write rwrite; + struct iwarp_rdma_rreq rreq; + struct iwarp_rdma_rresp rresp; + struct iwarp_terminate terminate; + struct iwarp_send send; + struct iwarp_send_inv send_inv; +}; + +enum term_elayer { + TERM_ERROR_LAYER_RDMAP = 0x00, + TERM_ERROR_LAYER_DDP = 0x01, + TERM_ERROR_LAYER_LLP = 0x02 /* eg., MPA */ +}; + +enum ddp_etype { 
+ DDP_ETYPE_CATASTROPHIC = 0x0, + DDP_ETYPE_TAGGED_BUF = 0x1, + DDP_ETYPE_UNTAGGED_BUF = 0x2, + DDP_ETYPE_RSVD = 0x3 +}; + +enum ddp_ecode { + /* unspecified, set to zero */ + DDP_ECODE_CATASTROPHIC = 0x00, + /* Tagged Buffer Errors */ + DDP_ECODE_T_INVALID_STAG = 0x00, + DDP_ECODE_T_BASE_BOUNDS = 0x01, + DDP_ECODE_T_STAG_NOT_ASSOC = 0x02, + DDP_ECODE_T_TO_WRAP = 0x03, + DDP_ECODE_T_VERSION = 0x04, + /* Untagged Buffer Errors */ + DDP_ECODE_UT_INVALID_QN = 0x01, + DDP_ECODE_UT_INVALID_MSN_NOBUF = 0x02, + DDP_ECODE_UT_INVALID_MSN_RANGE = 0x03, + DDP_ECODE_UT_INVALID_MO = 0x04, + DDP_ECODE_UT_MSG_TOOLONG = 0x05, + DDP_ECODE_UT_VERSION = 0x06 +}; + +enum rdmap_untagged_qn { + RDMAP_UNTAGGED_QN_SEND = 0, + RDMAP_UNTAGGED_QN_RDMA_READ = 1, + RDMAP_UNTAGGED_QN_TERMINATE = 2, + RDMAP_UNTAGGED_QN_COUNT = 3 +}; + +enum rdmap_etype { + RDMAP_ETYPE_CATASTROPHIC = 0x0, + RDMAP_ETYPE_REMOTE_PROTECTION = 0x1, + RDMAP_ETYPE_REMOTE_OPERATION = 0x2 +}; + +enum rdmap_ecode { + RDMAP_ECODE_INVALID_STAG = 0x00, + RDMAP_ECODE_BASE_BOUNDS = 0x01, + RDMAP_ECODE_ACCESS_RIGHTS = 0x02, + RDMAP_ECODE_STAG_NOT_ASSOC = 0x03, + RDMAP_ECODE_TO_WRAP = 0x04, + RDMAP_ECODE_VERSION = 0x05, + RDMAP_ECODE_OPCODE = 0x06, + RDMAP_ECODE_CATASTROPHIC_STREAM = 0x07, + RDMAP_ECODE_CATASTROPHIC_GLOBAL = 0x08, + RDMAP_ECODE_CANNOT_INVALIDATE = 0x09, + RDMAP_ECODE_UNSPECIFIED = 0xff +}; + +enum llp_ecode { + LLP_ECODE_TCP_STREAM_LOST = 0x01, /* How to transfer this ?? */ + LLP_ECODE_RECEIVED_CRC = 0x02, + LLP_ECODE_FPDU_START = 0x03, + LLP_ECODE_INVALID_REQ_RESP = 0x04, + + /* Errors for Enhanced Connection Establishment only */ + LLP_ECODE_LOCAL_CATASTROPHIC = 0x05, + LLP_ECODE_INSUFFICIENT_IRD = 0x06, + LLP_ECODE_NO_MATCHING_RTR = 0x07 +}; + +enum llp_etype { LLP_ETYPE_MPA = 0x00 }; + +enum rdma_opcode { + RDMAP_RDMA_WRITE = 0x0, + RDMAP_RDMA_READ_REQ = 0x1, + RDMAP_RDMA_READ_RESP = 0x2, + RDMAP_SEND = 0x3, + RDMAP_SEND_INVAL = 0x4, + RDMAP_SEND_SE = 0x5, + RDMAP_SEND_SE_INVAL = 0x6, + RDMAP_TERMINATE = 0x7, + RDMAP_NOT_SUPPORTED = RDMAP_TERMINATE + 1 +}; + +#endif -- cgit v1.2.3 From a531975279f3e5dd7323da09077ec848067bb313 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:24 +0200 Subject: rdma/siw: main include file Broken up commit to add the Soft iWarp RDMA driver. Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw.h | 745 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 745 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw.h (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h new file mode 100644 index 000000000000..03fd7b2f595f --- /dev/null +++ b/drivers/infiniband/sw/siw/siw.h @@ -0,0 +1,745 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _SIW_H +#define _SIW_H + +#include +#include +#include +#include +#include +#include + +#include +#include "iwarp.h" + +#define SIW_VENDOR_ID 0x626d74 /* ascii 'bmt' for now */ +#define SIW_VENDORT_PART_ID 0 +#define SIW_MAX_QP (1024 * 100) +#define SIW_MAX_QP_WR (1024 * 32) +#define SIW_MAX_ORD_QP 128 +#define SIW_MAX_IRD_QP 128 +#define SIW_MAX_SGE_PBL 256 /* max num sge's for PBL */ +#define SIW_MAX_SGE_RD 1 /* iwarp limitation. 
we could relax */ +#define SIW_MAX_CQ (1024 * 100) +#define SIW_MAX_CQE (SIW_MAX_QP_WR * 100) +#define SIW_MAX_MR (SIW_MAX_QP * 10) +#define SIW_MAX_PD SIW_MAX_QP +#define SIW_MAX_MW 0 /* to be set if MW's are supported */ +#define SIW_MAX_FMR SIW_MAX_MR +#define SIW_MAX_SRQ SIW_MAX_QP +#define SIW_MAX_SRQ_WR (SIW_MAX_QP_WR * 10) +#define SIW_MAX_CONTEXT SIW_MAX_PD + +/* Min number of bytes for using zero copy transmit */ +#define SENDPAGE_THRESH PAGE_SIZE + +/* Maximum number of frames which can be send in one SQ processing */ +#define SQ_USER_MAXBURST 100 + +/* Maximum number of consecutive IRQ elements which get served + * if SQ has pending work. Prevents starving local SQ processing + * by serving peer Read Requests. + */ +#define SIW_IRQ_MAXBURST_SQ_ACTIVE 4 + +struct siw_dev_cap { + int max_qp; + int max_qp_wr; + int max_ord; /* max. outbound read queue depth */ + int max_ird; /* max. inbound read queue depth */ + int max_sge; + int max_sge_rd; + int max_cq; + int max_cqe; + int max_mr; + int max_pd; + int max_mw; + int max_fmr; + int max_srq; + int max_srq_wr; + int max_srq_sge; +}; + +struct siw_pd { + struct ib_pd base_pd; +}; + +struct siw_device { + struct ib_device base_dev; + struct net_device *netdev; + struct siw_dev_cap attrs; + + u32 vendor_part_id; + int numa_node; + + /* physical port state (only one port per device) */ + enum ib_port_state state; + + spinlock_t lock; + + struct xarray qp_xa; + struct xarray mem_xa; + + struct list_head cep_list; + struct list_head qp_list; + + /* active objects statistics to enforce limits */ + atomic_t num_qp; + atomic_t num_cq; + atomic_t num_pd; + atomic_t num_mr; + atomic_t num_srq; + atomic_t num_ctx; + + struct work_struct netdev_down; +}; + +struct siw_uobj { + void *addr; + u32 size; +}; + +struct siw_ucontext { + struct ib_ucontext base_ucontext; + struct siw_device *sdev; + + /* xarray of user mappable objects */ + struct xarray xa; + u32 uobj_nextkey; +}; + +/* + * The RDMA core does not define LOCAL_READ access, which is always + * enabled implictely. + */ +#define IWARP_ACCESS_MASK \ + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | \ + IB_ACCESS_REMOTE_READ) + +/* + * siw presentation of user memory registered as source + * or target of RDMA operations. + */ + +struct siw_page_chunk { + struct page **plist; +}; + +struct siw_umem { + struct siw_page_chunk *page_chunk; + int num_pages; + bool writable; + u64 fp_addr; /* First page base address */ + struct mm_struct *owning_mm; +}; + +struct siw_pble { + u64 addr; /* Address of assigned user buffer */ + u64 size; /* Size of this entry */ + u64 pbl_off; /* Total offset from start of PBL */ +}; + +struct siw_pbl { + unsigned int num_buf; + unsigned int max_buf; + struct siw_pble pbe[1]; +}; + +struct siw_mr; + +/* + * Generic memory representation for registered siw memory. + * Memory lookup always via higher 24 bit of STag (STag index). 
+ */ +struct siw_mem { + struct siw_device *sdev; + struct kref ref; + u64 va; /* VA of memory */ + u64 len; /* length of the memory buffer in bytes */ + u32 stag; /* iWarp memory access steering tag */ + u8 stag_valid; /* VALID or INVALID */ + u8 is_pbl; /* PBL or user space mem */ + u8 is_mw; /* Memory Region or Memory Window */ + enum ib_access_flags perms; /* local/remote READ & WRITE */ + union { + struct siw_umem *umem; + struct siw_pbl *pbl; + void *mem_obj; + }; + struct ib_pd *pd; +}; + +struct siw_mr { + struct ib_mr base_mr; + struct siw_mem *mem; + struct rcu_head rcu; +}; + +/* + * Error codes for local or remote + * access to registered memory + */ +enum siw_access_state { + E_ACCESS_OK, + E_STAG_INVALID, + E_BASE_BOUNDS, + E_ACCESS_PERM, + E_PD_MISMATCH +}; + +enum siw_wr_state { + SIW_WR_IDLE, + SIW_WR_QUEUED, /* processing has not started yet */ + SIW_WR_INPROGRESS /* initiated processing of the WR */ +}; + +/* The WQE currently being processed (RX or TX) */ +struct siw_wqe { + /* Copy of applications SQE or RQE */ + union { + struct siw_sqe sqe; + struct siw_rqe rqe; + }; + struct siw_mem *mem[SIW_MAX_SGE]; /* per sge's resolved mem */ + enum siw_wr_state wr_status; + enum siw_wc_status wc_status; + u32 bytes; /* total bytes to process */ + u32 processed; /* bytes processed */ +}; + +struct siw_cq { + struct ib_cq base_cq; + spinlock_t lock; + u64 *notify; + struct siw_cqe *queue; + u32 cq_put; + u32 cq_get; + u32 num_cqe; + bool kernel_verbs; + u32 xa_cq_index; /* mmap information for CQE array */ + u32 id; /* For debugging only */ +}; + +enum siw_qp_state { + SIW_QP_STATE_IDLE, + SIW_QP_STATE_RTR, + SIW_QP_STATE_RTS, + SIW_QP_STATE_CLOSING, + SIW_QP_STATE_TERMINATE, + SIW_QP_STATE_ERROR, + SIW_QP_STATE_COUNT +}; + +enum siw_qp_flags { + SIW_RDMA_BIND_ENABLED = (1 << 0), + SIW_RDMA_WRITE_ENABLED = (1 << 1), + SIW_RDMA_READ_ENABLED = (1 << 2), + SIW_SIGNAL_ALL_WR = (1 << 3), + SIW_MPA_CRC = (1 << 4), + SIW_QP_IN_DESTROY = (1 << 5) +}; + +enum siw_qp_attr_mask { + SIW_QP_ATTR_STATE = (1 << 0), + SIW_QP_ATTR_ACCESS_FLAGS = (1 << 1), + SIW_QP_ATTR_LLP_HANDLE = (1 << 2), + SIW_QP_ATTR_ORD = (1 << 3), + SIW_QP_ATTR_IRD = (1 << 4), + SIW_QP_ATTR_SQ_SIZE = (1 << 5), + SIW_QP_ATTR_RQ_SIZE = (1 << 6), + SIW_QP_ATTR_MPA = (1 << 7) +}; + +struct siw_srq { + struct ib_srq base_srq; + spinlock_t lock; + u32 max_sge; + u32 limit; /* low watermark for async event */ + struct siw_rqe *recvq; + u32 rq_put; + u32 rq_get; + u32 num_rqe; /* max # of wqe's allowed */ + u32 xa_srq_index; /* mmap information for SRQ array */ + char armed; /* inform user if limit hit */ + char kernel_verbs; /* '1' if kernel client */ +}; + +struct siw_qp_attrs { + enum siw_qp_state state; + u32 sq_size; + u32 rq_size; + u32 orq_size; + u32 irq_size; + u32 sq_max_sges; + u32 rq_max_sges; + enum siw_qp_flags flags; + + struct socket *sk; +}; + +enum siw_tx_ctx { + SIW_SEND_HDR, /* start or continue sending HDR */ + SIW_SEND_DATA, /* start or continue sending DDP payload */ + SIW_SEND_TRAILER, /* start or continue sending TRAILER */ + SIW_SEND_SHORT_FPDU/* send whole FPDU hdr|data|trailer at once */ +}; + +enum siw_rx_state { + SIW_GET_HDR, /* await new hdr or within hdr */ + SIW_GET_DATA_START, /* start of inbound DDP payload */ + SIW_GET_DATA_MORE, /* continuation of (misaligned) DDP payload */ + SIW_GET_TRAILER/* await new trailer or within trailer */ +}; + +struct siw_rx_stream { + struct sk_buff *skb; + int skb_new; /* pending unread bytes in skb */ + int skb_offset; /* offset in skb */ + int skb_copied; /* 
processed bytes in skb */ + + union iwarp_hdr hdr; + struct mpa_trailer trailer; + + enum siw_rx_state state; + + /* + * For each FPDU, main RX loop runs through 3 stages: + * Receiving protocol headers, placing DDP payload and receiving + * trailer information (CRC + possibly padding). + * Next two variables keep state on receive status of the + * current FPDU part (hdr, data, trailer). + */ + int fpdu_part_rcvd; /* bytes in pkt part copied */ + int fpdu_part_rem; /* bytes in pkt part not seen */ + + /* + * Next expected DDP MSN for each QN + + * expected steering tag + + * expected DDP tagget offset (all HBO) + */ + u32 ddp_msn[RDMAP_UNTAGGED_QN_COUNT]; + u32 ddp_stag; + u64 ddp_to; + u32 inval_stag; /* Stag to be invalidated */ + + struct shash_desc *mpa_crc_hd; + u8 rx_suspend : 1; + u8 pad : 2; /* # of pad bytes expected */ + u8 rdmap_op : 4; /* opcode of current frame */ +}; + +struct siw_rx_fpdu { + /* + * Local destination memory of inbound RDMA operation. + * Valid, according to wqe->wr_status + */ + struct siw_wqe wqe_active; + + unsigned int pbl_idx; /* Index into current PBL */ + unsigned int sge_idx; /* current sge in rx */ + unsigned int sge_off; /* already rcvd in curr. sge */ + + char first_ddp_seg; /* this is the first DDP seg */ + char more_ddp_segs; /* more DDP segs expected */ + u8 prev_rdmap_op : 4; /* opcode of prev frame */ +}; + +/* + * Shorthands for short packets w/o payload + * to be transmitted more efficient. + */ +struct siw_send_pkt { + struct iwarp_send send; + __be32 crc; +}; + +struct siw_write_pkt { + struct iwarp_rdma_write write; + __be32 crc; +}; + +struct siw_rreq_pkt { + struct iwarp_rdma_rreq rreq; + __be32 crc; +}; + +struct siw_rresp_pkt { + struct iwarp_rdma_rresp rresp; + __be32 crc; +}; + +struct siw_iwarp_tx { + union { + union iwarp_hdr hdr; + + /* Generic part of FPDU header */ + struct iwarp_ctrl ctrl; + struct iwarp_ctrl_untagged c_untagged; + struct iwarp_ctrl_tagged c_tagged; + + /* FPDU headers */ + struct iwarp_rdma_write rwrite; + struct iwarp_rdma_rreq rreq; + struct iwarp_rdma_rresp rresp; + struct iwarp_terminate terminate; + struct iwarp_send send; + struct iwarp_send_inv send_inv; + + /* complete short FPDUs */ + struct siw_send_pkt send_pkt; + struct siw_write_pkt write_pkt; + struct siw_rreq_pkt rreq_pkt; + struct siw_rresp_pkt rresp_pkt; + } pkt; + + struct mpa_trailer trailer; + /* DDP MSN for untagged messages */ + u32 ddp_msn[RDMAP_UNTAGGED_QN_COUNT]; + + enum siw_tx_ctx state; + u16 ctrl_len; /* ddp+rdmap hdr */ + u16 ctrl_sent; + int burst; + int bytes_unsent; /* ddp payload bytes */ + + struct shash_desc *mpa_crc_hd; + + u8 do_crc : 1; /* do crc for segment */ + u8 use_sendpage : 1; /* send w/o copy */ + u8 tx_suspend : 1; /* stop sending DDP segs. */ + u8 pad : 2; /* # pad in current fpdu */ + u8 orq_fence : 1; /* ORQ full or Send fenced */ + u8 in_syscall : 1; /* TX out of user context */ + u8 zcopy_tx : 1; /* Use TCP_SENDPAGE if possible */ + u8 gso_seg_limit; /* Maximum segments for GSO, 0 = unbound */ + + u16 fpdu_len; /* len of FPDU to tx */ + unsigned int tcp_seglen; /* remaining tcp seg space */ + + struct siw_wqe wqe_active; + + int pbl_idx; /* Index into current PBL */ + int sge_idx; /* current sge in tx */ + u32 sge_off; /* already sent in curr. 
sge */ +}; + +struct siw_qp { + struct siw_device *sdev; + struct ib_qp *ib_qp; + struct kref ref; + u32 qp_num; + struct list_head devq; + int tx_cpu; + bool kernel_verbs; + struct siw_qp_attrs attrs; + + struct siw_cep *cep; + struct rw_semaphore state_lock; + + struct ib_pd *pd; + struct siw_cq *scq; + struct siw_cq *rcq; + struct siw_srq *srq; + + struct siw_iwarp_tx tx_ctx; /* Transmit context */ + spinlock_t sq_lock; + struct siw_sqe *sendq; /* send queue element array */ + uint32_t sq_get; /* consumer index into sq array */ + uint32_t sq_put; /* kernel prod. index into sq array */ + struct llist_node tx_list; + + struct siw_sqe *orq; /* outbound read queue element array */ + spinlock_t orq_lock; + uint32_t orq_get; /* consumer index into orq array */ + uint32_t orq_put; /* shared producer index for ORQ */ + + struct siw_rx_stream rx_stream; + struct siw_rx_fpdu *rx_fpdu; + struct siw_rx_fpdu rx_tagged; + struct siw_rx_fpdu rx_untagged; + spinlock_t rq_lock; + struct siw_rqe *recvq; /* recv queue element array */ + uint32_t rq_get; /* consumer index into rq array */ + uint32_t rq_put; /* kernel prod. index into rq array */ + + struct siw_sqe *irq; /* inbound read queue element array */ + uint32_t irq_get; /* consumer index into irq array */ + uint32_t irq_put; /* producer index into irq array */ + int irq_burst; + + struct { /* information to be carried in TERMINATE pkt, if valid */ + u8 valid; + u8 in_tx; + u8 layer : 4, etype : 4; + u8 ecode; + } term_info; + u32 xa_sq_index; /* mmap information for SQE array */ + u32 xa_rq_index; /* mmap information for RQE array */ + struct rcu_head rcu; +}; + +struct siw_base_qp { + struct ib_qp base_qp; + struct siw_qp *qp; +}; + +/* helper macros */ +#define rx_qp(rx) container_of(rx, struct siw_qp, rx_stream) +#define tx_qp(tx) container_of(tx, struct siw_qp, tx_ctx) +#define tx_wqe(qp) (&(qp)->tx_ctx.wqe_active) +#define rx_wqe(rctx) (&(rctx)->wqe_active) +#define rx_mem(rctx) ((rctx)->wqe_active.mem[0]) +#define tx_type(wqe) ((wqe)->sqe.opcode) +#define rx_type(wqe) ((wqe)->rqe.opcode) +#define tx_flags(wqe) ((wqe)->sqe.flags) + +struct iwarp_msg_info { + int hdr_len; + struct iwarp_ctrl ctrl; + int (*rx_data)(struct siw_qp *qp); +}; + +/* Global siw parameters. 
Currently set in siw_main.c */ +extern const bool zcopy_tx; +extern const bool try_gso; +extern const bool loopback_enabled; +extern const bool mpa_crc_required; +extern const bool mpa_crc_strict; +extern const bool siw_tcp_nagle; +extern u_char mpa_version; +extern const bool peer_to_peer; +extern struct task_struct *siw_tx_thread[]; + +extern struct crypto_shash *siw_crypto_shash; +extern struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1]; + +/* QP general functions */ +int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attr, + enum siw_qp_attr_mask mask); +int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl); +void siw_qp_llp_close(struct siw_qp *qp); +void siw_qp_cm_drop(struct siw_qp *qp, int schedule); +void siw_send_terminate(struct siw_qp *qp); + +void siw_qp_get_ref(struct ib_qp *qp); +void siw_qp_put_ref(struct ib_qp *qp); +int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp); +void siw_free_qp(struct kref *ref); + +void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer, + u8 etype, u8 ecode, int in_tx); +enum ddp_ecode siw_tagged_error(enum siw_access_state state); +enum rdmap_ecode siw_rdmap_error(enum siw_access_state state); + +void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe); +int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes, + enum siw_wc_status status); +int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes, + u32 inval_stag, enum siw_wc_status status); +void siw_qp_llp_data_ready(struct sock *sk); +void siw_qp_llp_write_space(struct sock *sk); + +/* QP TX path functions */ +int siw_run_sq(void *arg); +int siw_qp_sq_process(struct siw_qp *qp); +int siw_sq_start(struct siw_qp *qp); +int siw_activate_tx(struct siw_qp *qp); +void siw_stop_tx_thread(int nr_cpu); +int siw_get_tx_cpu(struct siw_device *sdev); +void siw_put_tx_cpu(int cpu); + +/* QP RX path functions */ +int siw_proc_send(struct siw_qp *qp); +int siw_proc_rreq(struct siw_qp *qp); +int siw_proc_rresp(struct siw_qp *qp); +int siw_proc_write(struct siw_qp *qp); +int siw_proc_terminate(struct siw_qp *qp); + +int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int off, size_t len); + +static inline void set_rx_fpdu_context(struct siw_qp *qp, u8 opcode) +{ + if (opcode == RDMAP_RDMA_WRITE || opcode == RDMAP_RDMA_READ_RESP) + qp->rx_fpdu = &qp->rx_tagged; + else + qp->rx_fpdu = &qp->rx_untagged; + + qp->rx_stream.rdmap_op = opcode; +} + +static inline struct siw_ucontext *to_siw_ctx(struct ib_ucontext *base_ctx) +{ + return container_of(base_ctx, struct siw_ucontext, base_ucontext); +} + +static inline struct siw_base_qp *to_siw_base_qp(struct ib_qp *base_qp) +{ + return container_of(base_qp, struct siw_base_qp, base_qp); +} + +static inline struct siw_qp *to_siw_qp(struct ib_qp *base_qp) +{ + return to_siw_base_qp(base_qp)->qp; +} + +static inline struct siw_cq *to_siw_cq(struct ib_cq *base_cq) +{ + return container_of(base_cq, struct siw_cq, base_cq); +} + +static inline struct siw_srq *to_siw_srq(struct ib_srq *base_srq) +{ + return container_of(base_srq, struct siw_srq, base_srq); +} + +static inline struct siw_device *to_siw_dev(struct ib_device *base_dev) +{ + return container_of(base_dev, struct siw_device, base_dev); +} + +static inline struct siw_mr *to_siw_mr(struct ib_mr *base_mr) +{ + return container_of(base_mr, struct siw_mr, base_mr); +} + +static inline struct siw_qp *siw_qp_id2obj(struct siw_device *sdev, int id) +{ + struct siw_qp *qp; + + rcu_read_lock(); + qp = xa_load(&sdev->qp_xa, 
id); + if (likely(qp && kref_get_unless_zero(&qp->ref))) { + rcu_read_unlock(); + return qp; + } + rcu_read_unlock(); + return NULL; +} + +static inline u32 qp_id(struct siw_qp *qp) +{ + return qp->qp_num; +} + +static inline void siw_qp_get(struct siw_qp *qp) +{ + kref_get(&qp->ref); +} + +static inline void siw_qp_put(struct siw_qp *qp) +{ + kref_put(&qp->ref, siw_free_qp); +} + +static inline int siw_sq_empty(struct siw_qp *qp) +{ + struct siw_sqe *sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size]; + + return READ_ONCE(sqe->flags) == 0; +} + +static inline struct siw_sqe *sq_get_next(struct siw_qp *qp) +{ + struct siw_sqe *sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size]; + + if (READ_ONCE(sqe->flags) & SIW_WQE_VALID) + return sqe; + + return NULL; +} + +static inline struct siw_sqe *orq_get_current(struct siw_qp *qp) +{ + return &qp->orq[qp->orq_get % qp->attrs.orq_size]; +} + +static inline struct siw_sqe *orq_get_tail(struct siw_qp *qp) +{ + return &qp->orq[qp->orq_put % qp->attrs.orq_size]; +} + +static inline struct siw_sqe *orq_get_free(struct siw_qp *qp) +{ + struct siw_sqe *orq_e = orq_get_tail(qp); + + if (orq_e && READ_ONCE(orq_e->flags) == 0) + return orq_e; + + return NULL; +} + +static inline int siw_orq_empty(struct siw_qp *qp) +{ + return qp->orq[qp->orq_get % qp->attrs.orq_size].flags == 0 ? 1 : 0; +} + +static inline struct siw_sqe *irq_alloc_free(struct siw_qp *qp) +{ + struct siw_sqe *irq_e = &qp->irq[qp->irq_put % qp->attrs.irq_size]; + + if (READ_ONCE(irq_e->flags) == 0) { + qp->irq_put++; + return irq_e; + } + return NULL; +} + +static inline __wsum siw_csum_update(const void *buff, int len, __wsum sum) +{ + return (__force __wsum)crc32c((__force __u32)sum, buff, len); +} + +static inline __wsum siw_csum_combine(__wsum csum, __wsum csum2, int offset, + int len) +{ + return (__force __wsum)__crc32c_le_combine((__force __u32)csum, + (__force __u32)csum2, len); +} + +static inline void siw_crc_skb(struct siw_rx_stream *srx, unsigned int len) +{ + const struct skb_checksum_ops siw_cs_ops = { + .update = siw_csum_update, + .combine = siw_csum_combine, + }; + __wsum crc = *(u32 *)shash_desc_ctx(srx->mpa_crc_hd); + + crc = __skb_checksum(srx->skb, srx->skb_offset, len, crc, + &siw_cs_ops); + *(u32 *)shash_desc_ctx(srx->mpa_crc_hd) = crc; +} + +#define siw_dbg(ibdev, fmt, ...) \ + ibdev_dbg(ibdev, "%s: " fmt, __func__, ##__VA_ARGS__) + +#define siw_dbg_qp(qp, fmt, ...) \ + ibdev_dbg(&qp->sdev->base_dev, "QP[%u] %s: " fmt, qp_id(qp), __func__, \ + ##__VA_ARGS__) + +#define siw_dbg_cq(cq, fmt, ...) \ + ibdev_dbg(cq->base_cq.device, "CQ[%u] %s: " fmt, cq->id, __func__, \ + ##__VA_ARGS__) + +#define siw_dbg_pd(pd, fmt, ...) \ + ibdev_dbg(pd->device, "PD[%u] %s: " fmt, pd->res.id, __func__, \ + ##__VA_ARGS__) + +#define siw_dbg_mem(mem, fmt, ...) \ + ibdev_dbg(&mem->sdev->base_dev, \ + "MEM[0x%08x] %s: " fmt, mem->stag, __func__, ##__VA_ARGS__) + +#define siw_dbg_cep(cep, fmt, ...) \ + ibdev_dbg(&cep->sdev->base_dev, "CEP[0x%p] %s: " fmt, \ + cep, __func__, ##__VA_ARGS__) + +void siw_cq_flush(struct siw_cq *cq); +void siw_sq_flush(struct siw_qp *qp); +void siw_rq_flush(struct siw_qp *qp); +int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc); + +#endif -- cgit v1.2.3 From bdcf26bf9b3acb03c8f90387cfc6474fc8ac5521 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:25 +0200 Subject: rdma/siw: network and RDMA core interface Broken up commit to add the Soft iWarp RDMA driver. 
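As an aside for readers of this series: the QP lookup helpers defined in siw.h above are reference counted, and the core-interface code added here (for example siw_get_base_qp()) depends on that pairing. A minimal, purely illustrative caller — not part of the patch, the function name is invented — would look like this:

static void example_touch_qp(struct siw_device *sdev, int id)
{
	struct siw_qp *qp = siw_qp_id2obj(sdev, id);	/* lookup takes a reference */

	if (!qp)
		return;		/* unknown id, or QP already being freed */

	pr_info("siw: QP[%u] state %d\n", qp_id(qp), qp->attrs.state);

	siw_qp_put(qp);		/* drop the reference taken by the lookup */
}

Every path that resolves a QP by id must drop the reference this way, since siw_qp_id2obj() only returns objects it could pin with kref_get_unless_zero().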
Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_main.c | 687 +++++++++++++++++++++++++++++++++++ 1 file changed, 687 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_main.c (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c new file mode 100644 index 000000000000..3f5f3d27ebe5 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -0,0 +1,687 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "siw.h" +#include "siw_verbs.h" + +MODULE_AUTHOR("Bernard Metzler"); +MODULE_DESCRIPTION("Software iWARP Driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +/* transmit from user buffer, if possible */ +const bool zcopy_tx = true; + +/* Restrict usage of GSO, if hardware peer iwarp is unable to process + * large packets. try_gso = true lets siw try to use local GSO, + * if peer agrees. Not using GSO severly limits siw maximum tx bandwidth. + */ +const bool try_gso; + +/* Attach siw also with loopback devices */ +const bool loopback_enabled = true; + +/* We try to negotiate CRC on, if true */ +const bool mpa_crc_required; + +/* MPA CRC on/off enforced */ +const bool mpa_crc_strict; + +/* Control TCP_NODELAY socket option */ +const bool siw_tcp_nagle; + +/* Select MPA version to be used during connection setup */ +u_char mpa_version = MPA_REVISION_2; + +/* Selects MPA P2P mode (additional handshake during connection + * setup, if true. + */ +const bool peer_to_peer; + +struct task_struct *siw_tx_thread[NR_CPUS]; +struct crypto_shash *siw_crypto_shash; + +static int siw_device_register(struct siw_device *sdev, const char *name) +{ + struct ib_device *base_dev = &sdev->base_dev; + static int dev_id = 1; + int rv; + + rv = ib_register_device(base_dev, name); + if (rv) { + pr_warn("siw: device registration error %d\n", rv); + return rv; + } + sdev->vendor_part_id = dev_id++; + + siw_dbg(base_dev, "HWaddr=%pM\n", sdev->netdev->dev_addr); + + return 0; +} + +static void siw_device_cleanup(struct ib_device *base_dev) +{ + struct siw_device *sdev = to_siw_dev(base_dev); + + xa_destroy(&sdev->qp_xa); + xa_destroy(&sdev->mem_xa); +} + +static int siw_create_tx_threads(void) +{ + int cpu, rv, assigned = 0; + + for_each_online_cpu(cpu) { + /* Skip HT cores */ + if (cpu % cpumask_weight(topology_sibling_cpumask(cpu))) + continue; + + siw_tx_thread[cpu] = + kthread_create(siw_run_sq, (unsigned long *)(long)cpu, + "siw_tx/%d", cpu); + if (IS_ERR(siw_tx_thread[cpu])) { + rv = PTR_ERR(siw_tx_thread[cpu]); + siw_tx_thread[cpu] = NULL; + pr_info("Creating TX thread for CPU %d failed", cpu); + continue; + } + kthread_bind(siw_tx_thread[cpu], cpu); + + wake_up_process(siw_tx_thread[cpu]); + assigned++; + } + return assigned; +} + +static int siw_dev_qualified(struct net_device *netdev) +{ + /* + * Additional hardware support can be added here + * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see + * for type identifiers. 
+ */ + if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 || + (netdev->type == ARPHRD_LOOPBACK && loopback_enabled)) + return 1; + + return 0; +} + +static DEFINE_PER_CPU(atomic_t, use_cnt = ATOMIC_INIT(0)); + +static struct { + struct cpumask **tx_valid_cpus; + int num_nodes; +} siw_cpu_info; + +static int siw_init_cpulist(void) +{ + int i, num_nodes = num_possible_nodes(); + + memset(siw_tx_thread, 0, sizeof(siw_tx_thread)); + + siw_cpu_info.num_nodes = num_nodes; + + siw_cpu_info.tx_valid_cpus = + kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL); + if (!siw_cpu_info.tx_valid_cpus) { + siw_cpu_info.num_nodes = 0; + return -ENOMEM; + } + for (i = 0; i < siw_cpu_info.num_nodes; i++) { + siw_cpu_info.tx_valid_cpus[i] = + kzalloc(sizeof(struct cpumask), GFP_KERNEL); + if (!siw_cpu_info.tx_valid_cpus[i]) + goto out_err; + + cpumask_clear(siw_cpu_info.tx_valid_cpus[i]); + } + for_each_possible_cpu(i) + cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]); + + return 0; + +out_err: + siw_cpu_info.num_nodes = 0; + while (i) { + kfree(siw_cpu_info.tx_valid_cpus[i]); + siw_cpu_info.tx_valid_cpus[i--] = NULL; + } + kfree(siw_cpu_info.tx_valid_cpus); + siw_cpu_info.tx_valid_cpus = NULL; + + return -ENOMEM; +} + +static void siw_destroy_cpulist(void) +{ + int i = 0; + + while (i < siw_cpu_info.num_nodes) + kfree(siw_cpu_info.tx_valid_cpus[i++]); + + kfree(siw_cpu_info.tx_valid_cpus); +} + +/* + * Choose CPU with least number of active QP's from NUMA node of + * TX interface. + */ +int siw_get_tx_cpu(struct siw_device *sdev) +{ + const struct cpumask *tx_cpumask; + int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1; + + if (node < 0) + tx_cpumask = cpu_online_mask; + else + tx_cpumask = siw_cpu_info.tx_valid_cpus[node]; + + num_cpus = cpumask_weight(tx_cpumask); + if (!num_cpus) { + /* no CPU on this NUMA node */ + tx_cpumask = cpu_online_mask; + num_cpus = cpumask_weight(tx_cpumask); + } + if (!num_cpus) + goto out; + + cpu = cpumask_first(tx_cpumask); + + for (i = 0, min_use = SIW_MAX_QP; i < num_cpus; + i++, cpu = cpumask_next(cpu, tx_cpumask)) { + int usage; + + /* Skip any cores which have no TX thread */ + if (!siw_tx_thread[cpu]) + continue; + + usage = atomic_read(&per_cpu(use_cnt, cpu)); + if (usage <= min_use) { + tx_cpu = cpu; + min_use = usage; + } + } + siw_dbg(&sdev->base_dev, + "tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use); + +out: + if (tx_cpu >= 0) + atomic_inc(&per_cpu(use_cnt, tx_cpu)); + else + pr_warn("siw: no tx cpu found\n"); + + return tx_cpu; +} + +void siw_put_tx_cpu(int cpu) +{ + atomic_dec(&per_cpu(use_cnt, cpu)); +} + +static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id) +{ + struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id); + + if (qp) { + /* + * siw_qp_id2obj() increments object reference count + */ + siw_qp_put(qp); + return qp->ib_qp; + } + return NULL; +} + +static void siw_verbs_sq_flush(struct ib_qp *base_qp) +{ + struct siw_qp *qp = to_siw_qp(base_qp); + + down_write(&qp->state_lock); + siw_sq_flush(qp); + up_write(&qp->state_lock); +} + +static void siw_verbs_rq_flush(struct ib_qp *base_qp) +{ + struct siw_qp *qp = to_siw_qp(base_qp); + + down_write(&qp->state_lock); + siw_rq_flush(qp); + up_write(&qp->state_lock); +} + +static const struct ib_device_ops siw_device_ops = { + .owner = THIS_MODULE, + .uverbs_abi_ver = SIW_ABI_VERSION, + .driver_id = RDMA_DRIVER_SIW, + + .alloc_mr = siw_alloc_mr, + .alloc_pd = siw_alloc_pd, + .alloc_ucontext = siw_alloc_ucontext, + .create_cq 
= siw_create_cq, + .create_qp = siw_create_qp, + .create_srq = siw_create_srq, + .dealloc_driver = siw_device_cleanup, + .dealloc_pd = siw_dealloc_pd, + .dealloc_ucontext = siw_dealloc_ucontext, + .dereg_mr = siw_dereg_mr, + .destroy_cq = siw_destroy_cq, + .destroy_qp = siw_destroy_qp, + .destroy_srq = siw_destroy_srq, + .drain_rq = siw_verbs_rq_flush, + .drain_sq = siw_verbs_sq_flush, + .get_dma_mr = siw_get_dma_mr, + .get_port_immutable = siw_get_port_immutable, + .iw_accept = siw_accept, + .iw_add_ref = siw_qp_get_ref, + .iw_connect = siw_connect, + .iw_create_listen = siw_create_listen, + .iw_destroy_listen = siw_destroy_listen, + .iw_get_qp = siw_get_base_qp, + .iw_reject = siw_reject, + .iw_rem_ref = siw_qp_put_ref, + .map_mr_sg = siw_map_mr_sg, + .mmap = siw_mmap, + .modify_qp = siw_verbs_modify_qp, + .modify_srq = siw_modify_srq, + .poll_cq = siw_poll_cq, + .post_recv = siw_post_receive, + .post_send = siw_post_send, + .post_srq_recv = siw_post_srq_recv, + .query_device = siw_query_device, + .query_gid = siw_query_gid, + .query_pkey = siw_query_pkey, + .query_port = siw_query_port, + .query_qp = siw_query_qp, + .query_srq = siw_query_srq, + .req_notify_cq = siw_req_notify_cq, + .reg_user_mr = siw_reg_user_mr, + + INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq), + INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd), + INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq), + INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext), +}; + +static struct siw_device *siw_device_create(struct net_device *netdev) +{ + struct siw_device *sdev = NULL; + struct ib_device *base_dev; + struct device *parent = netdev->dev.parent; + int rv; + + if (!parent) { + /* + * The loopback device has no parent device, + * so it appears as a top-level device. To support + * loopback device connectivity, take this device + * as the parent device. Skip all other devices + * w/o parent device. 
+ */ + if (netdev->type != ARPHRD_LOOPBACK) { + pr_warn("siw: device %s error: no parent device\n", + netdev->name); + return NULL; + } + parent = &netdev->dev; + } + sdev = ib_alloc_device(siw_device, base_dev); + if (!sdev) + return NULL; + + base_dev = &sdev->base_dev; + + sdev->netdev = netdev; + + if (netdev->type != ARPHRD_LOOPBACK) { + memcpy(&base_dev->node_guid, netdev->dev_addr, 6); + } else { + /* + * The loopback device does not have a HW address, + * but connection mangagement lib expects gid != 0 + */ + size_t gidlen = min_t(size_t, strlen(base_dev->name), 6); + + memcpy(&base_dev->node_guid, base_dev->name, gidlen); + } + base_dev->uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ); + + base_dev->node_type = RDMA_NODE_RNIC; + memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON, + sizeof(SIW_NODE_DESC_COMMON)); + + /* + * Current model (one-to-one device association): + * One Softiwarp device per net_device or, equivalently, + * per physical port. + */ + base_dev->phys_port_cnt = 1; + base_dev->dev.parent = parent; + base_dev->dev.dma_ops = &dma_virt_ops; + base_dev->num_comp_vectors = num_possible_cpus(); + + ib_set_device_ops(base_dev, &siw_device_ops); + rv = ib_device_set_netdev(base_dev, netdev, 1); + if (rv) + goto error; + + memcpy(base_dev->iw_ifname, netdev->name, + sizeof(base_dev->iw_ifname)); + + /* Disable TCP port mapping */ + base_dev->iw_driver_flags = IW_F_NO_PORT_MAP, + + sdev->attrs.max_qp = SIW_MAX_QP; + sdev->attrs.max_qp_wr = SIW_MAX_QP_WR; + sdev->attrs.max_ord = SIW_MAX_ORD_QP; + sdev->attrs.max_ird = SIW_MAX_IRD_QP; + sdev->attrs.max_sge = SIW_MAX_SGE; + sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD; + sdev->attrs.max_cq = SIW_MAX_CQ; + sdev->attrs.max_cqe = SIW_MAX_CQE; + sdev->attrs.max_mr = SIW_MAX_MR; + sdev->attrs.max_pd = SIW_MAX_PD; + sdev->attrs.max_mw = SIW_MAX_MW; + sdev->attrs.max_fmr = SIW_MAX_FMR; + sdev->attrs.max_srq = SIW_MAX_SRQ; + sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR; + sdev->attrs.max_srq_sge = SIW_MAX_SGE; + + xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1); + xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1); + + INIT_LIST_HEAD(&sdev->cep_list); + INIT_LIST_HEAD(&sdev->qp_list); + + atomic_set(&sdev->num_ctx, 0); + atomic_set(&sdev->num_srq, 0); + atomic_set(&sdev->num_qp, 0); + atomic_set(&sdev->num_cq, 0); + atomic_set(&sdev->num_mr, 0); + atomic_set(&sdev->num_pd, 0); + + sdev->numa_node = dev_to_node(parent); + spin_lock_init(&sdev->lock); + + return sdev; +error: + ib_dealloc_device(base_dev); + + return NULL; +} + +/* + * Network link becomes unavailable. Mark all + * affected QP's accordingly. 
+ */ +static void siw_netdev_down(struct work_struct *work) +{ + struct siw_device *sdev = + container_of(work, struct siw_device, netdev_down); + + struct siw_qp_attrs qp_attrs; + struct list_head *pos, *tmp; + + memset(&qp_attrs, 0, sizeof(qp_attrs)); + qp_attrs.state = SIW_QP_STATE_ERROR; + + list_for_each_safe(pos, tmp, &sdev->qp_list) { + struct siw_qp *qp = list_entry(pos, struct siw_qp, devq); + + down_write(&qp->state_lock); + WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE)); + up_write(&qp->state_lock); + } + ib_device_put(&sdev->base_dev); +} + +static void siw_device_goes_down(struct siw_device *sdev) +{ + if (ib_device_try_get(&sdev->base_dev)) { + INIT_WORK(&sdev->netdev_down, siw_netdev_down); + schedule_work(&sdev->netdev_down); + } +} + +static int siw_netdev_event(struct notifier_block *nb, unsigned long event, + void *arg) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(arg); + struct ib_device *base_dev; + struct siw_device *sdev; + + dev_dbg(&netdev->dev, "siw: event %lu\n", event); + + if (dev_net(netdev) != &init_net) + return NOTIFY_OK; + + base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); + if (!base_dev) + return NOTIFY_OK; + + sdev = to_siw_dev(base_dev); + + switch (event) { + case NETDEV_UP: + sdev->state = IB_PORT_ACTIVE; + siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); + break; + + case NETDEV_GOING_DOWN: + siw_device_goes_down(sdev); + break; + + case NETDEV_DOWN: + sdev->state = IB_PORT_DOWN; + siw_port_event(sdev, 1, IB_EVENT_PORT_ERR); + break; + + case NETDEV_REGISTER: + /* + * Device registration now handled only by + * rdma netlink commands. So it shall be impossible + * to end up here with a valid siw device. + */ + siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n"); + break; + + case NETDEV_UNREGISTER: + ib_unregister_device_queued(&sdev->base_dev); + break; + + case NETDEV_CHANGEADDR: + siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE); + break; + /* + * Todo: Below netdev events are currently not handled. + */ + case NETDEV_CHANGEMTU: + case NETDEV_CHANGE: + break; + + default: + break; + } + ib_device_put(&sdev->base_dev); + + return NOTIFY_OK; +} + +static struct notifier_block siw_netdev_nb = { + .notifier_call = siw_netdev_event, +}; + +static int siw_newlink(const char *basedev_name, struct net_device *netdev) +{ + struct ib_device *base_dev; + struct siw_device *sdev = NULL; + int rv = -ENOMEM; + + if (!siw_dev_qualified(netdev)) + return -EINVAL; + + base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); + if (base_dev) { + ib_device_put(base_dev); + return -EEXIST; + } + sdev = siw_device_create(netdev); + if (sdev) { + dev_dbg(&netdev->dev, "siw: new device\n"); + + if (netif_running(netdev) && netif_carrier_ok(netdev)) + sdev->state = IB_PORT_ACTIVE; + else + sdev->state = IB_PORT_DOWN; + + rv = siw_device_register(sdev, basedev_name); + if (rv) + ib_dealloc_device(&sdev->base_dev); + } + return rv; +} + +static struct rdma_link_ops siw_link_ops = { + .type = "siw", + .newlink = siw_newlink, +}; + +/* + * siw_init_module - Initialize Softiwarp module and register with netdev + * subsystem. 
+ */ +static __init int siw_init_module(void) +{ + int rv; + int nr_cpu; + + if (SENDPAGE_THRESH < SIW_MAX_INLINE) { + pr_info("siw: sendpage threshold too small: %u\n", + (int)SENDPAGE_THRESH); + rv = -EINVAL; + goto out_error; + } + rv = siw_init_cpulist(); + if (rv) + goto out_error; + + rv = siw_cm_init(); + if (rv) + goto out_error; + + if (!siw_create_tx_threads()) { + pr_info("siw: Could not start any TX thread\n"); + goto out_error; + } + /* + * Locate CRC32 algorithm. If unsuccessful, fail + * loading siw only, if CRC is required. + */ + siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(siw_crypto_shash)) { + pr_info("siw: Loading CRC32c failed: %ld\n", + PTR_ERR(siw_crypto_shash)); + siw_crypto_shash = NULL; + if (mpa_crc_required) { + rv = -EOPNOTSUPP; + goto out_error; + } + } + rv = register_netdevice_notifier(&siw_netdev_nb); + if (rv) + goto out_error; + + rdma_link_register(&siw_link_ops); + + pr_info("SoftiWARP attached\n"); + return 0; + +out_error: + for (nr_cpu = 0; nr_cpu < nr_cpu_ids; nr_cpu++) { + if (siw_tx_thread[nr_cpu]) { + siw_stop_tx_thread(nr_cpu); + siw_tx_thread[nr_cpu] = NULL; + } + } + if (siw_crypto_shash) + crypto_free_shash(siw_crypto_shash); + + pr_info("SoftIWARP attach failed. Error: %d\n", rv); + + siw_cm_exit(); + siw_destroy_cpulist(); + + return rv; +} + +static void __exit siw_exit_module(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (siw_tx_thread[cpu]) { + siw_stop_tx_thread(cpu); + siw_tx_thread[cpu] = NULL; + } + } + unregister_netdevice_notifier(&siw_netdev_nb); + rdma_link_unregister(&siw_link_ops); + ib_unregister_driver(RDMA_DRIVER_SIW); + + siw_cm_exit(); + + siw_destroy_cpulist(); + + if (siw_crypto_shash) + crypto_free_shash(siw_crypto_shash); + + pr_info("SoftiWARP detached\n"); +} + +module_init(siw_init_module); +module_exit(siw_exit_module); + +MODULE_ALIAS_RDMA_LINK("siw"); -- cgit v1.2.3 From 6c52fdc244b5ccc468006fd65a504d4ee33743c7 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:26 +0200 Subject: rdma/siw: connection management Broken up commit to add the Soft iWarp RDMA driver. Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_cm.c | 2072 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/sw/siw/siw_cm.h | 133 +++ 2 files changed, 2205 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_cm.c create mode 100644 drivers/infiniband/sw/siw/siw_cm.h (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c new file mode 100644 index 000000000000..8e618cb7261f --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -0,0 +1,2072 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler */ +/* Fredy Neeser */ +/* Greg Joyce */ +/* Copyright (c) 2008-2019, IBM Corporation */ +/* Copyright (c) 2017, Open Grid Computing, Inc. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "siw.h" +#include "siw_cm.h" + +/* + * Set to any combination of + * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR + */ +static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR; +static const bool relaxed_ird_negotiation = 1; + +static void siw_cm_llp_state_change(struct sock *s); +static void siw_cm_llp_data_ready(struct sock *s); +static void siw_cm_llp_write_space(struct sock *s); +static void siw_cm_llp_error_report(struct sock *s); +static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, + int status); + +static void siw_sk_assign_cm_upcalls(struct sock *sk) +{ + write_lock_bh(&sk->sk_callback_lock); + sk->sk_state_change = siw_cm_llp_state_change; + sk->sk_data_ready = siw_cm_llp_data_ready; + sk->sk_write_space = siw_cm_llp_write_space; + sk->sk_error_report = siw_cm_llp_error_report; + write_unlock_bh(&sk->sk_callback_lock); +} + +static void siw_sk_save_upcalls(struct sock *sk) +{ + struct siw_cep *cep = sk_to_cep(sk); + + write_lock_bh(&sk->sk_callback_lock); + cep->sk_state_change = sk->sk_state_change; + cep->sk_data_ready = sk->sk_data_ready; + cep->sk_write_space = sk->sk_write_space; + cep->sk_error_report = sk->sk_error_report; + write_unlock_bh(&sk->sk_callback_lock); +} + +static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep) +{ + sk->sk_state_change = cep->sk_state_change; + sk->sk_data_ready = cep->sk_data_ready; + sk->sk_write_space = cep->sk_write_space; + sk->sk_error_report = cep->sk_error_report; + sk->sk_user_data = NULL; +} + +static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp) +{ + struct socket *s = cep->sock; + struct sock *sk = s->sk; + + write_lock_bh(&sk->sk_callback_lock); + + qp->attrs.sk = s; + sk->sk_data_ready = siw_qp_llp_data_ready; + sk->sk_write_space = siw_qp_llp_write_space; + + write_unlock_bh(&sk->sk_callback_lock); +} + +static void siw_socket_disassoc(struct socket *s) +{ + struct sock *sk = s->sk; + struct siw_cep *cep; + + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + cep = sk_to_cep(sk); + if (cep) { + siw_sk_restore_upcalls(sk, cep); + siw_cep_put(cep); + } else { + pr_warn("siw: cannot restore sk callbacks: no ep\n"); + } + write_unlock_bh(&sk->sk_callback_lock); + } else { + pr_warn("siw: cannot restore sk callbacks: no sk\n"); + } +} + +static void siw_rtr_data_ready(struct sock *sk) +{ + struct siw_cep *cep; + struct siw_qp *qp = NULL; + read_descriptor_t rd_desc; + + read_lock(&sk->sk_callback_lock); + + cep = sk_to_cep(sk); + if (!cep) { + WARN(1, "No connection endpoint\n"); + goto out; + } + qp = sk_to_qp(sk); + + memset(&rd_desc, 0, sizeof(rd_desc)); + rd_desc.arg.data = qp; + rd_desc.count = 1; + + tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data); + /* + * Check if first frame was successfully processed. + * Signal connection full establishment if yes. + * Failed data processing would have already scheduled + * connection drop. 
+ */ + if (!qp->rx_stream.rx_suspend) + siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); +out: + read_unlock(&sk->sk_callback_lock); + if (qp) + siw_qp_socket_assoc(cep, qp); +} + +static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep) +{ + struct sock *sk = cep->sock->sk; + + write_lock_bh(&sk->sk_callback_lock); + sk->sk_data_ready = siw_rtr_data_ready; + sk->sk_write_space = siw_qp_llp_write_space; + write_unlock_bh(&sk->sk_callback_lock); +} + +static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s) +{ + cep->sock = s; + siw_cep_get(cep); + s->sk->sk_user_data = cep; + + siw_sk_save_upcalls(s->sk); + siw_sk_assign_cm_upcalls(s->sk); +} + +static struct siw_cep *siw_cep_alloc(struct siw_device *sdev) +{ + struct siw_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL); + unsigned long flags; + + if (!cep) + return NULL; + + INIT_LIST_HEAD(&cep->listenq); + INIT_LIST_HEAD(&cep->devq); + INIT_LIST_HEAD(&cep->work_freelist); + + kref_init(&cep->ref); + cep->state = SIW_EPSTATE_IDLE; + init_waitqueue_head(&cep->waitq); + spin_lock_init(&cep->lock); + cep->sdev = sdev; + cep->enhanced_rdma_conn_est = false; + + spin_lock_irqsave(&sdev->lock, flags); + list_add_tail(&cep->devq, &sdev->cep_list); + spin_unlock_irqrestore(&sdev->lock, flags); + + siw_dbg_cep(cep, "new endpoint\n"); + return cep; +} + +static void siw_cm_free_work(struct siw_cep *cep) +{ + struct list_head *w, *tmp; + struct siw_cm_work *work; + + list_for_each_safe(w, tmp, &cep->work_freelist) { + work = list_entry(w, struct siw_cm_work, list); + list_del(&work->list); + kfree(work); + } +} + +static void siw_cancel_mpatimer(struct siw_cep *cep) +{ + spin_lock_bh(&cep->lock); + if (cep->mpa_timer) { + if (cancel_delayed_work(&cep->mpa_timer->work)) { + siw_cep_put(cep); + kfree(cep->mpa_timer); /* not needed again */ + } + cep->mpa_timer = NULL; + } + spin_unlock_bh(&cep->lock); +} + +static void siw_put_work(struct siw_cm_work *work) +{ + INIT_LIST_HEAD(&work->list); + spin_lock_bh(&work->cep->lock); + list_add(&work->list, &work->cep->work_freelist); + spin_unlock_bh(&work->cep->lock); +} + +static void siw_cep_set_inuse(struct siw_cep *cep) +{ + unsigned long flags; + int rv; +retry: + spin_lock_irqsave(&cep->lock, flags); + + if (cep->in_use) { + spin_unlock_irqrestore(&cep->lock, flags); + rv = wait_event_interruptible(cep->waitq, !cep->in_use); + if (signal_pending(current)) + flush_signals(current); + goto retry; + } else { + cep->in_use = 1; + spin_unlock_irqrestore(&cep->lock, flags); + } +} + +static void siw_cep_set_free(struct siw_cep *cep) +{ + unsigned long flags; + + spin_lock_irqsave(&cep->lock, flags); + cep->in_use = 0; + spin_unlock_irqrestore(&cep->lock, flags); + + wake_up(&cep->waitq); +} + +static void __siw_cep_dealloc(struct kref *ref) +{ + struct siw_cep *cep = container_of(ref, struct siw_cep, ref); + struct siw_device *sdev = cep->sdev; + unsigned long flags; + + WARN_ON(cep->listen_cep); + + /* kfree(NULL) is safe */ + kfree(cep->mpa.pdata); + spin_lock_bh(&cep->lock); + if (!list_empty(&cep->work_freelist)) + siw_cm_free_work(cep); + spin_unlock_bh(&cep->lock); + + spin_lock_irqsave(&sdev->lock, flags); + list_del(&cep->devq); + spin_unlock_irqrestore(&sdev->lock, flags); + + siw_dbg_cep(cep, "free endpoint\n"); + kfree(cep); +} + +static struct siw_cm_work *siw_get_work(struct siw_cep *cep) +{ + struct siw_cm_work *work = NULL; + + spin_lock_bh(&cep->lock); + if (!list_empty(&cep->work_freelist)) { + work = list_entry(cep->work_freelist.next, struct siw_cm_work, + list); + 
list_del_init(&work->list); + } + spin_unlock_bh(&cep->lock); + return work; +} + +static int siw_cm_alloc_work(struct siw_cep *cep, int num) +{ + struct siw_cm_work *work; + + while (num--) { + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) { + if (!(list_empty(&cep->work_freelist))) + siw_cm_free_work(cep); + return -ENOMEM; + } + work->cep = cep; + INIT_LIST_HEAD(&work->list); + list_add(&work->list, &cep->work_freelist); + } + return 0; +} + +/* + * siw_cm_upcall() + * + * Upcall to IWCM to inform about async connection events + */ +static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, + int status) +{ + struct iw_cm_event event; + struct iw_cm_id *id; + + memset(&event, 0, sizeof(event)); + event.status = status; + event.event = reason; + + if (reason == IW_CM_EVENT_CONNECT_REQUEST) { + event.provider_data = cep; + id = cep->listen_cep->cm_id; + } else { + id = cep->cm_id; + } + /* Signal IRD and ORD */ + if (reason == IW_CM_EVENT_ESTABLISHED || + reason == IW_CM_EVENT_CONNECT_REPLY) { + /* Signal negotiated IRD/ORD values we will use */ + event.ird = cep->ird; + event.ord = cep->ord; + } else if (reason == IW_CM_EVENT_CONNECT_REQUEST) { + event.ird = cep->ord; + event.ord = cep->ird; + } + /* Signal private data and address information */ + if (reason == IW_CM_EVENT_CONNECT_REQUEST || + reason == IW_CM_EVENT_CONNECT_REPLY) { + u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len); + + if (pd_len) { + /* + * hand over MPA private data + */ + event.private_data_len = pd_len; + event.private_data = cep->mpa.pdata; + + /* Hide MPA V2 IRD/ORD control */ + if (cep->enhanced_rdma_conn_est) { + event.private_data_len -= + sizeof(struct mpa_v2_data); + event.private_data += + sizeof(struct mpa_v2_data); + } + } + getname_local(cep->sock, &event.local_addr); + getname_peer(cep->sock, &event.remote_addr); + } + siw_dbg_cep(cep, "[QP %u]: id 0x%p, reason=%d, status=%d\n", + cep->qp ? qp_id(cep->qp) : -1, id, reason, status); + + return id->event_handler(id, &event); +} + +/* + * siw_qp_cm_drop() + * + * Drops established LLP connection if present and not already + * scheduled for dropping. Called from user context, SQ workqueue + * or receive IRQ. Caller signals if socket can be immediately + * closed (basically, if not in IRQ). 
+ */ +void siw_qp_cm_drop(struct siw_qp *qp, int schedule) +{ + struct siw_cep *cep = qp->cep; + + qp->rx_stream.rx_suspend = 1; + qp->tx_ctx.tx_suspend = 1; + + if (!qp->cep) + return; + + if (schedule) { + siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP); + } else { + siw_cep_set_inuse(cep); + + if (cep->state == SIW_EPSTATE_CLOSED) { + siw_dbg_cep(cep, "already closed\n"); + goto out; + } + siw_dbg_cep(cep, "immediate close, state %d\n", cep->state); + + if (qp->term_info.valid) + siw_send_terminate(qp); + + if (cep->cm_id) { + switch (cep->state) { + case SIW_EPSTATE_AWAIT_MPAREP: + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -EINVAL); + break; + + case SIW_EPSTATE_RDMA_MODE: + siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); + break; + + case SIW_EPSTATE_IDLE: + case SIW_EPSTATE_LISTENING: + case SIW_EPSTATE_CONNECTING: + case SIW_EPSTATE_AWAIT_MPAREQ: + case SIW_EPSTATE_RECVD_MPAREQ: + case SIW_EPSTATE_CLOSED: + default: + break; + } + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + siw_cep_put(cep); + } + cep->state = SIW_EPSTATE_CLOSED; + + if (cep->sock) { + siw_socket_disassoc(cep->sock); + /* + * Immediately close socket + */ + sock_release(cep->sock); + cep->sock = NULL; + } + if (cep->qp) { + cep->qp = NULL; + siw_qp_put(qp); + } +out: + siw_cep_set_free(cep); + } +} + +void siw_cep_put(struct siw_cep *cep) +{ + WARN_ON(kref_read(&cep->ref) < 1); + kref_put(&cep->ref, __siw_cep_dealloc); +} + +void siw_cep_get(struct siw_cep *cep) +{ + kref_get(&cep->ref); +} + +/* + * Expects params->pd_len in host byte order + */ +static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len) +{ + struct socket *s = cep->sock; + struct mpa_rr *rr = &cep->mpa.hdr; + struct kvec iov[3]; + struct msghdr msg; + int rv; + int iovec_num = 0; + int mpa_len; + + memset(&msg, 0, sizeof(msg)); + + iov[iovec_num].iov_base = rr; + iov[iovec_num].iov_len = sizeof(*rr); + mpa_len = sizeof(*rr); + + if (cep->enhanced_rdma_conn_est) { + iovec_num++; + iov[iovec_num].iov_base = &cep->mpa.v2_ctrl; + iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl); + mpa_len += sizeof(cep->mpa.v2_ctrl); + } + if (pd_len) { + iovec_num++; + iov[iovec_num].iov_base = (char *)pdata; + iov[iovec_num].iov_len = pd_len; + mpa_len += pd_len; + } + if (cep->enhanced_rdma_conn_est) + pd_len += sizeof(cep->mpa.v2_ctrl); + + rr->params.pd_len = cpu_to_be16(pd_len); + + rv = kernel_sendmsg(s, &msg, iov, iovec_num + 1, mpa_len); + + return rv < 0 ? rv : 0; +} + +/* + * Receive MPA Request/Reply header. + * + * Returns 0 if complete MPA Request/Reply header including + * eventual private data was received. Returns -EAGAIN if + * header was partially received or negative error code otherwise. + * + * Context: May be called in process context only + */ +static int siw_recv_mpa_rr(struct siw_cep *cep) +{ + struct mpa_rr *hdr = &cep->mpa.hdr; + struct socket *s = cep->sock; + u16 pd_len; + int rcvd, to_rcv; + + if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) { + rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd, + sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd, + 0); + if (rcvd <= 0) + return -ECONNABORTED; + + cep->mpa.bytes_rcvd += rcvd; + + if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) + return -EAGAIN; + + if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA) + return -EPROTO; + } + pd_len = be16_to_cpu(hdr->params.pd_len); + + /* + * At least the MPA Request/Reply header (frame not including + * private data) has been received. + * Receive (or continue receiving) any private data. 
+ */ + to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr)); + + if (!to_rcv) { + /* + * We must have hdr->params.pd_len == 0 and thus received a + * complete MPA Request/Reply frame. + * Check against peer protocol violation. + */ + u32 word; + + rcvd = ksock_recv(s, (char *)&word, sizeof(word), MSG_DONTWAIT); + if (rcvd == -EAGAIN) + return 0; + + if (rcvd == 0) { + siw_dbg_cep(cep, "peer EOF\n"); + return -EPIPE; + } + if (rcvd < 0) { + siw_dbg_cep(cep, "error: %d\n", rcvd); + return rcvd; + } + siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd); + + return -EPROTO; + } + + /* + * At this point, we must have hdr->params.pd_len != 0. + * A private data buffer gets allocated if hdr->params.pd_len != 0. + */ + if (!cep->mpa.pdata) { + cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL); + if (!cep->mpa.pdata) + return -ENOMEM; + } + rcvd = ksock_recv( + s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr), + to_rcv + 4, MSG_DONTWAIT); + + if (rcvd < 0) + return rcvd; + + if (rcvd > to_rcv) + return -EPROTO; + + cep->mpa.bytes_rcvd += rcvd; + + if (to_rcv == rcvd) { + siw_dbg_cep(cep, "%d bytes private data received\n", pd_len); + return 0; + } + return -EAGAIN; +} + +/* + * siw_proc_mpareq() + * + * Read MPA Request from socket and signal new connection to IWCM + * if success. Caller must hold lock on corresponding listening CEP. + */ +static int siw_proc_mpareq(struct siw_cep *cep) +{ + struct mpa_rr *req; + int version, rv; + u16 pd_len; + + rv = siw_recv_mpa_rr(cep); + if (rv) + return rv; + + req = &cep->mpa.hdr; + + version = __mpa_rr_revision(req->params.bits); + pd_len = be16_to_cpu(req->params.pd_len); + + if (version > MPA_REVISION_2) + /* allow for 0, 1, and 2 only */ + return -EPROTO; + + if (memcmp(req->key, MPA_KEY_REQ, 16)) + return -EPROTO; + + /* Prepare for sending MPA reply */ + memcpy(req->key, MPA_KEY_REP, 16); + + if (version == MPA_REVISION_2 && + (req->params.bits & MPA_RR_FLAG_ENHANCED)) { + /* + * MPA version 2 must signal IRD/ORD values and P2P mode + * in private data if header flag MPA_RR_FLAG_ENHANCED + * is set. + */ + if (pd_len < sizeof(struct mpa_v2_data)) + goto reject_conn; + + cep->enhanced_rdma_conn_est = true; + } + + /* MPA Markers: currently not supported. Marker TX to be added. */ + if (req->params.bits & MPA_RR_FLAG_MARKERS) + goto reject_conn; + + if (req->params.bits & MPA_RR_FLAG_CRC) { + /* + * RFC 5044, page 27: CRC MUST be used if peer requests it. + * siw specific: 'mpa_crc_strict' parameter to reject + * connection with CRC if local CRC off enforced by + * 'mpa_crc_strict' module parameter. + */ + if (!mpa_crc_required && mpa_crc_strict) + goto reject_conn; + + /* Enable CRC if requested by module parameter */ + if (mpa_crc_required) + req->params.bits |= MPA_RR_FLAG_CRC; + } + if (cep->enhanced_rdma_conn_est) { + struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata; + + /* + * Peer requested ORD becomes requested local IRD, + * peer requested IRD becomes requested local ORD. + * IRD and ORD get limited by global maximum values. + */ + cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK; + cep->ord = min(cep->ord, SIW_MAX_ORD_QP); + cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK; + cep->ird = min(cep->ird, SIW_MAX_IRD_QP); + + /* May get overwritten by locally negotiated values */ + cep->mpa.v2_ctrl.ird = htons(cep->ird); + cep->mpa.v2_ctrl.ord = htons(cep->ord); + + /* + * Support for peer sent zero length Write or Read to + * let local side enter RTS. Writes are preferred. 
+ * Sends would require pre-posting a Receive and are + * not supported. + * Propose zero length Write if none of Read and Write + * is indicated. + */ + if (v2->ird & MPA_V2_PEER_TO_PEER) { + cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER; + + if (v2->ord & MPA_V2_RDMA_WRITE_RTR) + cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR; + else if (v2->ord & MPA_V2_RDMA_READ_RTR) + cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR; + else + cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR; + } + } + + cep->state = SIW_EPSTATE_RECVD_MPAREQ; + + /* Keep reference until IWCM accepts/rejects */ + siw_cep_get(cep); + rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0); + if (rv) + siw_cep_put(cep); + + return rv; + +reject_conn: + siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n", + req->params.bits & MPA_RR_FLAG_CRC ? 1 : 0, + mpa_crc_required, mpa_crc_strict, + req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0); + + req->params.bits &= ~MPA_RR_FLAG_MARKERS; + req->params.bits |= MPA_RR_FLAG_REJECT; + + if (!mpa_crc_required && mpa_crc_strict) + req->params.bits &= ~MPA_RR_FLAG_CRC; + + if (pd_len) + kfree(cep->mpa.pdata); + + cep->mpa.pdata = NULL; + + siw_send_mpareqrep(cep, NULL, 0); + + return -EOPNOTSUPP; +} + +static int siw_proc_mpareply(struct siw_cep *cep) +{ + struct siw_qp_attrs qp_attrs; + enum siw_qp_attr_mask qp_attr_mask; + struct siw_qp *qp = cep->qp; + struct mpa_rr *rep; + int rv; + u16 rep_ord; + u16 rep_ird; + bool ird_insufficient = false; + enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR; + + rv = siw_recv_mpa_rr(cep); + if (rv != -EAGAIN) + siw_cancel_mpatimer(cep); + if (rv) + goto out_err; + + rep = &cep->mpa.hdr; + + if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) { + /* allow for 0, 1, and 2 only */ + rv = -EPROTO; + goto out_err; + } + if (memcmp(rep->key, MPA_KEY_REP, 16)) { + siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, + LLP_ECODE_INVALID_REQ_RESP, 0); + siw_send_terminate(qp); + rv = -EPROTO; + goto out_err; + } + if (rep->params.bits & MPA_RR_FLAG_REJECT) { + siw_dbg_cep(cep, "got mpa reject\n"); + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); + + return -ECONNRESET; + } + if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) { + siw_dbg_cep(cep, "peer allows GSO on TX\n"); + qp->tx_ctx.gso_seg_limit = 0; + } + if ((rep->params.bits & MPA_RR_FLAG_MARKERS) || + (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) || + (mpa_crc_strict && !mpa_crc_required && + (rep->params.bits & MPA_RR_FLAG_CRC))) { + siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n", + rep->params.bits & MPA_RR_FLAG_CRC ? 1 : 0, + mpa_crc_required, mpa_crc_strict, + rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0); + + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED); + + return -EINVAL; + } + if (cep->enhanced_rdma_conn_est) { + struct mpa_v2_data *v2; + + if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 || + !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) { + /* + * Protocol failure: The responder MUST reply with + * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED. + */ + siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n", + __mpa_rr_revision(rep->params.bits), + rep->params.bits & MPA_RR_FLAG_ENHANCED ? 
+ 1 : + 0); + + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -ECONNRESET); + return -EINVAL; + } + v2 = (struct mpa_v2_data *)cep->mpa.pdata; + rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK; + rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK; + + if (cep->ird < rep_ord && + (relaxed_ird_negotiation == false || + rep_ord > cep->sdev->attrs.max_ird)) { + siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n", + cep->ird, rep_ord, + cep->sdev->attrs.max_ord); + ird_insufficient = true; + } + if (cep->ord > rep_ird && relaxed_ird_negotiation == false) { + siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord, + rep_ird); + ird_insufficient = true; + } + /* + * Always report negotiated peer values to user, + * even if IRD/ORD negotiation failed + */ + cep->ird = rep_ord; + cep->ord = rep_ird; + + if (ird_insufficient) { + /* + * If the initiator IRD is insuffient for the + * responder ORD, send a TERM. + */ + siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, + LLP_ETYPE_MPA, + LLP_ECODE_INSUFFICIENT_IRD, 0); + siw_send_terminate(qp); + rv = -ENOMEM; + goto out_err; + } + if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER) + mpa_p2p_mode = + cep->mpa.v2_ctrl_req.ord & + (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR); + + /* + * Check if we requested P2P mode, and if peer agrees + */ + if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { + if ((mpa_p2p_mode & v2->ord) == 0) { + /* + * We requested RTR mode(s), but the peer + * did not pick any mode we support. + */ + siw_dbg_cep(cep, + "rtr mode: req %2x, got %2x\n", + mpa_p2p_mode, + v2->ord & (MPA_V2_RDMA_WRITE_RTR | + MPA_V2_RDMA_READ_RTR)); + + siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, + LLP_ETYPE_MPA, + LLP_ECODE_NO_MATCHING_RTR, + 0); + siw_send_terminate(qp); + rv = -EPROTO; + goto out_err; + } + mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR | + MPA_V2_RDMA_READ_RTR); + } + } + memset(&qp_attrs, 0, sizeof(qp_attrs)); + + if (rep->params.bits & MPA_RR_FLAG_CRC) + qp_attrs.flags = SIW_MPA_CRC; + + qp_attrs.irq_size = cep->ird; + qp_attrs.orq_size = cep->ord; + qp_attrs.sk = cep->sock; + qp_attrs.state = SIW_QP_STATE_RTS; + + qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE | + SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA; + + /* Move socket RX/TX under QP control */ + down_write(&qp->state_lock); + if (qp->attrs.state > SIW_QP_STATE_RTR) { + rv = -EINVAL; + up_write(&qp->state_lock); + goto out_err; + } + rv = siw_qp_modify(qp, &qp_attrs, qp_attr_mask); + + siw_qp_socket_assoc(cep, qp); + + up_write(&qp->state_lock); + + /* Send extra RDMA frame to trigger peer RTS if negotiated */ + if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { + rv = siw_qp_mpa_rts(qp, mpa_p2p_mode); + if (rv) + goto out_err; + } + if (!rv) { + rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0); + if (!rv) + cep->state = SIW_EPSTATE_RDMA_MODE; + + return 0; + } + +out_err: + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); + + return rv; +} + +/* + * siw_accept_newconn - accept an incoming pending connection + * + */ +static void siw_accept_newconn(struct siw_cep *cep) +{ + struct socket *s = cep->sock; + struct socket *new_s = NULL; + struct siw_cep *new_cep = NULL; + int rv = 0; /* debug only. should disappear */ + + if (cep->state != SIW_EPSTATE_LISTENING) + goto error; + + new_cep = siw_cep_alloc(cep->sdev); + if (!new_cep) + goto error; + + /* + * 4: Allocate a sufficient number of work elements + * to allow concurrent handling of local + peer close + * events, MPA header processing + MPA timeout. 
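
The reply handling above packs a 14-bit resource count plus control flags into each 16-bit MPA v2 IRD/ORD field and only accepts the peer's values when the local queues are deep enough (or relaxed_ird_negotiation is set). A minimal user-space sketch of that check; the 0x3fff mask and the limits are illustrative stand-ins for MPA_IRD_ORD_MASK and the device attributes, and the max_ird escape used by the driver is omitted:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>   /* ntohs(), htons() */

#define V2_VALUE_MASK 0x3fff  /* assumed: low 14 bits carry the count */

/* Returns true if the peer's reply fits the locally available depth. */
static bool ord_ird_acceptable(uint16_t wire_ird, uint16_t wire_ord,
                               int local_ird, int local_ord, bool relaxed)
{
    int rep_ird = ntohs(wire_ird) & V2_VALUE_MASK;
    int rep_ord = ntohs(wire_ord) & V2_VALUE_MASK;

    if (local_ird < rep_ord && !relaxed)
        return false;   /* peer wants more inbound reads than we queued */
    if (local_ord > rep_ird && !relaxed)
        return false;   /* we want more outbound reads than peer accepts */
    return true;
}

int main(void)
{
    /* peer replies ird=4, ord=2 (already in network byte order here) */
    printf("%d\n", ord_ird_acceptable(htons(4), htons(2), 2, 4, false));
    return 0;
}
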
+ */ + if (siw_cm_alloc_work(new_cep, 4) != 0) + goto error; + + /* + * Copy saved socket callbacks from listening CEP + * and assign new socket with new CEP + */ + new_cep->sk_state_change = cep->sk_state_change; + new_cep->sk_data_ready = cep->sk_data_ready; + new_cep->sk_write_space = cep->sk_write_space; + new_cep->sk_error_report = cep->sk_error_report; + + rv = kernel_accept(s, &new_s, O_NONBLOCK); + if (rv != 0) { + /* + * Connection already aborted by peer..? + */ + siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv); + goto error; + } + new_cep->sock = new_s; + siw_cep_get(new_cep); + new_s->sk->sk_user_data = new_cep; + + siw_dbg_cep(cep, "listen socket 0x%p, new 0x%p\n", s, new_s); + + if (siw_tcp_nagle == false) { + int val = 1; + + rv = kernel_setsockopt(new_s, SOL_TCP, TCP_NODELAY, + (char *)&val, sizeof(val)); + if (rv) { + siw_dbg_cep(cep, "setsockopt NODELAY error: %d\n", rv); + goto error; + } + } + new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ; + + rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT); + if (rv) + goto error; + /* + * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep. + */ + new_cep->listen_cep = cep; + siw_cep_get(cep); + + if (atomic_read(&new_s->sk->sk_rmem_alloc)) { + /* + * MPA REQ already queued + */ + siw_dbg_cep(cep, "immediate mpa request\n"); + + siw_cep_set_inuse(new_cep); + rv = siw_proc_mpareq(new_cep); + siw_cep_set_free(new_cep); + + if (rv != -EAGAIN) { + siw_cep_put(cep); + new_cep->listen_cep = NULL; + if (rv) + goto error; + } + } + return; + +error: + if (new_cep) + siw_cep_put(new_cep); + + if (new_s) { + siw_socket_disassoc(new_s); + sock_release(new_s); + new_cep->sock = NULL; + } + siw_dbg_cep(cep, "error %d\n", rv); +} + +static void siw_cm_work_handler(struct work_struct *w) +{ + struct siw_cm_work *work; + struct siw_cep *cep; + int release_cep = 0, rv = 0; + + work = container_of(w, struct siw_cm_work, work.work); + cep = work->cep; + + siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n", + cep->qp ? qp_id(cep->qp) : -1, work->type, cep->state); + + siw_cep_set_inuse(cep); + + switch (work->type) { + case SIW_CM_WORK_ACCEPT: + siw_accept_newconn(cep); + break; + + case SIW_CM_WORK_READ_MPAHDR: + if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { + if (cep->listen_cep) { + siw_cep_set_inuse(cep->listen_cep); + + if (cep->listen_cep->state == + SIW_EPSTATE_LISTENING) + rv = siw_proc_mpareq(cep); + else + rv = -EFAULT; + + siw_cep_set_free(cep->listen_cep); + + if (rv != -EAGAIN) { + siw_cep_put(cep->listen_cep); + cep->listen_cep = NULL; + if (rv) + siw_cep_put(cep); + } + } + } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { + rv = siw_proc_mpareply(cep); + } else { + /* + * CEP already moved out of MPA handshake. + * any connection management already done. + * silently ignore the mpa packet. 
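
The SIW_CM_WORK_READ_MPAHDR path keeps calling into siw_proc_mpareq()/siw_proc_mpareply(), whose socket reads all use MSG_DONTWAIT: a short read or -EAGAIN just leaves mpa.bytes_rcvd in place until the next data_ready callback. The same accumulate-until-complete pattern in plain user-space sockets (names and the retry policy are illustrative, not the driver's):

#include <errno.h>
#include <stddef.h>
#include <sys/types.h>
#include <sys/socket.h>

/*
 * Try to complete a fixed-size header without blocking.
 * Returns 1 when done, 0 to retry later, -1 on error/EOF.
 */
static int recv_header_nonblock(int fd, void *hdr, size_t hdr_len,
                                size_t *rcvd)
{
    while (*rcvd < hdr_len) {
        ssize_t n = recv(fd, (char *)hdr + *rcvd, hdr_len - *rcvd,
                         MSG_DONTWAIT);
        if (n > 0) {
            *rcvd += n;
            continue;
        }
        if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
            return 0;       /* partial header, try again on next POLLIN */
        return -1;          /* peer closed (n == 0) or hard error */
    }
    return 1;               /* header complete */
}
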
+ */ + if (cep->state == SIW_EPSTATE_RDMA_MODE) { + cep->sock->sk->sk_data_ready(cep->sock->sk); + siw_dbg_cep(cep, "already in RDMA mode"); + } else { + siw_dbg_cep(cep, "out of state: %d\n", + cep->state); + } + } + if (rv && rv != EAGAIN) + release_cep = 1; + break; + + case SIW_CM_WORK_CLOSE_LLP: + /* + * QP scheduled LLP close + */ + if (cep->qp && cep->qp->term_info.valid) + siw_send_terminate(cep->qp); + + if (cep->cm_id) + siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); + + release_cep = 1; + break; + + case SIW_CM_WORK_PEER_CLOSE: + if (cep->cm_id) { + if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { + /* + * MPA reply not received, but connection drop + */ + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -ECONNRESET); + } else if (cep->state == SIW_EPSTATE_RDMA_MODE) { + /* + * NOTE: IW_CM_EVENT_DISCONNECT is given just + * to transition IWCM into CLOSING. + */ + siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0); + siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); + } + /* + * for other states there is no connection + * known to the IWCM. + */ + } else { + if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) { + /* + * Wait for the ulp/CM to call accept/reject + */ + siw_dbg_cep(cep, + "mpa req recvd, wait for ULP\n"); + } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { + /* + * Socket close before MPA request received. + */ + siw_dbg_cep(cep, "no mpareq: drop listener\n"); + siw_cep_put(cep->listen_cep); + cep->listen_cep = NULL; + } + } + release_cep = 1; + break; + + case SIW_CM_WORK_MPATIMEOUT: + cep->mpa_timer = NULL; + + if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { + /* + * MPA request timed out: + * Hide any partially received private data and signal + * timeout + */ + cep->mpa.hdr.params.pd_len = 0; + + if (cep->cm_id) + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -ETIMEDOUT); + release_cep = 1; + + } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { + /* + * No MPA request received after peer TCP stream setup. + */ + if (cep->listen_cep) { + siw_cep_put(cep->listen_cep); + cep->listen_cep = NULL; + } + release_cep = 1; + } + break; + + default: + WARN(1, "Undefined CM work type: %d\n", work->type); + } + if (release_cep) { + siw_dbg_cep(cep, + "release: timer=%s, QP[%u], id 0x%p\n", + cep->mpa_timer ? "y" : "n", + cep->qp ? 
qp_id(cep->qp) : -1, cep->cm_id); + + siw_cancel_mpatimer(cep); + + cep->state = SIW_EPSTATE_CLOSED; + + if (cep->qp) { + struct siw_qp *qp = cep->qp; + /* + * Serialize a potential race with application + * closing the QP and calling siw_qp_cm_drop() + */ + siw_qp_get(qp); + siw_cep_set_free(cep); + + siw_qp_llp_close(qp); + siw_qp_put(qp); + + siw_cep_set_inuse(cep); + cep->qp = NULL; + siw_qp_put(qp); + } + if (cep->sock) { + siw_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + } + if (cep->cm_id) { + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + siw_cep_put(cep); + } + } + siw_cep_set_free(cep); + siw_put_work(work); + siw_cep_put(cep); +} + +static struct workqueue_struct *siw_cm_wq; + +int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type) +{ + struct siw_cm_work *work = siw_get_work(cep); + unsigned long delay = 0; + + if (!work) { + siw_dbg_cep(cep, "failed with no work available\n"); + return -ENOMEM; + } + work->type = type; + work->cep = cep; + + siw_cep_get(cep); + + INIT_DELAYED_WORK(&work->work, siw_cm_work_handler); + + if (type == SIW_CM_WORK_MPATIMEOUT) { + cep->mpa_timer = work; + + if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) + delay = MPAREQ_TIMEOUT; + else + delay = MPAREP_TIMEOUT; + } + siw_dbg_cep(cep, "[QP %u]: work type: %d, work 0x%p, timeout %lu\n", + cep->qp ? qp_id(cep->qp) : -1, type, work, delay); + + queue_delayed_work(siw_cm_wq, &work->work, delay); + + return 0; +} + +static void siw_cm_llp_data_ready(struct sock *sk) +{ + struct siw_cep *cep; + + read_lock(&sk->sk_callback_lock); + + cep = sk_to_cep(sk); + if (!cep) { + WARN_ON(1); + goto out; + } + siw_dbg_cep(cep, "state: %d\n", cep->state); + + switch (cep->state) { + case SIW_EPSTATE_RDMA_MODE: + /* fall through */ + case SIW_EPSTATE_LISTENING: + break; + + case SIW_EPSTATE_AWAIT_MPAREQ: + /* fall through */ + case SIW_EPSTATE_AWAIT_MPAREP: + siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR); + break; + + default: + siw_dbg_cep(cep, "unexpected data, state %d\n", cep->state); + break; + } +out: + read_unlock(&sk->sk_callback_lock); +} + +static void siw_cm_llp_write_space(struct sock *sk) +{ + struct siw_cep *cep = sk_to_cep(sk); + + if (cep) + siw_dbg_cep(cep, "state: %d\n", cep->state); +} + +static void siw_cm_llp_error_report(struct sock *sk) +{ + struct siw_cep *cep = sk_to_cep(sk); + + if (cep) { + siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n", + sk->sk_err, sk->sk_state, cep->state); + cep->sk_error_report(sk); + } +} + +static void siw_cm_llp_state_change(struct sock *sk) +{ + struct siw_cep *cep; + struct socket *s; + void (*orig_state_change)(struct sock *s); + + read_lock(&sk->sk_callback_lock); + + cep = sk_to_cep(sk); + if (!cep) { + /* endpoint already disassociated */ + read_unlock(&sk->sk_callback_lock); + return; + } + orig_state_change = cep->sk_state_change; + + s = sk->sk_socket; + + siw_dbg_cep(cep, "state: %d\n", cep->state); + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + /* + * handle accepting socket as special case where only + * new connection is possible + */ + siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT); + break; + + case TCP_CLOSE: + case TCP_CLOSE_WAIT: + if (cep->qp) + cep->qp->tx_ctx.tx_suspend = 1; + siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE); + break; + + default: + siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state); + } + read_unlock(&sk->sk_callback_lock); + orig_state_change(sk); +} + +static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, + struct sockaddr 
*raddr) +{ + int rv, flags = 0, s_val = 1; + size_t size = laddr->sa_family == AF_INET ? + sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + + /* + * Make address available again asap. + */ + rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val, + sizeof(s_val)); + if (rv < 0) + return rv; + + rv = s->ops->bind(s, laddr, size); + if (rv < 0) + return rv; + + rv = s->ops->connect(s, raddr, size, flags); + + return rv < 0 ? rv : 0; +} + +int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) +{ + struct siw_device *sdev = to_siw_dev(id->device); + struct siw_qp *qp; + struct siw_cep *cep = NULL; + struct socket *s = NULL; + struct sockaddr *laddr = (struct sockaddr *)&id->local_addr, + *raddr = (struct sockaddr *)&id->remote_addr; + bool p2p_mode = peer_to_peer, v4 = true; + u16 pd_len = params->private_data_len; + int version = mpa_version, rv; + + if (pd_len > MPA_MAX_PRIVDATA) + return -EINVAL; + + if (params->ird > sdev->attrs.max_ird || + params->ord > sdev->attrs.max_ord) + return -ENOMEM; + + if (laddr->sa_family == AF_INET6) + v4 = false; + else if (laddr->sa_family != AF_INET) + return -EAFNOSUPPORT; + + /* + * Respect any iwarp port mapping: Use mapped remote address + * if valid. Local address must not be mapped, since siw + * uses kernel TCP stack. + */ + if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) || + to_sockaddr_in6(id->remote_addr).sin6_port != 0) + raddr = (struct sockaddr *)&id->m_remote_addr; + + qp = siw_qp_id2obj(sdev, params->qpn); + if (!qp) { + WARN(1, "[QP %u] does not exist\n", params->qpn); + rv = -EINVAL; + goto error; + } + if (v4) + siw_dbg_qp(qp, + "id 0x%p, pd_len %d, laddr %pI4 %d, raddr %pI4 %d\n", + id, pd_len, + &((struct sockaddr_in *)(laddr))->sin_addr, + ntohs(((struct sockaddr_in *)(laddr))->sin_port), + &((struct sockaddr_in *)(raddr))->sin_addr, + ntohs(((struct sockaddr_in *)(raddr))->sin_port)); + else + siw_dbg_qp(qp, + "id 0x%p, pd_len %d, laddr %pI6 %d, raddr %pI6 %d\n", + id, pd_len, + &((struct sockaddr_in6 *)(laddr))->sin6_addr, + ntohs(((struct sockaddr_in6 *)(laddr))->sin6_port), + &((struct sockaddr_in6 *)(raddr))->sin6_addr, + ntohs(((struct sockaddr_in6 *)(raddr))->sin6_port)); + + rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s); + if (rv < 0) + goto error; + + /* + * NOTE: For simplification, connect() is called in blocking + * mode. Might be reconsidered for async connection setup at + * TCP level. + */ + rv = kernel_bindconnect(s, laddr, raddr); + if (rv != 0) { + siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv); + goto error; + } + if (siw_tcp_nagle == false) { + int val = 1; + + rv = kernel_setsockopt(s, SOL_TCP, TCP_NODELAY, (char *)&val, + sizeof(val)); + if (rv) { + siw_dbg_qp(qp, "setsockopt NODELAY error: %d\n", rv); + goto error; + } + } + cep = siw_cep_alloc(sdev); + if (!cep) { + rv = -ENOMEM; + goto error; + } + siw_cep_set_inuse(cep); + + /* Associate QP with CEP */ + siw_cep_get(cep); + qp->cep = cep; + + /* siw_qp_get(qp) already done by QP lookup */ + cep->qp = qp; + + id->add_ref(id); + cep->cm_id = id; + + /* + * 4: Allocate a sufficient number of work elements + * to allow concurrent handling of local + peer close + * events, MPA header processing + MPA timeout. 
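
kernel_bindconnect() above is the in-kernel equivalent of the familiar bind-before-connect sequence with SO_REUSEADDR, and siw_connect() then turns off Nagle unless the siw_tcp_nagle module parameter asks otherwise. For reference, the same socket setup in user space (IPv4 only, error reporting trimmed):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

static int bindconnect(const struct sockaddr_in *laddr,
                       const struct sockaddr_in *raddr)
{
    int one = 1;
    int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

    if (fd < 0)
        return -1;
    /* make the local port reusable right away, as the driver does */
    setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

    if (bind(fd, (const struct sockaddr *)laddr, sizeof(*laddr)) ||
        connect(fd, (const struct sockaddr *)raddr, sizeof(*raddr))) {
        close(fd);
        return -1;
    }
    /* siw_tcp_nagle == false: disable Nagle for the MPA exchange */
    setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
    return fd;
}
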
+ */ + rv = siw_cm_alloc_work(cep, 4); + if (rv != 0) { + rv = -ENOMEM; + goto error; + } + cep->ird = params->ird; + cep->ord = params->ord; + + if (p2p_mode && cep->ord == 0) + cep->ord = 1; + + cep->state = SIW_EPSTATE_CONNECTING; + + /* + * Associate CEP with socket + */ + siw_cep_socket_assoc(cep, s); + + cep->state = SIW_EPSTATE_AWAIT_MPAREP; + + /* + * Set MPA Request bits: CRC if required, no MPA Markers, + * MPA Rev. according to module parameter 'mpa_version', Key 'Request'. + */ + cep->mpa.hdr.params.bits = 0; + if (version > MPA_REVISION_2) { + pr_warn("Setting MPA version to %u\n", MPA_REVISION_2); + version = MPA_REVISION_2; + /* Adjust also module parameter */ + mpa_version = MPA_REVISION_2; + } + __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version); + + if (try_gso) + cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP; + + if (mpa_crc_required) + cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC; + + /* + * If MPA version == 2: + * o Include ORD and IRD. + * o Indicate peer-to-peer mode, if required by module + * parameter 'peer_to_peer'. + */ + if (version == MPA_REVISION_2) { + cep->enhanced_rdma_conn_est = true; + cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED; + + cep->mpa.v2_ctrl.ird = htons(cep->ird); + cep->mpa.v2_ctrl.ord = htons(cep->ord); + + if (p2p_mode) { + cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER; + cep->mpa.v2_ctrl.ord |= rtr_type; + } + /* Remember own P2P mode requested */ + cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird; + cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord; + } + memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16); + + rv = siw_send_mpareqrep(cep, params->private_data, pd_len); + /* + * Reset private data. + */ + cep->mpa.hdr.params.pd_len = 0; + + if (rv >= 0) { + rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT); + if (!rv) { + siw_dbg_cep(cep, "id 0x%p, [QP %u]: exit\n", id, + qp_id(qp)); + siw_cep_set_free(cep); + return 0; + } + } +error: + siw_dbg_qp(qp, "failed: %d\n", rv); + + if (cep) { + siw_socket_disassoc(s); + sock_release(s); + cep->sock = NULL; + + cep->qp = NULL; + + cep->cm_id = NULL; + id->rem_ref(id); + siw_cep_put(cep); + + qp->cep = NULL; + siw_cep_put(cep); + + cep->state = SIW_EPSTATE_CLOSED; + + siw_cep_set_free(cep); + + siw_cep_put(cep); + + } else if (s) { + sock_release(s); + } + siw_qp_put(qp); + + return rv; +} + +/* + * siw_accept - Let SoftiWARP accept an RDMA connection request + * + * @id: New connection management id to be used for accepted + * connection request + * @params: Connection parameters provided by ULP for accepting connection + * + * Transition QP to RTS state, associate new CM id @id with accepted CEP + * and get prepared for TCP input by installing socket callbacks. + * Then send MPA Reply and generate the "connection established" event. + * Socket callbacks must be installed before sending MPA Reply, because + * the latter may cause a first RDMA message to arrive from the RDMA Initiator + * side very quickly, at which time the socket callbacks must be ready. 
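
siw_connect() above builds the MPA request by clamping the requested revision and ORing feature flags into the 16-bit params.bits word before keying the header with MPA_KEY_REQ. A condensed sketch of that flag handling; the bit positions below are placeholders, since the real MPA_RR_FLAG_* values and __mpa_rr_set_revision() live in the iwarp header that is not part of this hunk:

#include <stdint.h>

/* placeholder encodings, not the on-the-wire layout */
#define RR_FLAG_CRC      (1u << 14)
#define RR_FLAG_ENHANCED (1u << 12)
#define RR_REV_SHIFT     8

static uint16_t build_mpa_req_bits(int revision, int want_crc, int enhanced)
{
    uint16_t bits = 0;

    if (revision > 2)
        revision = 2;              /* clamp, as siw_connect() does */
    bits |= (uint16_t)(revision << RR_REV_SHIFT);
    if (want_crc)
        bits |= RR_FLAG_CRC;       /* mpa_crc_required */
    if (enhanced)
        bits |= RR_FLAG_ENHANCED;  /* MPA rev 2: IRD/ORD in private data */
    return bits;
}
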
+ */ +int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) +{ + struct siw_device *sdev = to_siw_dev(id->device); + struct siw_cep *cep = (struct siw_cep *)id->provider_data; + struct siw_qp *qp; + struct siw_qp_attrs qp_attrs; + int rv, max_priv_data = MPA_MAX_PRIVDATA; + bool wait_for_peer_rts = false; + + siw_cep_set_inuse(cep); + siw_cep_put(cep); + + /* Free lingering inbound private data */ + if (cep->mpa.hdr.params.pd_len) { + cep->mpa.hdr.params.pd_len = 0; + kfree(cep->mpa.pdata); + cep->mpa.pdata = NULL; + } + siw_cancel_mpatimer(cep); + + if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { + siw_dbg_cep(cep, "id 0x%p: out of state\n", id); + + siw_cep_set_free(cep); + siw_cep_put(cep); + + return -ECONNRESET; + } + qp = siw_qp_id2obj(sdev, params->qpn); + if (!qp) { + WARN(1, "[QP %d] does not exist\n", params->qpn); + siw_cep_set_free(cep); + siw_cep_put(cep); + + return -EINVAL; + } + down_write(&qp->state_lock); + if (qp->attrs.state > SIW_QP_STATE_RTR) { + rv = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + siw_dbg_cep(cep, "id 0x%p\n", id); + + if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) { + siw_dbg_cep(cep, "peer allows GSO on TX\n"); + qp->tx_ctx.gso_seg_limit = 0; + } + if (params->ord > sdev->attrs.max_ord || + params->ird > sdev->attrs.max_ird) { + siw_dbg_cep( + cep, + "id 0x%p, [QP %u]: ord %d (max %d), ird %d (max %d)\n", + id, qp_id(qp), params->ord, sdev->attrs.max_ord, + params->ird, sdev->attrs.max_ird); + rv = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + if (cep->enhanced_rdma_conn_est) + max_priv_data -= sizeof(struct mpa_v2_data); + + if (params->private_data_len > max_priv_data) { + siw_dbg_cep( + cep, + "id 0x%p, [QP %u]: private data length: %d (max %d)\n", + id, qp_id(qp), params->private_data_len, max_priv_data); + rv = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + if (cep->enhanced_rdma_conn_est) { + if (params->ord > cep->ord) { + if (relaxed_ird_negotiation) { + params->ord = cep->ord; + } else { + cep->ird = params->ird; + cep->ord = params->ord; + rv = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + } + if (params->ird < cep->ird) { + if (relaxed_ird_negotiation && + cep->ird <= sdev->attrs.max_ird) + params->ird = cep->ird; + else { + rv = -ENOMEM; + up_write(&qp->state_lock); + goto error; + } + } + if (cep->mpa.v2_ctrl.ord & + (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR)) + wait_for_peer_rts = true; + /* + * Signal back negotiated IRD and ORD values + */ + cep->mpa.v2_ctrl.ord = + htons(params->ord & MPA_IRD_ORD_MASK) | + (cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD); + cep->mpa.v2_ctrl.ird = + htons(params->ird & MPA_IRD_ORD_MASK) | + (cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD); + } + cep->ird = params->ird; + cep->ord = params->ord; + + cep->cm_id = id; + id->add_ref(id); + + memset(&qp_attrs, 0, sizeof(qp_attrs)); + qp_attrs.orq_size = cep->ord; + qp_attrs.irq_size = cep->ird; + qp_attrs.sk = cep->sock; + if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC) + qp_attrs.flags = SIW_MPA_CRC; + qp_attrs.state = SIW_QP_STATE_RTS; + + siw_dbg_cep(cep, "id 0x%p, [QP%u]: moving to rts\n", id, qp_id(qp)); + + /* Associate QP with CEP */ + siw_cep_get(cep); + qp->cep = cep; + + /* siw_qp_get(qp) already done by QP lookup */ + cep->qp = qp; + + cep->state = SIW_EPSTATE_RDMA_MODE; + + /* Move socket RX/TX under QP control */ + rv = siw_qp_modify(qp, &qp_attrs, + SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE | + SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | + SIW_QP_ATTR_MPA); + 
up_write(&qp->state_lock); + + if (rv) + goto error; + + siw_dbg_cep(cep, "id 0x%p, [QP %u]: send mpa reply, %d byte pdata\n", + id, qp_id(qp), params->private_data_len); + + rv = siw_send_mpareqrep(cep, params->private_data, + params->private_data_len); + if (rv != 0) + goto error; + + if (wait_for_peer_rts) { + siw_sk_assign_rtr_upcalls(cep); + } else { + siw_qp_socket_assoc(cep, qp); + rv = siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); + if (rv) + goto error; + } + siw_cep_set_free(cep); + + return 0; +error: + siw_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + + cep->state = SIW_EPSTATE_CLOSED; + + if (cep->cm_id) { + cep->cm_id->rem_ref(id); + cep->cm_id = NULL; + } + if (qp->cep) { + siw_cep_put(cep); + qp->cep = NULL; + } + cep->qp = NULL; + siw_qp_put(qp); + + siw_cep_set_free(cep); + siw_cep_put(cep); + + return rv; +} + +/* + * siw_reject() + * + * Local connection reject case. Send private data back to peer, + * close connection and dereference connection id. + */ +int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len) +{ + struct siw_cep *cep = (struct siw_cep *)id->provider_data; + + siw_cep_set_inuse(cep); + siw_cep_put(cep); + + siw_cancel_mpatimer(cep); + + if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { + siw_dbg_cep(cep, "id 0x%p: out of state\n", id); + + siw_cep_set_free(cep); + siw_cep_put(cep); /* put last reference */ + + return -ECONNRESET; + } + siw_dbg_cep(cep, "id 0x%p, cep->state %d, pd_len %d\n", id, cep->state, + pd_len); + + if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) { + cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */ + siw_send_mpareqrep(cep, pdata, pd_len); + } + siw_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + + cep->state = SIW_EPSTATE_CLOSED; + + siw_cep_set_free(cep); + siw_cep_put(cep); + + return 0; +} + +static int siw_listen_address(struct iw_cm_id *id, int backlog, + struct sockaddr *laddr, int addr_family) +{ + struct socket *s; + struct siw_cep *cep = NULL; + struct siw_device *sdev = to_siw_dev(id->device); + int rv = 0, s_val; + + rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s); + if (rv < 0) + return rv; + + /* + * Allow binding local port when still in TIME_WAIT from last close. + */ + s_val = 1; + rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val, + sizeof(s_val)); + if (rv) { + siw_dbg(id->device, "id 0x%p: setsockopt error: %d\n", id, rv); + goto error; + } + rv = s->ops->bind(s, laddr, addr_family == AF_INET ? + sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6)); + if (rv) { + siw_dbg(id->device, "id 0x%p: socket bind error: %d\n", id, rv); + goto error; + } + cep = siw_cep_alloc(sdev); + if (!cep) { + rv = -ENOMEM; + goto error; + } + siw_cep_socket_assoc(cep, s); + + rv = siw_cm_alloc_work(cep, backlog); + if (rv) { + siw_dbg(id->device, + "id 0x%p: alloc_work error %d, backlog %d\n", id, + rv, backlog); + goto error; + } + rv = s->ops->listen(s, backlog); + if (rv) { + siw_dbg(id->device, "id 0x%p: listen error %d\n", id, rv); + goto error; + } + cep->cm_id = id; + id->add_ref(id); + + /* + * In case of a wildcard rdma_listen on a multi-homed device, + * a listener's IWCM id is associated with more than one listening CEP. + * + * We currently use id->provider_data in three different ways: + * + * o For a listener's IWCM id, id->provider_data points to + * the list_head of the list of listening CEPs. 
+ * Uses: siw_create_listen(), siw_destroy_listen() + * + * o For each accepted passive-side IWCM id, id->provider_data + * points to the CEP itself. This is a consequence of + * - siw_cm_upcall() setting event.provider_data = cep and + * - the IWCM's cm_conn_req_handler() setting provider_data of the + * new passive-side IWCM id equal to event.provider_data + * Uses: siw_accept(), siw_reject() + * + * o For an active-side IWCM id, id->provider_data is not used at all. + * + */ + if (!id->provider_data) { + id->provider_data = + kmalloc(sizeof(struct list_head), GFP_KERNEL); + if (!id->provider_data) { + rv = -ENOMEM; + goto error; + } + INIT_LIST_HEAD((struct list_head *)id->provider_data); + } + list_add_tail(&cep->listenq, (struct list_head *)id->provider_data); + cep->state = SIW_EPSTATE_LISTENING; + + if (addr_family == AF_INET) + siw_dbg(id->device, "Listen at laddr %pI4 %u\n", + &(((struct sockaddr_in *)laddr)->sin_addr), + ((struct sockaddr_in *)laddr)->sin_port); + else + siw_dbg(id->device, "Listen at laddr %pI6 %u\n", + &(((struct sockaddr_in6 *)laddr)->sin6_addr), + ((struct sockaddr_in6 *)laddr)->sin6_port); + + return 0; + +error: + siw_dbg(id->device, "failed: %d\n", rv); + + if (cep) { + siw_cep_set_inuse(cep); + + if (cep->cm_id) { + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + } + cep->sock = NULL; + siw_socket_disassoc(s); + cep->state = SIW_EPSTATE_CLOSED; + + siw_cep_set_free(cep); + siw_cep_put(cep); + } + sock_release(s); + + return rv; +} + +static void siw_drop_listeners(struct iw_cm_id *id) +{ + struct list_head *p, *tmp; + + /* + * In case of a wildcard rdma_listen on a multi-homed device, + * a listener's IWCM id is associated with more than one listening CEP. + */ + list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) { + struct siw_cep *cep = list_entry(p, struct siw_cep, listenq); + + list_del(p); + + siw_dbg_cep(cep, "id 0x%p: drop cep, state %d\n", id, + cep->state); + + siw_cep_set_inuse(cep); + + if (cep->cm_id) { + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + } + if (cep->sock) { + siw_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + } + cep->state = SIW_EPSTATE_CLOSED; + siw_cep_set_free(cep); + siw_cep_put(cep); + } +} + +/* + * siw_create_listen - Create resources for a listener's IWCM ID @id + * + * Listens on the socket addresses id->local_addr and id->remote_addr. + * + * If the listener's @id provides a specific local IP address, at most one + * listening socket is created and associated with @id. + * + * If the listener's @id provides the wildcard (zero) local IP address, + * a separate listen is performed for each local IP address of the device + * by creating a listening socket and binding to that local IP address. + * + */ +int siw_create_listen(struct iw_cm_id *id, int backlog) +{ + struct net_device *dev = to_siw_dev(id->device)->netdev; + int rv = 0, listeners = 0; + + siw_dbg(id->device, "id 0x%p: backlog %d\n", id, backlog); + + /* + * For each attached address of the interface, create a + * listening socket, if id->local_addr is the wildcard + * IP address or matches the IP address. 
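
siw_listen_address() is invoked once per matching interface address, so a wildcard rdma_listen() ends up with one listening socket per local IP (see siw_create_listen() further down). The same idea expressed with the user-space getifaddrs() API; the port number and backlog are arbitrary example values:

#include <arpa/inet.h>
#include <ifaddrs.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

static int listen_on(const struct sockaddr_in *sa)
{
    struct sockaddr_in laddr = *sa;
    int fd = socket(AF_INET, SOCK_STREAM, 0);

    laddr.sin_port = htons(7500);              /* example port */
    if (fd < 0 || bind(fd, (struct sockaddr *)&laddr, sizeof(laddr)) ||
        listen(fd, 16)) {
        if (fd >= 0)
            close(fd);
        return -1;
    }
    return fd;                         /* kept open for the listener */
}

static int listen_wildcard_v4(void)
{
    struct ifaddrs *ifa, *cur;
    int listeners = 0;

    if (getifaddrs(&ifa))
        return -1;
    for (cur = ifa; cur; cur = cur->ifa_next) {
        if (!cur->ifa_addr || cur->ifa_addr->sa_family != AF_INET)
            continue;
        if (listen_on((struct sockaddr_in *)cur->ifa_addr) >= 0)
            listeners++;               /* one listening socket per address */
    }
    freeifaddrs(ifa);
    return listeners ? 0 : -1;
}
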
+ */ + if (id->local_addr.ss_family == AF_INET) { + struct in_device *in_dev = in_dev_get(dev); + struct sockaddr_in s_laddr, *s_raddr; + + memcpy(&s_laddr, &id->local_addr, sizeof(s_laddr)); + s_raddr = (struct sockaddr_in *)&id->remote_addr; + + siw_dbg(id->device, + "id 0x%p: laddr %pI4:%d, raddr %pI4:%d\n", + id, &s_laddr.sin_addr, ntohs(s_laddr.sin_port), + &s_raddr->sin_addr, ntohs(s_raddr->sin_port)); + + for_ifa(in_dev) + { + if (ipv4_is_zeronet(s_laddr.sin_addr.s_addr) || + s_laddr.sin_addr.s_addr == ifa->ifa_address) { + s_laddr.sin_addr.s_addr = ifa->ifa_address; + + rv = siw_listen_address(id, backlog, + (struct sockaddr *)&s_laddr, + AF_INET); + if (!rv) + listeners++; + } + } + endfor_ifa(in_dev); + in_dev_put(in_dev); + } else if (id->local_addr.ss_family == AF_INET6) { + struct inet6_dev *in6_dev = in6_dev_get(dev); + struct inet6_ifaddr *ifp; + struct sockaddr_in6 *s_laddr = &to_sockaddr_in6(id->local_addr), + *s_raddr = &to_sockaddr_in6(id->remote_addr); + + siw_dbg(id->device, + "id 0x%p: laddr %pI6:%d, raddr %pI6:%d\n", + id, &s_laddr->sin6_addr, ntohs(s_laddr->sin6_port), + &s_raddr->sin6_addr, ntohs(s_raddr->sin6_port)); + + read_lock_bh(&in6_dev->lock); + list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { + struct sockaddr_in6 bind_addr; + + if (ipv6_addr_any(&s_laddr->sin6_addr) || + ipv6_addr_equal(&s_laddr->sin6_addr, &ifp->addr)) { + bind_addr.sin6_family = AF_INET6; + bind_addr.sin6_port = s_laddr->sin6_port; + bind_addr.sin6_flowinfo = 0; + bind_addr.sin6_addr = ifp->addr; + bind_addr.sin6_scope_id = dev->ifindex; + + rv = siw_listen_address(id, backlog, + (struct sockaddr *)&bind_addr, + AF_INET6); + if (!rv) + listeners++; + } + } + read_unlock_bh(&in6_dev->lock); + + in6_dev_put(in6_dev); + } else { + return -EAFNOSUPPORT; + } + if (listeners) + rv = 0; + else if (!rv) + rv = -EINVAL; + + siw_dbg(id->device, "id 0x%p: %s\n", id, rv ? "FAIL" : "OK"); + + return rv; +} + +int siw_destroy_listen(struct iw_cm_id *id) +{ + siw_dbg(id->device, "id 0x%p\n", id); + + if (!id->provider_data) { + siw_dbg(id->device, "id 0x%p: no cep(s)\n", id); + return 0; + } + siw_drop_listeners(id); + kfree(id->provider_data); + id->provider_data = NULL; + + return 0; +} + +int siw_cm_init(void) +{ + /* + * create_single_workqueue for strict ordering + */ + siw_cm_wq = create_singlethread_workqueue("siw_cm_wq"); + if (!siw_cm_wq) + return -ENOMEM; + + return 0; +} + +void siw_cm_exit(void) +{ + if (siw_cm_wq) { + flush_workqueue(siw_cm_wq); + destroy_workqueue(siw_cm_wq); + } +} diff --git a/drivers/infiniband/sw/siw/siw_cm.h b/drivers/infiniband/sw/siw/siw_cm.h new file mode 100644 index 000000000000..8c59cb3e2868 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_cm.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler */ +/* Greg Joyce */ +/* Copyright (c) 2008-2019, IBM Corporation */ +/* Copyright (c) 2017, Open Grid Computing, Inc. 
*/ + +#ifndef _SIW_CM_H +#define _SIW_CM_H + +#include +#include + +#include + +enum siw_cep_state { + SIW_EPSTATE_IDLE = 1, + SIW_EPSTATE_LISTENING, + SIW_EPSTATE_CONNECTING, + SIW_EPSTATE_AWAIT_MPAREQ, + SIW_EPSTATE_RECVD_MPAREQ, + SIW_EPSTATE_AWAIT_MPAREP, + SIW_EPSTATE_RDMA_MODE, + SIW_EPSTATE_CLOSED +}; + +struct siw_mpa_info { + struct mpa_rr hdr; /* peer mpa hdr in host byte order */ + struct mpa_v2_data v2_ctrl; + struct mpa_v2_data v2_ctrl_req; + char *pdata; + int bytes_rcvd; +}; + +struct siw_device; + +struct siw_cep { + struct iw_cm_id *cm_id; + struct siw_device *sdev; + struct list_head devq; + spinlock_t lock; + struct kref ref; + int in_use; + wait_queue_head_t waitq; + enum siw_cep_state state; + + struct list_head listenq; + struct siw_cep *listen_cep; + + struct siw_qp *qp; + struct socket *sock; + + struct siw_cm_work *mpa_timer; + struct list_head work_freelist; + + struct siw_mpa_info mpa; + int ord; + int ird; + bool enhanced_rdma_conn_est; + + /* Saved upcalls of socket */ + void (*sk_state_change)(struct sock *sk); + void (*sk_data_ready)(struct sock *sk); + void (*sk_write_space)(struct sock *sk); + void (*sk_error_report)(struct sock *sk); +}; + +/* + * Connection initiator waits 10 seconds to receive an + * MPA reply after sending out MPA request. Reponder waits for + * 5 seconds for MPA request to arrive if new TCP connection + * was set up. + */ +#define MPAREQ_TIMEOUT (HZ * 10) +#define MPAREP_TIMEOUT (HZ * 5) + +enum siw_work_type { + SIW_CM_WORK_ACCEPT = 1, + SIW_CM_WORK_READ_MPAHDR, + SIW_CM_WORK_CLOSE_LLP, /* close socket */ + SIW_CM_WORK_PEER_CLOSE, /* socket indicated peer close */ + SIW_CM_WORK_MPATIMEOUT +}; + +struct siw_cm_work { + struct delayed_work work; + struct list_head list; + enum siw_work_type type; + struct siw_cep *cep; +}; + +#define to_sockaddr_in(a) (*(struct sockaddr_in *)(&(a))) +#define to_sockaddr_in6(a) (*(struct sockaddr_in6 *)(&(a))) + +static inline int getname_peer(struct socket *s, struct sockaddr_storage *a) +{ + return s->ops->getname(s, (struct sockaddr *)a, 1); +} + +static inline int getname_local(struct socket *s, struct sockaddr_storage *a) +{ + return s->ops->getname(s, (struct sockaddr *)a, 0); +} + +static inline int ksock_recv(struct socket *sock, char *buf, size_t size, + int flags) +{ + struct kvec iov = { buf, size }; + struct msghdr msg = { .msg_name = NULL, .msg_flags = flags }; + + return kernel_recvmsg(sock, &msg, &iov, 1, size, flags); +} + +int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *parm); +int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *param); +int siw_reject(struct iw_cm_id *id, const void *data, u8 len); +int siw_create_listen(struct iw_cm_id *id, int backlog); +int siw_destroy_listen(struct iw_cm_id *id); + +void siw_cep_get(struct siw_cep *cep); +void siw_cep_put(struct siw_cep *cep); +int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type); + +int siw_cm_init(void); +void siw_cm_exit(void); + +/* + * TCP socket interface + */ +#define sk_to_qp(sk) (((struct siw_cep *)((sk)->sk_user_data))->qp) +#define sk_to_cep(sk) ((struct siw_cep *)((sk)->sk_user_data)) + +#endif -- cgit v1.2.3 From 303ae1cdfdf7280ff4cfbbe65563b5ff15bb025b Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:27 +0200 Subject: rdma/siw: application interface Broken up commit to add the Soft iWarp RDMA driver. 
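
This part of the series adds the verbs entry points (siw_query_device(), siw_create_qp(), siw_post_send() and friends below), which applications normally reach through libibverbs. A rough user-space sketch of that path; the queue depths are arbitrary and all error handling is omitted:

#include <infiniband/verbs.h>

int open_first_device(void)
{
    struct ibv_device **list = ibv_get_device_list(NULL);
    struct ibv_context *ctx = ibv_open_device(list[0]);
    struct ibv_pd *pd = ibv_alloc_pd(ctx);
    struct ibv_cq *cq = ibv_create_cq(ctx, 256, NULL, NULL, 0);
    struct ibv_qp_init_attr qpa = {
        .send_cq = cq, .recv_cq = cq,
        .cap = { .max_send_wr = 64, .max_recv_wr = 64,
                 .max_send_sge = 1, .max_recv_sge = 1 },
        .qp_type = IBV_QPT_RC,                 /* siw supports RC only */
    };
    struct ibv_qp *qp = ibv_create_qp(pd, &qpa);

    ibv_free_device_list(list);
    return qp ? 0 : -1;
}
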
Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_verbs.c | 1760 ++++++++++++++++++++++++++++++ drivers/infiniband/sw/siw/siw_verbs.h | 91 ++ include/uapi/rdma/rdma_user_ioctl_cmds.h | 1 + include/uapi/rdma/siw-abi.h | 185 ++++ 4 files changed, 2037 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_verbs.c create mode 100644 drivers/infiniband/sw/siw/siw_verbs.h create mode 100644 include/uapi/rdma/siw-abi.h (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c new file mode 100644 index 000000000000..32dc79d0e898 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -0,0 +1,1760 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "siw.h" +#include "siw_verbs.h" +#include "siw_mem.h" + +static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = SIW_QP_STATE_IDLE, + [IB_QPS_INIT] = SIW_QP_STATE_IDLE, + [IB_QPS_RTR] = SIW_QP_STATE_RTR, + [IB_QPS_RTS] = SIW_QP_STATE_RTS, + [IB_QPS_SQD] = SIW_QP_STATE_CLOSING, + [IB_QPS_SQE] = SIW_QP_STATE_TERMINATE, + [IB_QPS_ERR] = SIW_QP_STATE_ERROR +}; + +static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = { + [IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR", + [IB_QPS_RTS] = "RTS", [IB_QPS_SQD] = "SQD", [IB_QPS_SQE] = "SQE", + [IB_QPS_ERR] = "ERR" +}; + +static u32 siw_create_uobj(struct siw_ucontext *uctx, void *vaddr, u32 size) +{ + struct siw_uobj *uobj; + struct xa_limit limit = XA_LIMIT(0, SIW_UOBJ_MAX_KEY); + u32 key; + + uobj = kzalloc(sizeof(*uobj), GFP_KERNEL); + if (!uobj) + return SIW_INVAL_UOBJ_KEY; + + if (xa_alloc_cyclic(&uctx->xa, &key, uobj, limit, &uctx->uobj_nextkey, + GFP_KERNEL) < 0) { + kfree(uobj); + return SIW_INVAL_UOBJ_KEY; + } + uobj->size = PAGE_ALIGN(size); + uobj->addr = vaddr; + + return key; +} + +static struct siw_uobj *siw_get_uobj(struct siw_ucontext *uctx, + unsigned long off, u32 size) +{ + struct siw_uobj *uobj = xa_load(&uctx->xa, off); + + if (uobj && uobj->size == size) + return uobj; + + return NULL; +} + +int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma) +{ + struct siw_ucontext *uctx = to_siw_ctx(ctx); + struct siw_uobj *uobj; + unsigned long off = vma->vm_pgoff; + int size = vma->vm_end - vma->vm_start; + int rv = -EINVAL; + + /* + * Must be page aligned + */ + if (vma->vm_start & (PAGE_SIZE - 1)) { + pr_warn("siw: mmap not page aligned\n"); + goto out; + } + uobj = siw_get_uobj(uctx, off, size); + if (!uobj) { + siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %u\n", + off, size); + goto out; + } + rv = remap_vmalloc_range(vma, uobj->addr, 0); + if (rv) + pr_warn("remap_vmalloc_range failed: %lu, %u\n", off, size); +out: + return rv; +} + +int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(base_ctx->device); + struct siw_ucontext *ctx = to_siw_ctx(base_ctx); + struct siw_uresp_alloc_ctx uresp = {}; + int rv; + + if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) { + rv = -ENOMEM; + goto err_out; + } + xa_init_flags(&ctx->xa, XA_FLAGS_ALLOC); + ctx->uobj_nextkey = 0; + ctx->sdev = sdev; + + uresp.dev_id = sdev->vendor_part_id; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_to_udata(udata, 
&uresp, sizeof(uresp)); + if (rv) + goto err_out; + + siw_dbg(base_ctx->device, "success. now %d context(s)\n", + atomic_read(&sdev->num_ctx)); + + return 0; + +err_out: + atomic_dec(&sdev->num_ctx); + siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv, + atomic_read(&sdev->num_ctx)); + + return rv; +} + +void siw_dealloc_ucontext(struct ib_ucontext *base_ctx) +{ + struct siw_ucontext *uctx = to_siw_ctx(base_ctx); + void *entry; + unsigned long index; + + /* + * Make sure all user mmap objects are gone. Since QP, CQ + * and SRQ destroy routines destroy related objects, nothing + * should be found here. + */ + xa_for_each(&uctx->xa, index, entry) { + kfree(xa_erase(&uctx->xa, index)); + pr_warn("siw: dropping orphaned uobj at %lu\n", index); + } + xa_destroy(&uctx->xa); + atomic_dec(&uctx->sdev->num_ctx); +} + +int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, + struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(base_dev); + + if (udata->inlen || udata->outlen) + return -EINVAL; + + memset(attr, 0, sizeof(*attr)); + + /* Revisit atomic caps if RFC 7306 gets supported */ + attr->atomic_cap = 0; + attr->device_cap_flags = + IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_ALLOW_USER_UNREG; + attr->max_cq = sdev->attrs.max_cq; + attr->max_cqe = sdev->attrs.max_cqe; + attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL; + attr->max_fmr = sdev->attrs.max_fmr; + attr->max_mr = sdev->attrs.max_mr; + attr->max_mw = sdev->attrs.max_mw; + attr->max_mr_size = ~0ull; + attr->max_pd = sdev->attrs.max_pd; + attr->max_qp = sdev->attrs.max_qp; + attr->max_qp_init_rd_atom = sdev->attrs.max_ird; + attr->max_qp_rd_atom = sdev->attrs.max_ord; + attr->max_qp_wr = sdev->attrs.max_qp_wr; + attr->max_recv_sge = sdev->attrs.max_sge; + attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird; + attr->max_send_sge = sdev->attrs.max_sge; + attr->max_sge_rd = sdev->attrs.max_sge_rd; + attr->max_srq = sdev->attrs.max_srq; + attr->max_srq_sge = sdev->attrs.max_srq_sge; + attr->max_srq_wr = sdev->attrs.max_srq_wr; + attr->page_size_cap = PAGE_SIZE; + attr->vendor_id = SIW_VENDOR_ID; + attr->vendor_part_id = sdev->vendor_part_id; + + memcpy(&attr->sys_image_guid, sdev->netdev->dev_addr, 6); + + return 0; +} + +int siw_query_port(struct ib_device *base_dev, u8 port, + struct ib_port_attr *attr) +{ + struct siw_device *sdev = to_siw_dev(base_dev); + + memset(attr, 0, sizeof(*attr)); + + attr->active_mtu = attr->max_mtu; + attr->active_speed = 2; + attr->active_width = 2; + attr->gid_tbl_len = 1; + attr->max_msg_sz = -1; + attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); + attr->phys_state = sdev->state == IB_PORT_ACTIVE ? 
5 : 3; + attr->pkey_tbl_len = 1; + attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; + attr->state = sdev->state; + /* + * All zero + * + * attr->lid = 0; + * attr->bad_pkey_cntr = 0; + * attr->qkey_viol_cntr = 0; + * attr->sm_lid = 0; + * attr->lmc = 0; + * attr->max_vl_num = 0; + * attr->sm_sl = 0; + * attr->subnet_timeout = 0; + * attr->init_type_repy = 0; + */ + return 0; +} + +int siw_get_port_immutable(struct ib_device *base_dev, u8 port, + struct ib_port_immutable *port_immutable) +{ + struct ib_port_attr attr; + int rv = siw_query_port(base_dev, port, &attr); + + if (rv) + return rv; + + port_immutable->pkey_tbl_len = attr.pkey_tbl_len; + port_immutable->gid_tbl_len = attr.gid_tbl_len; + port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + + return 0; +} + +int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey) +{ + /* Report the default pkey */ + *pkey = 0xffff; + return 0; +} + +int siw_query_gid(struct ib_device *base_dev, u8 port, int idx, + union ib_gid *gid) +{ + struct siw_device *sdev = to_siw_dev(base_dev); + + /* subnet_prefix == interface_id == 0; */ + memset(gid, 0, sizeof(*gid)); + memcpy(&gid->raw[0], sdev->netdev->dev_addr, 6); + + return 0; +} + +int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + + if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) { + atomic_dec(&sdev->num_pd); + return -ENOMEM; + } + siw_dbg_pd(pd, "now %d PD's(s)\n", atomic_read(&sdev->num_pd)); + + return 0; +} + +void siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + + siw_dbg_pd(pd, "free PD\n"); + atomic_dec(&sdev->num_pd); +} + +void siw_qp_get_ref(struct ib_qp *base_qp) +{ + siw_qp_get(to_siw_qp(base_qp)); +} + +void siw_qp_put_ref(struct ib_qp *base_qp) +{ + siw_qp_put(to_siw_qp(base_qp)); +} + +/* + * siw_create_qp() + * + * Create QP of requested size on given device. + * + * @pd: Protection Domain + * @attrs: Initial QP attributes. + * @udata: used to provide QP ID, SQ and RQ size back to user. 
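
The keys handed back through udata by siw_create_qp() and siw_create_cq() are xarray indices shifted by PAGE_SHIFT, so a user-space provider can feed them straight back as the mmap() offset and siw_mmap() will locate the matching vmalloc'ed queue. A simplified sketch of the library side; cmd_fd and the response layout are assumptions here, the rdma-core siw provider is the authority:

#include <stdint.h>
#include <sys/mman.h>

/*
 * key: value returned by the kernel in the create response,
 * i.e. (xa index << PAGE_SHIFT); nbytes: size of the shared queue.
 */
static void *map_queue(int cmd_fd, uint64_t key, size_t nbytes)
{
    void *q = mmap(NULL, nbytes, PROT_READ | PROT_WRITE, MAP_SHARED,
                   cmd_fd, (off_t)key);

    return q == MAP_FAILED ? NULL : q;
}
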
+ */ + +struct ib_qp *siw_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct siw_qp *qp = NULL; + struct siw_base_qp *siw_base_qp = NULL; + struct ib_device *base_dev = pd->device; + struct siw_device *sdev = to_siw_dev(base_dev); + struct siw_ucontext *uctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + struct siw_cq *scq = NULL, *rcq = NULL; + unsigned long flags; + int num_sqe, num_rqe, rv = 0; + + siw_dbg(base_dev, "create new QP\n"); + + if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) { + siw_dbg(base_dev, "too many QP's\n"); + rv = -ENOMEM; + goto err_out; + } + if (attrs->qp_type != IB_QPT_RC) { + siw_dbg(base_dev, "only RC QP's supported\n"); + rv = -EINVAL; + goto err_out; + } + if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) || + (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) || + (attrs->cap.max_send_sge > SIW_MAX_SGE) || + (attrs->cap.max_recv_sge > SIW_MAX_SGE)) { + siw_dbg(base_dev, "QP size error\n"); + rv = -EINVAL; + goto err_out; + } + if (attrs->cap.max_inline_data > SIW_MAX_INLINE) { + siw_dbg(base_dev, "max inline send: %d > %d\n", + attrs->cap.max_inline_data, (int)SIW_MAX_INLINE); + rv = -EINVAL; + goto err_out; + } + /* + * NOTE: we allow for zero element SQ and RQ WQE's SGL's + * but not for a QP unable to hold any WQE (SQ + RQ) + */ + if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) { + siw_dbg(base_dev, "QP must have send or receive queue\n"); + rv = -EINVAL; + goto err_out; + } + scq = to_siw_cq(attrs->send_cq); + rcq = to_siw_cq(attrs->recv_cq); + + if (!scq || (!rcq && !attrs->srq)) { + siw_dbg(base_dev, "send CQ or receive CQ invalid\n"); + rv = -EINVAL; + goto err_out; + } + siw_base_qp = kzalloc(sizeof(*siw_base_qp), GFP_KERNEL); + if (!siw_base_qp) { + rv = -ENOMEM; + goto err_out; + } + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + rv = -ENOMEM; + goto err_out; + } + siw_base_qp->qp = qp; + qp->ib_qp = &siw_base_qp->base_qp; + + init_rwsem(&qp->state_lock); + spin_lock_init(&qp->sq_lock); + spin_lock_init(&qp->rq_lock); + spin_lock_init(&qp->orq_lock); + + qp->kernel_verbs = !udata; + qp->xa_sq_index = SIW_INVAL_UOBJ_KEY; + qp->xa_rq_index = SIW_INVAL_UOBJ_KEY; + + rv = siw_qp_add(sdev, qp); + if (rv) + goto err_out; + + /* All queue indices are derived from modulo operations + * on a free running 'get' (consumer) and 'put' (producer) + * unsigned counter. Having queue sizes at power of two + * avoids handling counter wrap around. + */ + num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr); + num_rqe = roundup_pow_of_two(attrs->cap.max_recv_wr); + + if (qp->kernel_verbs) + qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe)); + else + qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe)); + + if (qp->sendq == NULL) { + siw_dbg(base_dev, "SQ size %d alloc failed\n", num_sqe); + rv = -ENOMEM; + goto err_out_xa; + } + if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) { + if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->attrs.flags |= SIW_SIGNAL_ALL_WR; + else { + rv = -EINVAL; + goto err_out_xa; + } + } + qp->pd = pd; + qp->scq = scq; + qp->rcq = rcq; + + if (attrs->srq) { + /* + * SRQ support. 
+ * Verbs 6.3.7: ignore RQ size, if SRQ present + * Verbs 6.3.5: do not check PD of SRQ against PD of QP + */ + qp->srq = to_siw_srq(attrs->srq); + qp->attrs.rq_size = 0; + siw_dbg(base_dev, "QP [%u]: [SRQ 0x%p] attached\n", + qp->qp_num, qp->srq); + } else if (num_rqe) { + if (qp->kernel_verbs) + qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe)); + else + qp->recvq = + vmalloc_user(num_rqe * sizeof(struct siw_rqe)); + + if (qp->recvq == NULL) { + siw_dbg(base_dev, "RQ size %d alloc failed\n", num_rqe); + rv = -ENOMEM; + goto err_out_xa; + } + qp->attrs.rq_size = num_rqe; + } + qp->attrs.sq_size = num_sqe; + qp->attrs.sq_max_sges = attrs->cap.max_send_sge; + qp->attrs.rq_max_sges = attrs->cap.max_recv_sge; + + /* Make those two tunables fixed for now. */ + qp->tx_ctx.gso_seg_limit = 1; + qp->tx_ctx.zcopy_tx = zcopy_tx; + + qp->attrs.state = SIW_QP_STATE_IDLE; + + if (udata) { + struct siw_uresp_create_qp uresp = {}; + + uresp.num_sqe = num_sqe; + uresp.num_rqe = num_rqe; + uresp.qp_id = qp_id(qp); + + if (qp->sendq) { + qp->xa_sq_index = + siw_create_uobj(uctx, qp->sendq, + num_sqe * sizeof(struct siw_sqe)); + } + if (qp->recvq) { + qp->xa_rq_index = + siw_create_uobj(uctx, qp->recvq, + num_rqe * sizeof(struct siw_rqe)); + } + if (qp->xa_sq_index == SIW_INVAL_UOBJ_KEY || + qp->xa_rq_index == SIW_INVAL_UOBJ_KEY) { + rv = -ENOMEM; + goto err_out_xa; + } + uresp.sq_key = qp->xa_sq_index << PAGE_SHIFT; + uresp.rq_key = qp->xa_rq_index << PAGE_SHIFT; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out_xa; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto err_out_xa; + } + qp->tx_cpu = siw_get_tx_cpu(sdev); + if (qp->tx_cpu < 0) { + rv = -EINVAL; + goto err_out_xa; + } + INIT_LIST_HEAD(&qp->devq); + spin_lock_irqsave(&sdev->lock, flags); + list_add_tail(&qp->devq, &sdev->qp_list); + spin_unlock_irqrestore(&sdev->lock, flags); + + return qp->ib_qp; + +err_out_xa: + xa_erase(&sdev->qp_xa, qp_id(qp)); +err_out: + kfree(siw_base_qp); + + if (qp) { + if (qp->xa_sq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&uctx->xa, qp->xa_sq_index)); + if (qp->xa_rq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&uctx->xa, qp->xa_rq_index)); + + vfree(qp->sendq); + vfree(qp->recvq); + kfree(qp); + } + atomic_dec(&sdev->num_qp); + + return ERR_PTR(rv); +} + +/* + * Minimum siw_query_qp() verb interface. 
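
As the comment in siw_create_qp() above says, queue slots come from free-running 32-bit put/get counters reduced modulo a power-of-two queue size, which stays continuous even across counter wrap. A tiny standalone check of that property (with a non-power-of-two size such as 6 the index would jump at the wrap, because 2^32 is not a multiple of 6):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    const uint32_t size = 8;            /* roundup_pow_of_two(...) */
    uint32_t put = UINT32_MAX - 2;      /* producer counter about to wrap */

    for (int i = 0; i < 6; i++, put++) {
        uint32_t idx = put % size;      /* same as put & (size - 1) */

        assert(idx == (put & (size - 1)));
        assert(idx < size);             /* walks 5,6,7,0,1,2 across wrap */
    }
    return 0;
}
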
+ * + * @qp_attr_mask is not used but all available information is provided + */ +int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct siw_qp *qp; + struct siw_device *sdev; + + if (base_qp && qp_attr && qp_init_attr) { + qp = to_siw_qp(base_qp); + sdev = to_siw_dev(base_qp->device); + } else { + return -EINVAL; + } + qp_attr->cap.max_inline_data = SIW_MAX_INLINE; + qp_attr->cap.max_send_wr = qp->attrs.sq_size; + qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges; + qp_attr->cap.max_recv_wr = qp->attrs.rq_size; + qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges; + qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); + qp_attr->max_rd_atomic = qp->attrs.irq_size; + qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; + + qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ; + + qp_init_attr->qp_type = base_qp->qp_type; + qp_init_attr->send_cq = base_qp->send_cq; + qp_init_attr->recv_cq = base_qp->recv_cq; + qp_init_attr->srq = base_qp->srq; + + qp_init_attr->cap = qp_attr->cap; + + return 0; +} + +int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct siw_qp_attrs new_attrs; + enum siw_qp_attr_mask siw_attr_mask = 0; + struct siw_qp *qp = to_siw_qp(base_qp); + int rv = 0; + + if (!attr_mask) + return 0; + + memset(&new_attrs, 0, sizeof(new_attrs)); + + if (attr_mask & IB_QP_ACCESS_FLAGS) { + siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS; + + if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) + new_attrs.flags |= SIW_RDMA_READ_ENABLED; + if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) + new_attrs.flags |= SIW_RDMA_WRITE_ENABLED; + if (attr->qp_access_flags & IB_ACCESS_MW_BIND) + new_attrs.flags |= SIW_RDMA_BIND_ENABLED; + } + if (attr_mask & IB_QP_STATE) { + siw_dbg_qp(qp, "desired IB QP state: %s\n", + ib_qp_state_to_string[attr->qp_state]); + + new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state]; + + if (new_attrs.state > SIW_QP_STATE_RTS) + qp->tx_ctx.tx_suspend = 1; + + siw_attr_mask |= SIW_QP_ATTR_STATE; + } + if (!siw_attr_mask) + goto out; + + down_write(&qp->state_lock); + + rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask); + + up_write(&qp->state_lock); +out: + return rv; +} + +int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata) +{ + struct siw_qp *qp = to_siw_qp(base_qp); + struct siw_base_qp *siw_base_qp = to_siw_base_qp(base_qp); + struct siw_ucontext *uctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + struct siw_qp_attrs qp_attrs; + + siw_dbg_qp(qp, "state %d, cep 0x%p\n", qp->attrs.state, qp->cep); + + /* + * Mark QP as in process of destruction to prevent from + * any async callbacks to RDMA core + */ + qp->attrs.flags |= SIW_QP_IN_DESTROY; + qp->rx_stream.rx_suspend = 1; + + if (uctx && qp->xa_sq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&uctx->xa, qp->xa_sq_index)); + if (uctx && qp->xa_rq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&uctx->xa, qp->xa_rq_index)); + + down_write(&qp->state_lock); + + qp_attrs.state = SIW_QP_STATE_ERROR; + siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE); + + if (qp->cep) { + siw_cep_put(qp->cep); + qp->cep = NULL; + } + up_write(&qp->state_lock); + + kfree(qp->tx_ctx.mpa_crc_hd); + kfree(qp->rx_stream.mpa_crc_hd); + + qp->scq = qp->rcq = NULL; + + siw_qp_put(qp); + kfree(siw_base_qp); + + return 0; +} + +/* + * siw_copy_inline_sgl() + * + * Prepare sgl of inlined data for sending. 
For userland callers + * function checks if given buffer addresses and len's are within + * process context bounds. + * Data from all provided sge's are copied together into the wqe, + * referenced by a single sge. + */ +static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr, + struct siw_sqe *sqe) +{ + struct ib_sge *core_sge = core_wr->sg_list; + void *kbuf = &sqe->sge[1]; + int num_sge = core_wr->num_sge, bytes = 0; + + sqe->sge[0].laddr = (u64)kbuf; + sqe->sge[0].lkey = 0; + + while (num_sge--) { + if (!core_sge->length) { + core_sge++; + continue; + } + bytes += core_sge->length; + if (bytes > SIW_MAX_INLINE) { + bytes = -EINVAL; + break; + } + memcpy(kbuf, (void *)(uintptr_t)core_sge->addr, + core_sge->length); + + kbuf += core_sge->length; + core_sge++; + } + sqe->sge[0].length = bytes > 0 ? bytes : 0; + sqe->num_sge = bytes > 0 ? 1 : 0; + + return bytes; +} + +/* + * siw_post_send() + * + * Post a list of S-WR's to a SQ. + * + * @base_qp: Base QP contained in siw QP + * @wr: Null terminated list of user WR's + * @bad_wr: Points to failing WR in case of synchronous failure. + */ +int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + struct siw_qp *qp = to_siw_qp(base_qp); + struct siw_wqe *wqe = tx_wqe(qp); + + unsigned long flags; + int rv = 0; + + /* + * Try to acquire QP state lock. Must be non-blocking + * to accommodate kernel clients needs. + */ + if (!down_read_trylock(&qp->state_lock)) { + *bad_wr = wr; + siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state); + return -ENOTCONN; + } + if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) { + up_read(&qp->state_lock); + *bad_wr = wr; + siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state); + return -ENOTCONN; + } + if (wr && !qp->kernel_verbs) { + siw_dbg_qp(qp, "wr must be empty for user mapped sq\n"); + up_read(&qp->state_lock); + *bad_wr = wr; + return -EINVAL; + } + spin_lock_irqsave(&qp->sq_lock, flags); + + while (wr) { + u32 idx = qp->sq_put % qp->attrs.sq_size; + struct siw_sqe *sqe = &qp->sendq[idx]; + + if (sqe->flags) { + siw_dbg_qp(qp, "sq full\n"); + rv = -ENOMEM; + break; + } + if (wr->num_sge > qp->attrs.sq_max_sges) { + siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); + rv = -EINVAL; + break; + } + sqe->id = wr->wr_id; + + if ((wr->send_flags & IB_SEND_SIGNALED) || + (qp->attrs.flags & SIW_SIGNAL_ALL_WR)) + sqe->flags |= SIW_WQE_SIGNALLED; + + if (wr->send_flags & IB_SEND_FENCE) + sqe->flags |= SIW_WQE_READ_FENCE; + + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_INV: + if (wr->send_flags & IB_SEND_SOLICITED) + sqe->flags |= SIW_WQE_SOLICITED; + + if (!(wr->send_flags & IB_SEND_INLINE)) { + siw_copy_sgl(wr->sg_list, sqe->sge, + wr->num_sge); + sqe->num_sge = wr->num_sge; + } else { + rv = siw_copy_inline_sgl(wr, sqe); + if (rv <= 0) { + rv = -EINVAL; + break; + } + sqe->flags |= SIW_WQE_INLINE; + sqe->num_sge = 1; + } + if (wr->opcode == IB_WR_SEND) + sqe->opcode = SIW_OP_SEND; + else { + sqe->opcode = SIW_OP_SEND_REMOTE_INV; + sqe->rkey = wr->ex.invalidate_rkey; + } + break; + + case IB_WR_RDMA_READ_WITH_INV: + case IB_WR_RDMA_READ: + /* + * iWarp restricts RREAD sink to SGL containing + * 1 SGE only. we could relax to SGL with multiple + * elements referring the SAME ltag or even sending + * a private per-rreq tag referring to a checked + * local sgl with MULTIPLE ltag's. 
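
siw_copy_inline_sgl() above gathers every SGE of an inline send into one contiguous buffer inside the SQE and fails the WR once the total would exceed SIW_MAX_INLINE. The same gather-with-cap logic as a standalone helper; the sge layout and the 256-byte cap are illustrative only:

#include <stdint.h>
#include <string.h>

#define INLINE_CAP 256          /* stands in for SIW_MAX_INLINE */

struct xsge {
    void    *addr;
    uint32_t length;
};

/* Copy all SGEs back to back into buf; return bytes or -1 if too big. */
static int gather_inline(char *buf, const struct xsge *sge, int num_sge)
{
    int bytes = 0;

    while (num_sge--) {
        if (!sge->length) {
            sge++;
            continue;
        }
        if (bytes + (int)sge->length > INLINE_CAP)
            return -1;          /* caller fails the WR with -EINVAL */
        memcpy(buf + bytes, sge->addr, sge->length);
        bytes += sge->length;
        sge++;
    }
    return bytes;
}
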
+ */ + if (unlikely(wr->num_sge != 1)) { + rv = -EINVAL; + break; + } + siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1); + /* + * NOTE: zero length RREAD is allowed! + */ + sqe->raddr = rdma_wr(wr)->remote_addr; + sqe->rkey = rdma_wr(wr)->rkey; + sqe->num_sge = 1; + + if (wr->opcode == IB_WR_RDMA_READ) + sqe->opcode = SIW_OP_READ; + else + sqe->opcode = SIW_OP_READ_LOCAL_INV; + break; + + case IB_WR_RDMA_WRITE: + if (!(wr->send_flags & IB_SEND_INLINE)) { + siw_copy_sgl(wr->sg_list, &sqe->sge[0], + wr->num_sge); + sqe->num_sge = wr->num_sge; + } else { + rv = siw_copy_inline_sgl(wr, sqe); + if (unlikely(rv < 0)) { + rv = -EINVAL; + break; + } + sqe->flags |= SIW_WQE_INLINE; + sqe->num_sge = 1; + } + sqe->raddr = rdma_wr(wr)->remote_addr; + sqe->rkey = rdma_wr(wr)->rkey; + sqe->opcode = SIW_OP_WRITE; + break; + + case IB_WR_REG_MR: + sqe->base_mr = (uint64_t)reg_wr(wr)->mr; + sqe->rkey = reg_wr(wr)->key; + sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK; + sqe->opcode = SIW_OP_REG_MR; + break; + + case IB_WR_LOCAL_INV: + sqe->rkey = wr->ex.invalidate_rkey; + sqe->opcode = SIW_OP_INVAL_STAG; + break; + + default: + siw_dbg_qp(qp, "ib wr type %d unsupported\n", + wr->opcode); + rv = -EINVAL; + break; + } + siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%p\n", + sqe->opcode, sqe->flags, (void *)sqe->id); + + if (unlikely(rv < 0)) + break; + + /* make SQE only valid after completely written */ + smp_wmb(); + sqe->flags |= SIW_WQE_VALID; + + qp->sq_put++; + wr = wr->next; + } + + /* + * Send directly if SQ processing is not in progress. + * Eventual immediate errors (rv < 0) do not affect the involved + * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ + * processing, if new work is already pending. But rv must be passed + * to caller. + */ + if (wqe->wr_status != SIW_WR_IDLE) { + spin_unlock_irqrestore(&qp->sq_lock, flags); + goto skip_direct_sending; + } + rv = siw_activate_tx(qp); + spin_unlock_irqrestore(&qp->sq_lock, flags); + + if (rv <= 0) + goto skip_direct_sending; + + if (qp->kernel_verbs) { + rv = siw_sq_start(qp); + } else { + qp->tx_ctx.in_syscall = 1; + + if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend)) + siw_qp_cm_drop(qp, 0); + + qp->tx_ctx.in_syscall = 0; + } +skip_direct_sending: + + up_read(&qp->state_lock); + + if (rv >= 0) + return 0; + /* + * Immediate error + */ + siw_dbg_qp(qp, "error %d\n", rv); + + *bad_wr = wr; + return rv; +} + +/* + * siw_post_receive() + * + * Post a list of R-WR's to a RQ. + * + * @base_qp: Base QP contained in siw QP + * @wr: Null terminated list of user WR's + * @bad_wr: Points to failing WR in case of synchronous failure. + */ +int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct siw_qp *qp = to_siw_qp(base_qp); + unsigned long flags; + int rv = 0; + + if (qp->srq) { + *bad_wr = wr; + return -EOPNOTSUPP; /* what else from errno.h? */ + } + /* + * Try to acquire QP state lock. Must be non-blocking + * to accommodate kernel clients needs. + */ + if (!down_read_trylock(&qp->state_lock)) { + *bad_wr = wr; + return -ENOTCONN; + } + if (!qp->kernel_verbs) { + siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n"); + up_read(&qp->state_lock); + *bad_wr = wr; + return -EINVAL; + } + if (qp->attrs.state > SIW_QP_STATE_RTS) { + up_read(&qp->state_lock); + *bad_wr = wr; + return -EINVAL; + } + /* + * Serialize potentially multiple producers. + * Not needed for single threaded consumer side. 
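
Both post paths write all WQE fields first and only then set SIW_WQE_VALID after an smp_wmb(), so the consumer never sees a half-initialized entry. The equivalent publish step in portable user-space C uses a release/acquire pair, along these lines (the rqe layout here is a simplified stand-in):

#include <stdatomic.h>
#include <stdint.h>

struct xrqe {
    uint64_t id;
    uint32_t num_sge;
    _Atomic uint32_t flags;     /* 0 == free, 1 == valid */
};

static void publish_rqe(struct xrqe *rqe, uint64_t wr_id, uint32_t num_sge)
{
    rqe->id = wr_id;            /* plain writes... */
    rqe->num_sge = num_sge;
    /* ...made visible before the flag, like smp_wmb() + flag store */
    atomic_store_explicit(&rqe->flags, 1, memory_order_release);
}

static int rqe_ready(const struct xrqe *rqe)
{
    /* consumer pairs with an acquire load before reading id/num_sge */
    return atomic_load_explicit(&rqe->flags, memory_order_acquire) == 1;
}
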
+ */ + spin_lock_irqsave(&qp->rq_lock, flags); + + while (wr) { + u32 idx = qp->rq_put % qp->attrs.rq_size; + struct siw_rqe *rqe = &qp->recvq[idx]; + + if (rqe->flags) { + siw_dbg_qp(qp, "RQ full\n"); + rv = -ENOMEM; + break; + } + if (wr->num_sge > qp->attrs.rq_max_sges) { + siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); + rv = -EINVAL; + break; + } + rqe->id = wr->wr_id; + rqe->num_sge = wr->num_sge; + siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); + + /* make sure RQE is completely written before valid */ + smp_wmb(); + + rqe->flags = SIW_WQE_VALID; + + qp->rq_put++; + wr = wr->next; + } + spin_unlock_irqrestore(&qp->rq_lock, flags); + + up_read(&qp->state_lock); + + if (rv < 0) { + siw_dbg_qp(qp, "error %d\n", rv); + *bad_wr = wr; + } + return rv > 0 ? 0 : rv; +} + +void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata) +{ + struct siw_cq *cq = to_siw_cq(base_cq); + struct siw_device *sdev = to_siw_dev(base_cq->device); + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + + siw_dbg_cq(cq, "free CQ resources\n"); + + siw_cq_flush(cq); + + if (ctx && cq->xa_cq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&ctx->xa, cq->xa_cq_index)); + + atomic_dec(&sdev->num_cq); + + vfree(cq->queue); +} + +/* + * siw_create_cq() + * + * Populate CQ of requested size + * + * @base_cq: CQ as allocated by RDMA midlayer + * @attr: Initial CQ attributes + * @udata: relates to user context + */ + +int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(base_cq->device); + struct siw_cq *cq = to_siw_cq(base_cq); + int rv, size = attr->cqe; + + if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) { + siw_dbg(base_cq->device, "too many CQ's\n"); + rv = -ENOMEM; + goto err_out; + } + if (size < 1 || size > sdev->attrs.max_cqe) { + siw_dbg(base_cq->device, "CQ size error: %d\n", size); + rv = -EINVAL; + goto err_out; + } + size = roundup_pow_of_two(size); + cq->base_cq.cqe = size; + cq->num_cqe = size; + cq->xa_cq_index = SIW_INVAL_UOBJ_KEY; + + if (!udata) { + cq->kernel_verbs = 1; + cq->queue = vzalloc(size * sizeof(struct siw_cqe) + + sizeof(struct siw_cq_ctrl)); + } else { + cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) + + sizeof(struct siw_cq_ctrl)); + } + if (cq->queue == NULL) { + rv = -ENOMEM; + goto err_out; + } + get_random_bytes(&cq->id, 4); + siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id); + + spin_lock_init(&cq->lock); + + cq->notify = &((struct siw_cq_ctrl *)&cq->queue[size])->notify; + + if (udata) { + struct siw_uresp_create_cq uresp = {}; + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + + cq->xa_cq_index = + siw_create_uobj(ctx, cq->queue, + size * sizeof(struct siw_cqe) + + sizeof(struct siw_cq_ctrl)); + if (cq->xa_cq_index == SIW_INVAL_UOBJ_KEY) { + rv = -ENOMEM; + goto err_out; + } + uresp.cq_key = cq->xa_cq_index << PAGE_SHIFT; + uresp.cq_id = cq->id; + uresp.num_cqe = size; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto err_out; + } + return 0; + +err_out: + siw_dbg(base_cq->device, "CQ creation failed: %d", rv); + + if (cq && cq->queue) { + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + if (cq->xa_cq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&ctx->xa, cq->xa_cq_index)); + vfree(cq->queue); + } + 
atomic_dec(&sdev->num_cq); + + return rv; +} + +/* + * siw_poll_cq() + * + * Reap CQ entries if available and copy work completion status into + * array of WC's provided by caller. Returns number of reaped CQE's. + * + * @base_cq: Base CQ contained in siw CQ. + * @num_cqe: Maximum number of CQE's to reap. + * @wc: Array of work completions to be filled by siw. + */ +int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc) +{ + struct siw_cq *cq = to_siw_cq(base_cq); + int i; + + for (i = 0; i < num_cqe; i++) { + if (!siw_reap_cqe(cq, wc)) + break; + wc++; + } + return i; +} + +/* + * siw_req_notify_cq() + * + * Request notification for new CQE's added to that CQ. + * Defined flags: + * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification + * event if a WQE with notification flag set enters the CQ + * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification + * event if a WQE enters the CQ. + * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the + * number of not reaped CQE's regardless of its notification + * type and current or new CQ notification settings. + * + * @base_cq: Base CQ contained in siw CQ. + * @flags: Requested notification flags. + */ +int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags) +{ + struct siw_cq *cq = to_siw_cq(base_cq); + + siw_dbg_cq(cq, "flags: 0x%02x\n", flags); + + if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) + /* CQ event for next solicited completion */ + smp_store_mb(*cq->notify, SIW_NOTIFY_SOLICITED); + else + /* CQ event for any signalled completion */ + smp_store_mb(*cq->notify, SIW_NOTIFY_ALL); + + if (flags & IB_CQ_REPORT_MISSED_EVENTS) + return cq->cq_put - cq->cq_get; + + return 0; +} + +/* + * siw_dereg_mr() + * + * Release Memory Region. + * + * @base_mr: Base MR contained in siw MR. + * @udata: points to user context, unused. + */ +int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata) +{ + struct siw_mr *mr = to_siw_mr(base_mr); + struct siw_device *sdev = to_siw_dev(base_mr->device); + + siw_dbg_mem(mr->mem, "deregister MR\n"); + + atomic_dec(&sdev->num_mr); + + siw_mr_drop_mem(mr); + kfree_rcu(mr, rcu); + + return 0; +} + +/* + * siw_reg_user_mr() + * + * Register Memory Region. + * + * @pd: Protection Domain + * @start: starting address of MR (virtual address) + * @len: len of MR + * @rnic_va: not used by siw + * @rights: MR access rights + * @udata: user buffer to communicate STag and Key. 
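From user space, siw_reg_user_mr() is reached through ordinary memory registration. For illustration (outside this patch), a minimal libibverbs sketch; buffer size and access rights are arbitrary:

    #include <stdlib.h>
    #include <infiniband/verbs.h>

    /* Register a freshly allocated buffer; the access bits end up as the
     * 'rights' argument of the driver's user MR registration hook. */
    static struct ibv_mr *register_buffer(struct ibv_pd *pd, size_t len)
    {
            void *buf = malloc(len);

            if (!buf)
                    return NULL;

            return ibv_reg_mr(pd, buf, len,
                              IBV_ACCESS_LOCAL_WRITE |
                              IBV_ACCESS_REMOTE_READ |
                              IBV_ACCESS_REMOTE_WRITE);
    }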
+ */ +struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, + u64 rnic_va, int rights, struct ib_udata *udata) +{ + struct siw_mr *mr = NULL; + struct siw_umem *umem = NULL; + struct siw_ureq_reg_mr ureq; + struct siw_device *sdev = to_siw_dev(pd->device); + + unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK); + int rv; + + siw_dbg_pd(pd, "start: 0x%016llx, va: 0x%016llx, len: %llu\n", + (unsigned long long)start, (unsigned long long)rnic_va, + (unsigned long long)len); + + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { + siw_dbg_pd(pd, "too many mr's\n"); + rv = -ENOMEM; + goto err_out; + } + if (!len) { + rv = -EINVAL; + goto err_out; + } + if (mem_limit != RLIM_INFINITY) { + unsigned long num_pages = + (PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT; + mem_limit >>= PAGE_SHIFT; + + if (num_pages > mem_limit - current->mm->locked_vm) { + siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n", + num_pages, mem_limit, + current->mm->locked_vm); + rv = -ENOMEM; + goto err_out; + } + } + umem = siw_umem_get(start, len, ib_access_writable(rights)); + if (IS_ERR(umem)) { + rv = PTR_ERR(umem); + siw_dbg_pd(pd, "getting user memory failed: %d\n", rv); + umem = NULL; + goto err_out; + } + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + rv = -ENOMEM; + goto err_out; + } + rv = siw_mr_add_mem(mr, pd, umem, start, len, rights); + if (rv) + goto err_out; + + if (udata) { + struct siw_uresp_reg_mr uresp = {}; + struct siw_mem *mem = mr->mem; + + if (udata->inlen < sizeof(ureq)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq)); + if (rv) + goto err_out; + + mr->base_mr.lkey |= ureq.stag_key; + mr->base_mr.rkey |= ureq.stag_key; + mem->stag |= ureq.stag_key; + uresp.stag = mem->stag; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto err_out; + } + mr->mem->stag_valid = 1; + + return &mr->base_mr; + +err_out: + atomic_dec(&sdev->num_mr); + if (mr) { + if (mr->mem) + siw_mr_drop_mem(mr); + kfree_rcu(mr, rcu); + } else { + if (umem) + siw_umem_release(umem, false); + } + return ERR_PTR(rv); +} + +struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, + u32 max_sge, struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mr *mr = NULL; + struct siw_pbl *pbl = NULL; + int rv; + + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { + siw_dbg_pd(pd, "too many mr's\n"); + rv = -ENOMEM; + goto err_out; + } + if (mr_type != IB_MR_TYPE_MEM_REG) { + siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type); + rv = -EOPNOTSUPP; + goto err_out; + } + if (max_sge > SIW_MAX_SGE_PBL) { + siw_dbg_pd(pd, "too many sge's: %d\n", max_sge); + rv = -ENOMEM; + goto err_out; + } + pbl = siw_pbl_alloc(max_sge); + if (IS_ERR(pbl)) { + rv = PTR_ERR(pbl); + siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv); + pbl = NULL; + goto err_out; + } + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + rv = -ENOMEM; + goto err_out; + } + rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0); + if (rv) + goto err_out; + + mr->mem->is_pbl = 1; + + siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag); + + return &mr->base_mr; + +err_out: + atomic_dec(&sdev->num_mr); + + if (!mr) { + kfree(pbl); + } else { + if (mr->mem) + siw_mr_drop_mem(mr); + kfree_rcu(mr, rcu); + } + siw_dbg_pd(pd, "failed: %d\n", rv); + + return ERR_PTR(rv); +} + +/* Just used to count number of pages being mapped */ +static int siw_set_pbl_page(struct 
ib_mr *base_mr, u64 buf_addr) +{ + return 0; +} + +int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, + unsigned int *sg_off) +{ + struct scatterlist *slp; + struct siw_mr *mr = to_siw_mr(base_mr); + struct siw_mem *mem = mr->mem; + struct siw_pbl *pbl = mem->pbl; + struct siw_pble *pble; + u64 pbl_size; + int i, rv; + + if (!pbl) { + siw_dbg_mem(mem, "no PBL allocated\n"); + return -EINVAL; + } + pble = pbl->pbe; + + if (pbl->max_buf < num_sle) { + siw_dbg_mem(mem, "too many SGE's: %d > %d\n", + mem->pbl->max_buf, num_sle); + return -ENOMEM; + } + for_each_sg(sl, slp, num_sle, i) { + if (sg_dma_len(slp) == 0) { + siw_dbg_mem(mem, "empty SGE\n"); + return -EINVAL; + } + if (i == 0) { + pble->addr = sg_dma_address(slp); + pble->size = sg_dma_len(slp); + pble->pbl_off = 0; + pbl_size = pble->size; + pbl->num_buf = 1; + } else { + /* Merge PBL entries if adjacent */ + if (pble->addr + pble->size == sg_dma_address(slp)) { + pble->size += sg_dma_len(slp); + } else { + pble++; + pbl->num_buf++; + pble->addr = sg_dma_address(slp); + pble->size = sg_dma_len(slp); + pble->pbl_off = pbl_size; + } + pbl_size += sg_dma_len(slp); + } + siw_dbg_mem(mem, + "sge[%d], size %llu, addr 0x%016llx, total %llu\n", + i, pble->size, pble->addr, pbl_size); + } + rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page); + if (rv > 0) { + mem->len = base_mr->length; + mem->va = base_mr->iova; + siw_dbg_mem(mem, + "%llu bytes, start 0x%016llx, %u SLE to %u entries\n", + mem->len, mem->va, num_sle, pbl->num_buf); + } + return rv; +} + +/* + * siw_get_dma_mr() + * + * Create a (empty) DMA memory region, where no umem is attached. + */ +struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mr *mr = NULL; + int rv; + + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { + siw_dbg_pd(pd, "too many mr's\n"); + rv = -ENOMEM; + goto err_out; + } + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + rv = -ENOMEM; + goto err_out; + } + rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights); + if (rv) + goto err_out; + + mr->mem->stag_valid = 1; + + siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag); + + return &mr->base_mr; + +err_out: + if (rv) + kfree(mr); + + atomic_dec(&sdev->num_mr); + + return ERR_PTR(rv); +} + +/* + * siw_create_srq() + * + * Create Shared Receive Queue of attributes @init_attrs + * within protection domain given by @pd. + * + * @base_srq: Base SRQ contained in siw SRQ. + * @init_attrs: SRQ init attributes. 
+ * @udata: points to user context + */ +int siw_create_srq(struct ib_srq *base_srq, + struct ib_srq_init_attr *init_attrs, struct ib_udata *udata) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + struct ib_srq_attr *attrs = &init_attrs->attr; + struct siw_device *sdev = to_siw_dev(base_srq->device); + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + int rv; + + if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) { + siw_dbg_pd(base_srq->pd, "too many SRQ's\n"); + rv = -ENOMEM; + goto err_out; + } + if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR || + attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) { + rv = -EINVAL; + goto err_out; + } + srq->max_sge = attrs->max_sge; + srq->num_rqe = roundup_pow_of_two(attrs->max_wr); + srq->xa_srq_index = SIW_INVAL_UOBJ_KEY; + srq->limit = attrs->srq_limit; + if (srq->limit) + srq->armed = 1; + + srq->kernel_verbs = !udata; + + if (udata) + srq->recvq = + vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe)); + else + srq->recvq = vzalloc(srq->num_rqe * sizeof(struct siw_rqe)); + + if (srq->recvq == NULL) { + rv = -ENOMEM; + goto err_out; + } + if (udata) { + struct siw_uresp_create_srq uresp = {}; + + srq->xa_srq_index = siw_create_uobj( + ctx, srq->recvq, srq->num_rqe * sizeof(struct siw_rqe)); + + if (srq->xa_srq_index == SIW_INVAL_UOBJ_KEY) { + rv = -ENOMEM; + goto err_out; + } + uresp.srq_key = srq->xa_srq_index; + uresp.num_rqe = srq->num_rqe; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto err_out; + } + spin_lock_init(&srq->lock); + + siw_dbg_pd(base_srq->pd, "[SRQ 0x%p]: success\n", srq); + + return 0; + +err_out: + if (srq->recvq) { + if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&ctx->xa, srq->xa_srq_index)); + vfree(srq->recvq); + } + atomic_dec(&sdev->num_srq); + + return rv; +} + +/* + * siw_modify_srq() + * + * Modify SRQ. The caller may resize SRQ and/or set/reset notification + * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification. + * + * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE + * parameter. siw_modify_srq() does not check the attrs->max_sge param. + */ +int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + unsigned long flags; + int rv = 0; + + spin_lock_irqsave(&srq->lock, flags); + + if (attr_mask & IB_SRQ_MAX_WR) { + /* resize request not yet supported */ + rv = -EOPNOTSUPP; + goto out; + } + if (attr_mask & IB_SRQ_LIMIT) { + if (attrs->srq_limit) { + if (unlikely(attrs->srq_limit > srq->num_rqe)) { + rv = -EINVAL; + goto out; + } + srq->armed = 1; + } else { + srq->armed = 0; + } + srq->limit = attrs->srq_limit; + } +out: + spin_unlock_irqrestore(&srq->lock, flags); + + return rv; +} + +/* + * siw_query_srq() + * + * Query SRQ attributes. + */ +int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + unsigned long flags; + + spin_lock_irqsave(&srq->lock, flags); + + attrs->max_wr = srq->num_rqe; + attrs->max_sge = srq->max_sge; + attrs->srq_limit = srq->limit; + + spin_unlock_irqrestore(&srq->lock, flags); + + return 0; +} + +/* + * siw_destroy_srq() + * + * Destroy SRQ. 
+ * It is assumed that the SRQ is not referenced by any + * QP anymore - the code trusts the RDMA core environment to keep track + * of QP references. + */ +void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + struct siw_device *sdev = to_siw_dev(base_srq->device); + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + + if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&ctx->xa, srq->xa_srq_index)); + + vfree(srq->recvq); + atomic_dec(&sdev->num_srq); +} + +/* + * siw_post_srq_recv() + * + * Post a list of receive queue elements to SRQ. + * NOTE: The function does not check or lock a certain SRQ state + * during the post operation. The code simply trusts the + * RDMA core environment. + * + * @base_srq: Base SRQ contained in siw SRQ + * @wr: List of R-WR's + * @bad_wr: Updated to failing WR if posting fails. + */ +int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + unsigned long flags; + int rv = 0; + + if (unlikely(!srq->kernel_verbs)) { + siw_dbg_pd(base_srq->pd, + "[SRQ 0x%p]: no kernel post_recv for mapped srq\n", + srq); + rv = -EINVAL; + goto out; + } + /* + * Serialize potentially multiple producers. + * Also needed to serialize potentially multiple + * consumers. + */ + spin_lock_irqsave(&srq->lock, flags); + + while (wr) { + u32 idx = srq->rq_put % srq->num_rqe; + struct siw_rqe *rqe = &srq->recvq[idx]; + + if (rqe->flags) { + siw_dbg_pd(base_srq->pd, "SRQ full\n"); + rv = -ENOMEM; + break; + } + if (unlikely(wr->num_sge > srq->max_sge)) { + siw_dbg_pd(base_srq->pd, + "[SRQ 0x%p]: too many sge's: %d\n", srq, + wr->num_sge); + rv = -EINVAL; + break; + } + rqe->id = wr->wr_id; + rqe->num_sge = wr->num_sge; + siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); + + /* Make sure S-RQE is completely written before valid */ + smp_wmb(); + + rqe->flags = SIW_WQE_VALID; + + srq->rq_put++; + wr = wr->next; + } + spin_unlock_irqrestore(&srq->lock, flags); +out: + if (unlikely(rv < 0)) { + siw_dbg_pd(base_srq->pd, "[SRQ 0x%p]: error %d\n", srq, rv); + *bad_wr = wr; + } + return rv; +} + +void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype) +{ + struct ib_event event; + struct ib_qp *base_qp = qp->ib_qp; + + /* + * Do not report asynchronous errors on QP which gets + * destroyed via verbs interface (siw_destroy_qp()) + */ + if (qp->attrs.flags & SIW_QP_IN_DESTROY) + return; + + event.event = etype; + event.device = base_qp->device; + event.element.qp = base_qp; + + if (base_qp->event_handler) { + siw_dbg_qp(qp, "reporting event %d\n", etype); + base_qp->event_handler(&event, base_qp->qp_context); + } +} + +void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype) +{ + struct ib_event event; + struct ib_cq *base_cq = &cq->base_cq; + + event.event = etype; + event.device = base_cq->device; + event.element.cq = base_cq; + + if (base_cq->event_handler) { + siw_dbg_cq(cq, "reporting CQ event %d\n", etype); + base_cq->event_handler(&event, base_cq->cq_context); + } +} + +void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype) +{ + struct ib_event event; + struct ib_srq *base_srq = &srq->base_srq; + + event.event = etype; + event.device = base_srq->device; + event.element.srq = base_srq; + + if (base_srq->event_handler) { + siw_dbg_pd(srq->base_srq.pd, + "reporting SRQ event %d\n", etype); + base_srq->event_handler(&event, 
base_srq->srq_context); + } +} + +void siw_port_event(struct siw_device *sdev, u8 port, enum ib_event_type etype) +{ + struct ib_event event; + + event.event = etype; + event.device = &sdev->base_dev; + event.element.port_num = port; + + siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype); + + ib_dispatch_event(&event); +} diff --git a/drivers/infiniband/sw/siw/siw_verbs.h b/drivers/infiniband/sw/siw/siw_verbs.h new file mode 100644 index 000000000000..1910869281cb --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_verbs.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _SIW_VERBS_H +#define _SIW_VERBS_H + +#include + +#include +#include +#include + +#include "siw.h" +#include "siw_cm.h" + +/* + * siw_copy_sgl() + * + * Copy SGL from RDMA core representation to local + * representation. + */ +static inline void siw_copy_sgl(struct ib_sge *sge, struct siw_sge *siw_sge, + int num_sge) +{ + while (num_sge--) { + siw_sge->laddr = sge->addr; + siw_sge->length = sge->length; + siw_sge->lkey = sge->lkey; + + siw_sge++; + sge++; + } +} + +int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata); +void siw_dealloc_ucontext(struct ib_ucontext *base_ctx); +int siw_query_port(struct ib_device *base_dev, u8 port, + struct ib_port_attr *attr); +int siw_get_port_immutable(struct ib_device *base_dev, u8 port, + struct ib_port_immutable *port_immutable); +int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, + struct ib_udata *udata); +int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); +int siw_query_port(struct ib_device *base_dev, u8 port, + struct ib_port_attr *attr); +int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey); +int siw_query_gid(struct ib_device *base_dev, u8 port, int idx, + union ib_gid *gid); +int siw_alloc_pd(struct ib_pd *base_pd, struct ib_udata *udata); +void siw_dealloc_pd(struct ib_pd *base_pd, struct ib_udata *udata); +struct ib_qp *siw_create_qp(struct ib_pd *base_pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata); +int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); +int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata); +int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata); +int siw_poll_cq(struct ib_cq *base_cq, int num_entries, struct ib_wc *wc); +int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags); +struct ib_mr *siw_reg_user_mr(struct ib_pd *base_pd, u64 start, u64 len, + u64 rnic_va, int rights, struct ib_udata *udata); +struct ib_mr *siw_alloc_mr(struct ib_pd *base_pd, enum ib_mr_type mr_type, + u32 max_sge, struct ib_udata *udata); +struct ib_mr *siw_get_dma_mr(struct ib_pd *base_pd, int rights); +int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, + unsigned int *sg_off); +int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata); +int siw_create_srq(struct ib_srq *base_srq, struct ib_srq_init_attr *attr, + struct ib_udata *udata); +int 
siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask mask, struct ib_udata *udata); +int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr); +void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata); +int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma); +void siw_qp_event(struct siw_qp *qp, enum ib_event_type type); +void siw_cq_event(struct siw_cq *cq, enum ib_event_type type); +void siw_srq_event(struct siw_srq *srq, enum ib_event_type type); +void siw_port_event(struct siw_device *dev, u8 port, enum ib_event_type type); + +#endif diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h index 26213f49f5c8..64c14cb0022f 100644 --- a/include/uapi/rdma/rdma_user_ioctl_cmds.h +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h @@ -103,6 +103,7 @@ enum rdma_driver_id { RDMA_DRIVER_HFI1, RDMA_DRIVER_QIB, RDMA_DRIVER_EFA, + RDMA_DRIVER_SIW, }; #endif diff --git a/include/uapi/rdma/siw-abi.h b/include/uapi/rdma/siw-abi.h new file mode 100644 index 000000000000..3dd8071ace7b --- /dev/null +++ b/include/uapi/rdma/siw-abi.h @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _SIW_USER_H +#define _SIW_USER_H + +#include + +#define SIW_NODE_DESC_COMMON "Software iWARP stack" +#define SIW_ABI_VERSION 1 +#define SIW_MAX_SGE 6 +#define SIW_UOBJ_MAX_KEY 0x08FFFF +#define SIW_INVAL_UOBJ_KEY (SIW_UOBJ_MAX_KEY + 1) + +struct siw_uresp_create_cq { + __u32 cq_id; + __u32 num_cqe; + __aligned_u64 cq_key; +}; + +struct siw_uresp_create_qp { + __u32 qp_id; + __u32 num_sqe; + __u32 num_rqe; + __u32 pad; + __aligned_u64 sq_key; + __aligned_u64 rq_key; +}; + +struct siw_ureq_reg_mr { + __u8 stag_key; + __u8 reserved[3]; + __u32 pad; +}; + +struct siw_uresp_reg_mr { + __u32 stag; + __u32 pad; +}; + +struct siw_uresp_create_srq { + __u32 num_rqe; + __u32 pad; + __aligned_u64 srq_key; +}; + +struct siw_uresp_alloc_ctx { + __u32 dev_id; + __u32 pad; +}; + +enum siw_opcode { + SIW_OP_WRITE, + SIW_OP_READ, + SIW_OP_READ_LOCAL_INV, + SIW_OP_SEND, + SIW_OP_SEND_WITH_IMM, + SIW_OP_SEND_REMOTE_INV, + + /* Unsupported */ + SIW_OP_FETCH_AND_ADD, + SIW_OP_COMP_AND_SWAP, + + SIW_OP_RECEIVE, + /* provider internal SQE */ + SIW_OP_READ_RESPONSE, + /* + * below opcodes valid for + * in-kernel clients only + */ + SIW_OP_INVAL_STAG, + SIW_OP_REG_MR, + SIW_NUM_OPCODES +}; + +/* Keep it same as ibv_sge to allow for memcpy */ +struct siw_sge { + __aligned_u64 laddr; + __u32 length; + __u32 lkey; +}; + +/* + * Inline data are kept within the work request itself occupying + * the space of sge[1] .. sge[n]. Therefore, inline data cannot be + * supported if SIW_MAX_SGE is below 2 elements. 
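With SIW_MAX_SGE == 6 and the 16-byte struct siw_sge above, the limit defined below works out to 5 * 16 == 80 bytes of inline payload. For illustration (outside this patch), a stand-alone version of the size check the send path applies to an inline gather list:

    #include <stddef.h>
    #include <stdint.h>

    struct sge { uint64_t laddr; uint32_t length; uint32_t lkey; };  /* 16 bytes */

    #define MAX_SGE    6
    #define MAX_INLINE (sizeof(struct sge) * (MAX_SGE - 1))          /* 80 bytes */

    /* Return total payload length if the list fits inline, -1 otherwise. */
    static int inline_fits(const struct sge *sgl, int num_sge)
    {
            size_t bytes = 0;
            int i;

            for (i = 0; i < num_sge; i++) {
                    bytes += sgl[i].length;
                    if (bytes > MAX_INLINE)
                            return -1;
            }
            return (int)bytes;
    }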
+ */ +#define SIW_MAX_INLINE (sizeof(struct siw_sge) * (SIW_MAX_SGE - 1)) + +#if SIW_MAX_SGE < 2 +#error "SIW_MAX_SGE must be at least 2" +#endif + +enum siw_wqe_flags { + SIW_WQE_VALID = 1, + SIW_WQE_INLINE = (1 << 1), + SIW_WQE_SIGNALLED = (1 << 2), + SIW_WQE_SOLICITED = (1 << 3), + SIW_WQE_READ_FENCE = (1 << 4), + SIW_WQE_REM_INVAL = (1 << 5), + SIW_WQE_COMPLETED = (1 << 6) +}; + +/* Send Queue Element */ +struct siw_sqe { + __aligned_u64 id; + __u16 flags; + __u8 num_sge; + /* Contains enum siw_opcode values */ + __u8 opcode; + __u32 rkey; + union { + __aligned_u64 raddr; + __aligned_u64 base_mr; + }; + union { + struct siw_sge sge[SIW_MAX_SGE]; + __aligned_u64 access; + }; +}; + +/* Receive Queue Element */ +struct siw_rqe { + __aligned_u64 id; + __u16 flags; + __u8 num_sge; + /* + * only used by kernel driver, + * ignored if set by user + */ + __u8 opcode; + __u32 unused; + struct siw_sge sge[SIW_MAX_SGE]; +}; + +enum siw_notify_flags { + SIW_NOTIFY_NOT = (0), + SIW_NOTIFY_SOLICITED = (1 << 0), + SIW_NOTIFY_NEXT_COMPLETION = (1 << 1), + SIW_NOTIFY_MISSED_EVENTS = (1 << 2), + SIW_NOTIFY_ALL = SIW_NOTIFY_SOLICITED | SIW_NOTIFY_NEXT_COMPLETION | + SIW_NOTIFY_MISSED_EVENTS +}; + +enum siw_wc_status { + SIW_WC_SUCCESS, + SIW_WC_LOC_LEN_ERR, + SIW_WC_LOC_PROT_ERR, + SIW_WC_LOC_QP_OP_ERR, + SIW_WC_WR_FLUSH_ERR, + SIW_WC_BAD_RESP_ERR, + SIW_WC_LOC_ACCESS_ERR, + SIW_WC_REM_ACCESS_ERR, + SIW_WC_REM_INV_REQ_ERR, + SIW_WC_GENERAL_ERR, + SIW_NUM_WC_STATUS +}; + +struct siw_cqe { + __aligned_u64 id; + __u8 flags; + __u8 opcode; + __u16 status; + __u32 bytes; + union { + __aligned_u64 imm_data; + __u32 inval_stag; + }; + /* QP number or QP pointer */ + union { + struct ib_qp *base_qp; + __aligned_u64 qp_id; + }; +}; + +/* + * Shared structure between user and kernel + * to control CQ arming. + */ +struct siw_cq_ctrl { + __aligned_u64 notify; +}; +#endif -- cgit v1.2.3 From 2251334dcac9eb337575d8767e2a6a7e81848f7f Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:28 +0200 Subject: rdma/siw: application buffer management Broken up commit to add the Soft iWarp RDMA driver. Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_mem.c | 460 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/sw/siw/siw_mem.h | 74 ++++++ 2 files changed, 534 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_mem.c create mode 100644 drivers/infiniband/sw/siw/siw_mem.h (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c new file mode 100644 index 000000000000..67171c82b0c4 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -0,0 +1,460 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include +#include +#include +#include +#include +#include + +#include "siw.h" +#include "siw_mem.h" + +/* + * Stag lookup is based on its index part only (24 bits). + * The code avoids special Stag of zero and tries to randomize + * STag values between 1 and SIW_STAG_MAX_INDEX. 
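The layout used below keeps the 24-bit allocator index in the upper bits of the STag and leaves the low byte for the key that a user may supply at registration time (mem->stag |= ureq.stag_key in siw_reg_user_mr()). For illustration (outside this patch), the composition and the lookup-side decomposition in isolation:

    #include <stdint.h>

    static inline uint32_t stag_make(uint32_t index, uint8_t key)
    {
            return (index << 8) | key;      /* index: bits 31..8, key: bits 7..0 */
    }

    static inline uint32_t stag_index(uint32_t stag)
    {
            return stag >> 8;               /* what siw_mem_id2obj() resolves */
    }

    static inline uint8_t stag_key(uint32_t stag)
    {
            return stag & 0xff;
    }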
+ */ +int siw_mem_add(struct siw_device *sdev, struct siw_mem *m) +{ + struct xa_limit limit = XA_LIMIT(1, 0x00ffffff); + u32 id, next; + + get_random_bytes(&next, 4); + next &= 0x00ffffff; + + if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next, + GFP_KERNEL) < 0) + return -ENOMEM; + + /* Set the STag index part */ + m->stag = id << 8; + + siw_dbg_mem(m, "new MEM object\n"); + + return 0; +} + +/* + * siw_mem_id2obj() + * + * resolves memory from stag given by id. might be called from: + * o process context before sending out of sgl, or + * o in softirq when resolving target memory + */ +struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index) +{ + struct siw_mem *mem; + + rcu_read_lock(); + mem = xa_load(&sdev->mem_xa, stag_index); + if (likely(mem && kref_get_unless_zero(&mem->ref))) { + rcu_read_unlock(); + return mem; + } + rcu_read_unlock(); + + return NULL; +} + +static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages, + bool dirty) +{ + struct page **p = chunk->plist; + + while (num_pages--) { + if (!PageDirty(*p) && dirty) + put_user_pages_dirty_lock(p, 1); + else + put_user_page(*p); + p++; + } +} + +void siw_umem_release(struct siw_umem *umem, bool dirty) +{ + struct mm_struct *mm_s = umem->owning_mm; + int i, num_pages = umem->num_pages; + + for (i = 0; num_pages; i++) { + int to_free = min_t(int, PAGES_PER_CHUNK, num_pages); + + siw_free_plist(&umem->page_chunk[i], to_free, + umem->writable && dirty); + kfree(umem->page_chunk[i].plist); + num_pages -= to_free; + } + atomic64_sub(umem->num_pages, &mm_s->pinned_vm); + + mmdrop(mm_s); + kfree(umem->page_chunk); + kfree(umem); +} + +int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj, + u64 start, u64 len, int rights) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL); + struct xa_limit limit = XA_LIMIT(1, 0x00ffffff); + u32 id, next; + + if (!mem) + return -ENOMEM; + + mem->mem_obj = mem_obj; + mem->stag_valid = 0; + mem->sdev = sdev; + mem->va = start; + mem->len = len; + mem->pd = pd; + mem->perms = rights & IWARP_ACCESS_MASK; + kref_init(&mem->ref); + + mr->mem = mem; + + get_random_bytes(&next, 4); + next &= 0x00ffffff; + + if (xa_alloc_cyclic(&sdev->mem_xa, &id, mem, limit, &next, + GFP_KERNEL) < 0) { + kfree(mem); + return -ENOMEM; + } + /* Set the STag index part */ + mem->stag = id << 8; + mr->base_mr.lkey = mr->base_mr.rkey = mem->stag; + + return 0; +} + +void siw_mr_drop_mem(struct siw_mr *mr) +{ + struct siw_mem *mem = mr->mem, *found; + + mem->stag_valid = 0; + + /* make STag invalid visible asap */ + smp_mb(); + + found = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8); + WARN_ON(found != mem); + siw_mem_put(mem); +} + +void siw_free_mem(struct kref *ref) +{ + struct siw_mem *mem = container_of(ref, struct siw_mem, ref); + + siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? "y" : "n"); + + if (!mem->is_mw && mem->mem_obj) { + if (mem->is_pbl == 0) + siw_umem_release(mem->umem, true); + else + kfree(mem->pbl); + } + kfree(mem); +} + +/* + * siw_check_mem() + * + * Check protection domain, STAG state, access permissions and + * address range for memory object. 
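The address range test performed below reduces to requiring [addr, addr + len) to lie completely inside the registered interval [va, va + mem->len). For illustration (outside this patch), the same predicate in isolation:

    #include <stdbool.h>
    #include <stdint.h>

    /* true if [addr, addr + len) is contained in [va, va + mem_len) */
    static bool range_ok(uint64_t va, uint64_t mem_len, uint64_t addr, uint64_t len)
    {
            return addr >= va && addr + len <= va + mem_len;
    }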
+ * + * @pd: Protection Domain memory should belong to + * @mem: memory to be checked + * @addr: starting addr of mem + * @perms: requested access permissions + * @len: len of memory interval to be checked + * + */ +int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr, + enum ib_access_flags perms, int len) +{ + if (!mem->stag_valid) { + siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag); + return -E_STAG_INVALID; + } + if (mem->pd != pd) { + siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag); + return -E_PD_MISMATCH; + } + /* + * check access permissions + */ + if ((mem->perms & perms) < perms) { + siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n", + mem->perms, perms); + return -E_ACCESS_PERM; + } + /* + * Check if access falls into valid memory interval. + */ + if (addr < mem->va || addr + len > mem->va + mem->len) { + siw_dbg_pd(pd, "MEM interval len %d\n", len); + siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] out of bounds\n", + (unsigned long long)addr, + (unsigned long long)(addr + len)); + siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] STag=0x%08x\n", + (unsigned long long)mem->va, + (unsigned long long)(mem->va + mem->len), + mem->stag); + + return -E_BASE_BOUNDS; + } + return E_ACCESS_OK; +} + +/* + * siw_check_sge() + * + * Check SGE for access rights in given interval + * + * @pd: Protection Domain memory should belong to + * @sge: SGE to be checked + * @mem: location of memory reference within array + * @perms: requested access permissions + * @off: starting offset in SGE + * @len: len of memory interval to be checked + * + * NOTE: Function references SGE's memory object (mem->obj) + * if not yet done. New reference is kept if check went ok and + * released if check failed. If mem->obj is already valid, no new + * lookup is being done and mem is not released it check fails. 
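In other words, mem[] acts as a per-SGE cache: a NULL slot triggers a lookup that takes a reference, while a populated slot is reused without another lookup. For illustration (outside this patch), a reduced stand-alone version of that lazy-reference pattern; the object table and key range are made up:

    #include <stddef.h>

    struct obj { int refs; };

    static struct obj pool[4];                       /* stand-in object table */

    static struct obj *lookup_and_ref(unsigned int key)
    {
            if (key >= 4)
                    return NULL;
            pool[key].refs++;                        /* reference taken on lookup */
            return &pool[key];
    }

    static struct obj *get_cached(struct obj **slot, unsigned int key)
    {
            if (*slot == NULL)
                    *slot = lookup_and_ref(key);     /* first use: lookup + ref */
            return *slot;                            /* repeat use: no new lookup */
    }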
+ */ +int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[], + enum ib_access_flags perms, u32 off, int len) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mem *new = NULL; + int rv = E_ACCESS_OK; + + if (len + off > sge->length) { + rv = -E_BASE_BOUNDS; + goto fail; + } + if (*mem == NULL) { + new = siw_mem_id2obj(sdev, sge->lkey >> 8); + if (unlikely(!new)) { + siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey); + rv = -E_STAG_INVALID; + goto fail; + } + *mem = new; + } + /* Check if user re-registered with different STag key */ + if (unlikely((*mem)->stag != sge->lkey)) { + siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey); + rv = -E_STAG_INVALID; + goto fail; + } + rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len); + if (unlikely(rv)) + goto fail; + + return 0; + +fail: + if (new) { + *mem = NULL; + siw_mem_put(new); + } + return rv; +} + +void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op) +{ + switch (op) { + case SIW_OP_SEND: + case SIW_OP_WRITE: + case SIW_OP_SEND_WITH_IMM: + case SIW_OP_SEND_REMOTE_INV: + case SIW_OP_READ: + case SIW_OP_READ_LOCAL_INV: + if (!(wqe->sqe.flags & SIW_WQE_INLINE)) + siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge); + break; + + case SIW_OP_RECEIVE: + siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge); + break; + + case SIW_OP_READ_RESPONSE: + siw_unref_mem_sgl(wqe->mem, 1); + break; + + default: + /* + * SIW_OP_INVAL_STAG and SIW_OP_REG_MR + * do not hold memory references + */ + break; + } +} + +int siw_invalidate_stag(struct ib_pd *pd, u32 stag) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mem *mem = siw_mem_id2obj(sdev, stag >> 8); + int rv = 0; + + if (unlikely(!mem)) { + siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag); + return -EINVAL; + } + if (unlikely(mem->pd != pd)) { + siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag); + rv = -EACCES; + goto out; + } + /* + * Per RDMA verbs definition, an STag may already be in invalid + * state if invalidation is requested. So no state check here. + */ + mem->stag_valid = 0; + + siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag); +out: + siw_mem_put(mem); + return rv; +} + +/* + * Gets physical address backed by PBL element. Address is referenced + * by linear byte offset into list of variably sized PB elements. + * Optionally, provides remaining len within current element, and + * current PBL index for later resume at same element. + */ +u64 siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx) +{ + int i = idx ? 
*idx : 0; + + while (i < pbl->num_buf) { + struct siw_pble *pble = &pbl->pbe[i]; + + if (pble->pbl_off + pble->size > off) { + u64 pble_off = off - pble->pbl_off; + + if (len) + *len = pble->size - pble_off; + if (idx) + *idx = i; + + return pble->addr + pble_off; + } + i++; + } + if (len) + *len = 0; + return 0; +} + +struct siw_pbl *siw_pbl_alloc(u32 num_buf) +{ + struct siw_pbl *pbl; + int buf_size = sizeof(*pbl); + + if (num_buf == 0) + return ERR_PTR(-EINVAL); + + buf_size += ((num_buf - 1) * sizeof(struct siw_pble)); + + pbl = kzalloc(buf_size, GFP_KERNEL); + if (!pbl) + return ERR_PTR(-ENOMEM); + + pbl->max_buf = num_buf; + + return pbl; +} + +struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable) +{ + struct siw_umem *umem; + struct mm_struct *mm_s; + u64 first_page_va; + unsigned long mlock_limit; + unsigned int foll_flags = FOLL_WRITE; + int num_pages, num_chunks, i, rv = 0; + + if (!can_do_mlock()) + return ERR_PTR(-EPERM); + + if (!len) + return ERR_PTR(-EINVAL); + + first_page_va = start & PAGE_MASK; + num_pages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT; + num_chunks = (num_pages >> CHUNK_SHIFT) + 1; + + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + + mm_s = current->mm; + umem->owning_mm = mm_s; + umem->writable = writable; + + mmgrab(mm_s); + + if (!writable) + foll_flags |= FOLL_FORCE; + + down_read(&mm_s->mmap_sem); + + mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) { + rv = -ENOMEM; + goto out_sem_up; + } + umem->fp_addr = first_page_va; + + umem->page_chunk = + kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL); + if (!umem->page_chunk) { + rv = -ENOMEM; + goto out_sem_up; + } + for (i = 0; num_pages; i++) { + int got, nents = min_t(int, num_pages, PAGES_PER_CHUNK); + + umem->page_chunk[i].plist = + kcalloc(nents, sizeof(struct page *), GFP_KERNEL); + if (!umem->page_chunk[i].plist) { + rv = -ENOMEM; + goto out_sem_up; + } + got = 0; + while (nents) { + struct page **plist = &umem->page_chunk[i].plist[got]; + + rv = get_user_pages(first_page_va, nents, + foll_flags | FOLL_LONGTERM, + plist, NULL); + if (rv < 0) + goto out_sem_up; + + umem->num_pages += rv; + atomic64_add(rv, &mm_s->pinned_vm); + first_page_va += rv * PAGE_SIZE; + nents -= rv; + got += rv; + } + num_pages -= got; + } +out_sem_up: + up_read(&mm_s->mmap_sem); + + if (rv > 0) + return umem; + + siw_umem_release(umem, false); + + return ERR_PTR(rv); +} diff --git a/drivers/infiniband/sw/siw/siw_mem.h b/drivers/infiniband/sw/siw/siw_mem.h new file mode 100644 index 000000000000..f43daf280891 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_mem.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _SIW_MEM_H +#define _SIW_MEM_H + +struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable); +void siw_umem_release(struct siw_umem *umem, bool dirty); +struct siw_pbl *siw_pbl_alloc(u32 num_buf); +u64 siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx); +struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index); +int siw_mem_add(struct siw_device *sdev, struct siw_mem *m); +int siw_invalidate_stag(struct ib_pd *pd, u32 stag); +int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr, + enum ib_access_flags perms, int len); +int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, + struct siw_mem *mem[], enum 
ib_access_flags perms, + u32 off, int len); +void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op); +int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj, + u64 start, u64 len, int rights); +void siw_mr_drop_mem(struct siw_mr *mr); +void siw_free_mem(struct kref *ref); + +static inline void siw_mem_put(struct siw_mem *mem) +{ + kref_put(&mem->ref, siw_free_mem); +} + +static inline struct siw_mr *siw_mem2mr(struct siw_mem *m) +{ + return container_of(m, struct siw_mr, mem); +} + +static inline void siw_unref_mem_sgl(struct siw_mem **mem, unsigned int num_sge) +{ + while (num_sge) { + if (*mem == NULL) + break; + + siw_mem_put(*mem); + *mem = NULL; + mem++; + num_sge--; + } +} + +#define CHUNK_SHIFT 9 /* sets number of pages per chunk */ +#define PAGES_PER_CHUNK (_AC(1, UL) << CHUNK_SHIFT) +#define CHUNK_MASK (~(PAGES_PER_CHUNK - 1)) +#define PAGE_CHUNK_SIZE (PAGES_PER_CHUNK * sizeof(struct page *)) + +/* + * siw_get_upage() + * + * Get page pointer for address on given umem. + * + * @umem: two dimensional list of page pointers + * @addr: user virtual address + */ +static inline struct page *siw_get_upage(struct siw_umem *umem, u64 addr) +{ + unsigned int page_idx = (addr - umem->fp_addr) >> PAGE_SHIFT, + chunk_idx = page_idx >> CHUNK_SHIFT, + page_in_chunk = page_idx & ~CHUNK_MASK; + + if (likely(page_idx < umem->num_pages)) + return umem->page_chunk[chunk_idx].plist[page_in_chunk]; + + return NULL; +} +#endif -- cgit v1.2.3 From f29dd55b0236f7a26a4b9dd69186e3c04266797b Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:29 +0200 Subject: rdma/siw: queue pair methods Broken up commit to add the Soft iWarp RDMA driver. Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_qp.c | 1322 ++++++++++++++++++++++++++++++++++++ 1 file changed, 1322 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_qp.c (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c new file mode 100644 index 000000000000..11383d9f95ef --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_qp.c @@ -0,0 +1,1322 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include +#include +#include +#include +#include +#include +#include + +#include "siw.h" +#include "siw_verbs.h" +#include "siw_mem.h" + +static char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof "TERMINATE"] = { + [SIW_QP_STATE_IDLE] = "IDLE", + [SIW_QP_STATE_RTR] = "RTR", + [SIW_QP_STATE_RTS] = "RTS", + [SIW_QP_STATE_CLOSING] = "CLOSING", + [SIW_QP_STATE_TERMINATE] = "TERMINATE", + [SIW_QP_STATE_ERROR] = "ERROR" +}; + +/* + * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp settings on a + * per-RDMAP message basis. Please keep order of initializer. All MPA len + * is initialized to minimum packet size. 
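Each initializer below preloads ctrl.mpa_len with htons(header size - 2), i.e. the MPA length field does not count the 2-byte length field itself. For illustration (outside this patch), that arithmetic in isolation; treating the MPA header as 2 bytes is an assumption taken from the "- 2" in the initializers:

    #include <stdint.h>
    #include <arpa/inet.h>

    #define MPA_HDR_LEN 2   /* assumed: the 16-bit MPA length field itself */

    /* Wire-format MPA length of a header-only (zero payload) frame. */
    static uint16_t mpa_min_len(size_t iwarp_hdr_size)
    {
            return htons((uint16_t)(iwarp_hdr_size - MPA_HDR_LEN));
    }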
+ */ +struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] = { + { /* RDMAP_RDMA_WRITE */ + .hdr_len = sizeof(struct iwarp_rdma_write), + .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST | + cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_RDMA_WRITE), + .rx_data = siw_proc_write }, + { /* RDMAP_RDMA_READ_REQ */ + .hdr_len = sizeof(struct iwarp_rdma_rreq), + .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_RDMA_READ_REQ), + .rx_data = siw_proc_rreq }, + { /* RDMAP_RDMA_READ_RESP */ + .hdr_len = sizeof(struct iwarp_rdma_rresp), + .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST | + cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_RDMA_READ_RESP), + .rx_data = siw_proc_rresp }, + { /* RDMAP_SEND */ + .hdr_len = sizeof(struct iwarp_send), + .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_SEND), + .rx_data = siw_proc_send }, + { /* RDMAP_SEND_INVAL */ + .hdr_len = sizeof(struct iwarp_send_inv), + .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_SEND_INVAL), + .rx_data = siw_proc_send }, + { /* RDMAP_SEND_SE */ + .hdr_len = sizeof(struct iwarp_send), + .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_SEND_SE), + .rx_data = siw_proc_send }, + { /* RDMAP_SEND_SE_INVAL */ + .hdr_len = sizeof(struct iwarp_send_inv), + .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_SEND_SE_INVAL), + .rx_data = siw_proc_send }, + { /* RDMAP_TERMINATE */ + .hdr_len = sizeof(struct iwarp_terminate), + .ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_TERMINATE), + .rx_data = siw_proc_terminate } +}; + +void siw_qp_llp_data_ready(struct sock *sk) +{ + struct siw_qp *qp; + + read_lock(&sk->sk_callback_lock); + + if (unlikely(!sk->sk_user_data || !sk_to_qp(sk))) + goto done; + + qp = sk_to_qp(sk); + + if (likely(!qp->rx_stream.rx_suspend && + down_read_trylock(&qp->state_lock))) { + read_descriptor_t rd_desc = { .arg.data = qp, .count = 1 }; + + if (likely(qp->attrs.state == SIW_QP_STATE_RTS)) + /* + * Implements data receive operation during + * socket callback. TCP gracefully catches + * the case where there is nothing to receive + * (not calling siw_tcp_rx_data() then). 
+ */ + tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data); + + up_read(&qp->state_lock); + } else { + siw_dbg_qp(qp, "unable to process RX, suspend: %d\n", + qp->rx_stream.rx_suspend); + } +done: + read_unlock(&sk->sk_callback_lock); +} + +void siw_qp_llp_close(struct siw_qp *qp) +{ + siw_dbg_qp(qp, "enter llp close, state = %s\n", + siw_qp_state_to_string[qp->attrs.state]); + + down_write(&qp->state_lock); + + qp->rx_stream.rx_suspend = 1; + qp->tx_ctx.tx_suspend = 1; + qp->attrs.sk = NULL; + + switch (qp->attrs.state) { + case SIW_QP_STATE_RTS: + case SIW_QP_STATE_RTR: + case SIW_QP_STATE_IDLE: + case SIW_QP_STATE_TERMINATE: + qp->attrs.state = SIW_QP_STATE_ERROR; + break; + /* + * SIW_QP_STATE_CLOSING: + * + * This is a forced close. shall the QP be moved to + * ERROR or IDLE ? + */ + case SIW_QP_STATE_CLOSING: + if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) + qp->attrs.state = SIW_QP_STATE_ERROR; + else + qp->attrs.state = SIW_QP_STATE_IDLE; + break; + + default: + siw_dbg_qp(qp, "llp close: no state transition needed: %s\n", + siw_qp_state_to_string[qp->attrs.state]); + break; + } + siw_sq_flush(qp); + siw_rq_flush(qp); + + /* + * Dereference closing CEP + */ + if (qp->cep) { + siw_cep_put(qp->cep); + qp->cep = NULL; + } + + up_write(&qp->state_lock); + + siw_dbg_qp(qp, "llp close exit: state %s\n", + siw_qp_state_to_string[qp->attrs.state]); +} + +/* + * socket callback routine informing about newly available send space. + * Function schedules SQ work for processing SQ items. + */ +void siw_qp_llp_write_space(struct sock *sk) +{ + struct siw_cep *cep = sk_to_cep(sk); + + cep->sk_write_space(sk); + + if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + (void)siw_sq_start(cep->qp); +} + +static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size) +{ + irq_size = roundup_pow_of_two(irq_size); + orq_size = roundup_pow_of_two(orq_size); + + qp->attrs.irq_size = irq_size; + qp->attrs.orq_size = orq_size; + + qp->irq = vzalloc(irq_size * sizeof(struct siw_sqe)); + if (!qp->irq) { + siw_dbg_qp(qp, "irq malloc for %d failed\n", irq_size); + qp->attrs.irq_size = 0; + return -ENOMEM; + } + qp->orq = vzalloc(orq_size * sizeof(struct siw_sqe)); + if (!qp->orq) { + siw_dbg_qp(qp, "orq malloc for %d failed\n", orq_size); + qp->attrs.orq_size = 0; + qp->attrs.irq_size = 0; + vfree(qp->irq); + return -ENOMEM; + } + siw_dbg_qp(qp, "ORD %d, IRD %d\n", orq_size, irq_size); + return 0; +} + +static int siw_qp_enable_crc(struct siw_qp *qp) +{ + struct siw_rx_stream *c_rx = &qp->rx_stream; + struct siw_iwarp_tx *c_tx = &qp->tx_ctx; + int size = crypto_shash_descsize(siw_crypto_shash) + + sizeof(struct shash_desc); + + if (siw_crypto_shash == NULL) + return -ENOENT; + + c_tx->mpa_crc_hd = kzalloc(size, GFP_KERNEL); + c_rx->mpa_crc_hd = kzalloc(size, GFP_KERNEL); + if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) { + kfree(c_tx->mpa_crc_hd); + kfree(c_rx->mpa_crc_hd); + c_tx->mpa_crc_hd = NULL; + c_rx->mpa_crc_hd = NULL; + return -ENOMEM; + } + c_tx->mpa_crc_hd->tfm = siw_crypto_shash; + c_rx->mpa_crc_hd->tfm = siw_crypto_shash; + + return 0; +} + +/* + * Send a non signalled READ or WRITE to peer side as negotiated + * with MPAv2 P2P setup protocol. The work request is only created + * as a current active WR and does not consume Send Queue space. + * + * Caller must hold QP state lock. 
+ */ +int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl) +{ + struct siw_wqe *wqe = tx_wqe(qp); + unsigned long flags; + int rv = 0; + + spin_lock_irqsave(&qp->sq_lock, flags); + + if (unlikely(wqe->wr_status != SIW_WR_IDLE)) { + spin_unlock_irqrestore(&qp->sq_lock, flags); + return -EIO; + } + memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE); + + wqe->wr_status = SIW_WR_QUEUED; + wqe->sqe.flags = 0; + wqe->sqe.num_sge = 1; + wqe->sqe.sge[0].length = 0; + wqe->sqe.sge[0].laddr = 0; + wqe->sqe.sge[0].lkey = 0; + /* + * While it must not be checked for inbound zero length + * READ/WRITE, some HW may treat STag 0 special. + */ + wqe->sqe.rkey = 1; + wqe->sqe.raddr = 0; + wqe->processed = 0; + + if (ctrl & MPA_V2_RDMA_WRITE_RTR) + wqe->sqe.opcode = SIW_OP_WRITE; + else if (ctrl & MPA_V2_RDMA_READ_RTR) { + struct siw_sqe *rreq; + + wqe->sqe.opcode = SIW_OP_READ; + + spin_lock(&qp->orq_lock); + + rreq = orq_get_free(qp); + if (rreq) { + siw_read_to_orq(rreq, &wqe->sqe); + qp->orq_put++; + } else + rv = -EIO; + + spin_unlock(&qp->orq_lock); + } else + rv = -EINVAL; + + if (rv) + wqe->wr_status = SIW_WR_IDLE; + + spin_unlock_irqrestore(&qp->sq_lock, flags); + + if (!rv) + rv = siw_sq_start(qp); + + return rv; +} + +/* + * Map memory access error to DDP tagged error + */ +enum ddp_ecode siw_tagged_error(enum siw_access_state state) +{ + switch (state) { + case E_STAG_INVALID: + return DDP_ECODE_T_INVALID_STAG; + case E_BASE_BOUNDS: + return DDP_ECODE_T_BASE_BOUNDS; + case E_PD_MISMATCH: + return DDP_ECODE_T_STAG_NOT_ASSOC; + case E_ACCESS_PERM: + /* + * RFC 5041 (DDP) lacks an ecode for insufficient access + * permissions. 'Invalid STag' seem to be the closest + * match though. + */ + return DDP_ECODE_T_INVALID_STAG; + default: + WARN_ON(1); + return DDP_ECODE_T_INVALID_STAG; + } +} + +/* + * Map memory access error to RDMAP protection error + */ +enum rdmap_ecode siw_rdmap_error(enum siw_access_state state) +{ + switch (state) { + case E_STAG_INVALID: + return RDMAP_ECODE_INVALID_STAG; + case E_BASE_BOUNDS: + return RDMAP_ECODE_BASE_BOUNDS; + case E_PD_MISMATCH: + return RDMAP_ECODE_STAG_NOT_ASSOC; + case E_ACCESS_PERM: + return RDMAP_ECODE_ACCESS_RIGHTS; + default: + return RDMAP_ECODE_UNSPECIFIED; + } +} + +void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer, u8 etype, + u8 ecode, int in_tx) +{ + if (!qp->term_info.valid) { + memset(&qp->term_info, 0, sizeof(qp->term_info)); + qp->term_info.layer = layer; + qp->term_info.etype = etype; + qp->term_info.ecode = ecode; + qp->term_info.in_tx = in_tx; + qp->term_info.valid = 1; + } + siw_dbg_qp(qp, "init TERM: layer %d, type %d, code %d, in tx %s\n", + layer, etype, ecode, in_tx ? "yes" : "no"); +} + +/* + * Send a TERMINATE message, as defined in RFC's 5040/5041/5044/6581. + * Sending TERMINATE messages is best effort - such messages + * can only be send if the QP is still connected and it does + * not have another outbound message in-progress, i.e. the + * TERMINATE message must not interfer with an incomplete current + * transmit operation. 
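The function below assembles the TERMINATE frame as a small kvec array and hands it to kernel_sendmsg() in a single call. For illustration (outside this patch), a reduced kernel-style sketch of that pattern, with the error handling and CRC of the real code omitted:

    #include <linux/net.h>
    #include <linux/socket.h>
    #include <linux/uio.h>

    /* Send a header plus trailer as one message over a kernel socket. */
    static int send_two_part_msg(struct socket *s, void *hdr, size_t hdr_len,
                                 void *trailer, size_t trailer_len)
    {
            struct kvec iov[2] = {
                    { .iov_base = hdr,     .iov_len = hdr_len },
                    { .iov_base = trailer, .iov_len = trailer_len },
            };
            struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };

            return kernel_sendmsg(s, &msg, iov, 2, hdr_len + trailer_len);
    }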
+ */ +void siw_send_terminate(struct siw_qp *qp) +{ + struct kvec iov[3]; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; + struct iwarp_terminate *term = NULL; + union iwarp_hdr *err_hdr = NULL; + struct socket *s = qp->attrs.sk; + struct siw_rx_stream *srx = &qp->rx_stream; + union iwarp_hdr *rx_hdr = &srx->hdr; + u32 crc = 0; + int num_frags, len_terminate, rv; + + if (!qp->term_info.valid) + return; + + qp->term_info.valid = 0; + + if (tx_wqe(qp)->wr_status == SIW_WR_INPROGRESS) { + siw_dbg_qp(qp, "cannot send TERMINATE: op %d in progress\n", + tx_type(tx_wqe(qp))); + return; + } + if (!s && qp->cep) + /* QP not yet in RTS. Take socket from connection end point */ + s = qp->cep->sock; + + if (!s) { + siw_dbg_qp(qp, "cannot send TERMINATE: not connected\n"); + return; + } + + term = kzalloc(sizeof(*term), GFP_KERNEL); + if (!term) + return; + + term->ddp_qn = cpu_to_be32(RDMAP_UNTAGGED_QN_TERMINATE); + term->ddp_mo = 0; + term->ddp_msn = cpu_to_be32(1); + + iov[0].iov_base = term; + iov[0].iov_len = sizeof(*term); + + if ((qp->term_info.layer == TERM_ERROR_LAYER_DDP) || + ((qp->term_info.layer == TERM_ERROR_LAYER_RDMAP) && + (qp->term_info.etype != RDMAP_ETYPE_CATASTROPHIC))) { + err_hdr = kzalloc(sizeof(*err_hdr), GFP_KERNEL); + if (!err_hdr) { + kfree(term); + return; + } + } + memcpy(&term->ctrl, &iwarp_pktinfo[RDMAP_TERMINATE].ctrl, + sizeof(struct iwarp_ctrl)); + + __rdmap_term_set_layer(term, qp->term_info.layer); + __rdmap_term_set_etype(term, qp->term_info.etype); + __rdmap_term_set_ecode(term, qp->term_info.ecode); + + switch (qp->term_info.layer) { + case TERM_ERROR_LAYER_RDMAP: + if (qp->term_info.etype == RDMAP_ETYPE_CATASTROPHIC) + /* No additional DDP/RDMAP header to be included */ + break; + + if (qp->term_info.etype == RDMAP_ETYPE_REMOTE_PROTECTION) { + /* + * Complete RDMAP frame will get attached, and + * DDP segment length is valid + */ + term->flag_m = 1; + term->flag_d = 1; + term->flag_r = 1; + + if (qp->term_info.in_tx) { + struct iwarp_rdma_rreq *rreq; + struct siw_wqe *wqe = tx_wqe(qp); + + /* Inbound RREQ error, detected during + * RRESP creation. Take state from + * current TX work queue element to + * reconstruct peers RREQ. + */ + rreq = (struct iwarp_rdma_rreq *)err_hdr; + + memcpy(&rreq->ctrl, + &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl, + sizeof(struct iwarp_ctrl)); + + rreq->rsvd = 0; + rreq->ddp_qn = + htonl(RDMAP_UNTAGGED_QN_RDMA_READ); + + /* Provide RREQ's MSN as kept aside */ + rreq->ddp_msn = htonl(wqe->sqe.sge[0].length); + + rreq->ddp_mo = htonl(wqe->processed); + rreq->sink_stag = htonl(wqe->sqe.rkey); + rreq->sink_to = cpu_to_be64(wqe->sqe.raddr); + rreq->read_size = htonl(wqe->sqe.sge[0].length); + rreq->source_stag = htonl(wqe->sqe.sge[0].lkey); + rreq->source_to = + cpu_to_be64(wqe->sqe.sge[0].laddr); + + iov[1].iov_base = rreq; + iov[1].iov_len = sizeof(*rreq); + + rx_hdr = (union iwarp_hdr *)rreq; + } else { + /* Take RDMAP/DDP information from + * current (failed) inbound frame. 
+ */ + iov[1].iov_base = rx_hdr; + + if (__rdmap_get_opcode(&rx_hdr->ctrl) == + RDMAP_RDMA_READ_REQ) + iov[1].iov_len = + sizeof(struct iwarp_rdma_rreq); + else /* SEND type */ + iov[1].iov_len = + sizeof(struct iwarp_send); + } + } else { + /* Do not report DDP hdr information if packet + * layout is unknown + */ + if ((qp->term_info.ecode == RDMAP_ECODE_VERSION) || + (qp->term_info.ecode == RDMAP_ECODE_OPCODE)) + break; + + iov[1].iov_base = rx_hdr; + + /* Only DDP frame will get attached */ + if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED) + iov[1].iov_len = + sizeof(struct iwarp_rdma_write); + else + iov[1].iov_len = sizeof(struct iwarp_send); + + term->flag_m = 1; + term->flag_d = 1; + } + term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len); + break; + + case TERM_ERROR_LAYER_DDP: + /* Report error encountered while DDP processing. + * This can only happen as a result of inbound + * DDP processing + */ + + /* Do not report DDP hdr information if packet + * layout is unknown + */ + if (((qp->term_info.etype == DDP_ETYPE_TAGGED_BUF) && + (qp->term_info.ecode == DDP_ECODE_T_VERSION)) || + ((qp->term_info.etype == DDP_ETYPE_UNTAGGED_BUF) && + (qp->term_info.ecode == DDP_ECODE_UT_VERSION))) + break; + + iov[1].iov_base = rx_hdr; + + if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED) + iov[1].iov_len = sizeof(struct iwarp_ctrl_tagged); + else + iov[1].iov_len = sizeof(struct iwarp_ctrl_untagged); + + term->flag_m = 1; + term->flag_d = 1; + break; + + default: + break; + } + if (term->flag_m || term->flag_d || term->flag_r) { + iov[2].iov_base = &crc; + iov[2].iov_len = sizeof(crc); + len_terminate = sizeof(*term) + iov[1].iov_len + MPA_CRC_SIZE; + num_frags = 3; + } else { + iov[1].iov_base = &crc; + iov[1].iov_len = sizeof(crc); + len_terminate = sizeof(*term) + MPA_CRC_SIZE; + num_frags = 2; + } + + /* Adjust DDP Segment Length parameter, if valid */ + if (term->flag_m) { + u32 real_ddp_len = be16_to_cpu(rx_hdr->ctrl.mpa_len); + enum rdma_opcode op = __rdmap_get_opcode(&rx_hdr->ctrl); + + real_ddp_len -= iwarp_pktinfo[op].hdr_len - MPA_HDR_SIZE; + rx_hdr->ctrl.mpa_len = cpu_to_be16(real_ddp_len); + } + + term->ctrl.mpa_len = + cpu_to_be16(len_terminate - (MPA_HDR_SIZE + MPA_CRC_SIZE)); + if (qp->tx_ctx.mpa_crc_hd) { + crypto_shash_init(qp->tx_ctx.mpa_crc_hd); + if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd, + (u8 *)iov[0].iov_base, + iov[0].iov_len)) + goto out; + + if (num_frags == 3) { + if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd, + (u8 *)iov[1].iov_base, + iov[1].iov_len)) + goto out; + } + crypto_shash_final(qp->tx_ctx.mpa_crc_hd, (u8 *)&crc); + } + + rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate); + siw_dbg_qp(qp, "sent TERM: %s, layer %d, type %d, code %d (%d bytes)\n", + rv == len_terminate ? 
"success" : "failure", + __rdmap_term_layer(term), __rdmap_term_etype(term), + __rdmap_term_ecode(term), rv); +out: + kfree(term); + kfree(err_hdr); +} + +/* + * Handle all attrs other than state + */ +static void siw_qp_modify_nonstate(struct siw_qp *qp, + struct siw_qp_attrs *attrs, + enum siw_qp_attr_mask mask) +{ + if (mask & SIW_QP_ATTR_ACCESS_FLAGS) { + if (attrs->flags & SIW_RDMA_BIND_ENABLED) + qp->attrs.flags |= SIW_RDMA_BIND_ENABLED; + else + qp->attrs.flags &= ~SIW_RDMA_BIND_ENABLED; + + if (attrs->flags & SIW_RDMA_WRITE_ENABLED) + qp->attrs.flags |= SIW_RDMA_WRITE_ENABLED; + else + qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED; + + if (attrs->flags & SIW_RDMA_READ_ENABLED) + qp->attrs.flags |= SIW_RDMA_READ_ENABLED; + else + qp->attrs.flags &= ~SIW_RDMA_READ_ENABLED; + } +} + +static int siw_qp_nextstate_from_idle(struct siw_qp *qp, + struct siw_qp_attrs *attrs, + enum siw_qp_attr_mask mask) +{ + int rv = 0; + + switch (attrs->state) { + case SIW_QP_STATE_RTS: + if (attrs->flags & SIW_MPA_CRC) { + rv = siw_qp_enable_crc(qp); + if (rv) + break; + } + if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) { + siw_dbg_qp(qp, "no socket\n"); + rv = -EINVAL; + break; + } + if (!(mask & SIW_QP_ATTR_MPA)) { + siw_dbg_qp(qp, "no MPA\n"); + rv = -EINVAL; + break; + } + /* + * Initialize iWARP TX state + */ + qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0; + qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0; + qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0; + + /* + * Initialize iWARP RX state + */ + qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1; + qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1; + qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1; + + /* + * init IRD free queue, caller has already checked + * limits. + */ + rv = siw_qp_readq_init(qp, attrs->irq_size, + attrs->orq_size); + if (rv) + break; + + qp->attrs.sk = attrs->sk; + qp->attrs.state = SIW_QP_STATE_RTS; + + siw_dbg_qp(qp, "enter RTS: crc=%s, ord=%u, ird=%u\n", + attrs->flags & SIW_MPA_CRC ? "y" : "n", + qp->attrs.orq_size, qp->attrs.irq_size); + break; + + case SIW_QP_STATE_ERROR: + siw_rq_flush(qp); + qp->attrs.state = SIW_QP_STATE_ERROR; + if (qp->cep) { + siw_cep_put(qp->cep); + qp->cep = NULL; + } + break; + + default: + break; + } + return rv; +} + +static int siw_qp_nextstate_from_rts(struct siw_qp *qp, + struct siw_qp_attrs *attrs) +{ + int drop_conn = 0; + + switch (attrs->state) { + case SIW_QP_STATE_CLOSING: + /* + * Verbs: move to IDLE if SQ and ORQ are empty. + * Move to ERROR otherwise. But first of all we must + * close the connection. So we keep CLOSING or ERROR + * as a transient state, schedule connection drop work + * and wait for the socket state change upcall to + * come back closed. + */ + if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) { + qp->attrs.state = SIW_QP_STATE_CLOSING; + } else { + qp->attrs.state = SIW_QP_STATE_ERROR; + siw_sq_flush(qp); + } + siw_rq_flush(qp); + + drop_conn = 1; + break; + + case SIW_QP_STATE_TERMINATE: + qp->attrs.state = SIW_QP_STATE_TERMINATE; + + siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_CATASTROPHIC, + RDMAP_ECODE_UNSPECIFIED, 1); + drop_conn = 1; + break; + + case SIW_QP_STATE_ERROR: + /* + * This is an emergency close. + * + * Any in progress transmit operation will get + * cancelled. + * This will likely result in a protocol failure, + * if a TX operation is in transit. The caller + * could unconditional wait to give the current + * operation a chance to complete. + * Esp., how to handle the non-empty IRQ case? 
+ * The peer was asking for data transfer at a valid + * point in time. + */ + siw_sq_flush(qp); + siw_rq_flush(qp); + qp->attrs.state = SIW_QP_STATE_ERROR; + drop_conn = 1; + break; + + default: + break; + } + return drop_conn; +} + +static void siw_qp_nextstate_from_term(struct siw_qp *qp, + struct siw_qp_attrs *attrs) +{ + switch (attrs->state) { + case SIW_QP_STATE_ERROR: + siw_rq_flush(qp); + qp->attrs.state = SIW_QP_STATE_ERROR; + + if (tx_wqe(qp)->wr_status != SIW_WR_IDLE) + siw_sq_flush(qp); + break; + + default: + break; + } +} + +static int siw_qp_nextstate_from_close(struct siw_qp *qp, + struct siw_qp_attrs *attrs) +{ + int rv = 0; + + switch (attrs->state) { + case SIW_QP_STATE_IDLE: + WARN_ON(tx_wqe(qp)->wr_status != SIW_WR_IDLE); + qp->attrs.state = SIW_QP_STATE_IDLE; + break; + + case SIW_QP_STATE_CLOSING: + /* + * The LLP may already moved the QP to closing + * due to graceful peer close init + */ + break; + + case SIW_QP_STATE_ERROR: + /* + * QP was moved to CLOSING by LLP event + * not yet seen by user. + */ + qp->attrs.state = SIW_QP_STATE_ERROR; + + if (tx_wqe(qp)->wr_status != SIW_WR_IDLE) + siw_sq_flush(qp); + + siw_rq_flush(qp); + break; + + default: + siw_dbg_qp(qp, "state transition undefined: %s => %s\n", + siw_qp_state_to_string[qp->attrs.state], + siw_qp_state_to_string[attrs->state]); + + rv = -ECONNABORTED; + } + return rv; +} + +/* + * Caller must hold qp->state_lock + */ +int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs, + enum siw_qp_attr_mask mask) +{ + int drop_conn = 0, rv = 0; + + if (!mask) + return 0; + + siw_dbg_qp(qp, "state: %s => %s\n", + siw_qp_state_to_string[qp->attrs.state], + siw_qp_state_to_string[attrs->state]); + + if (mask != SIW_QP_ATTR_STATE) + siw_qp_modify_nonstate(qp, attrs, mask); + + if (!(mask & SIW_QP_ATTR_STATE)) + return 0; + + switch (qp->attrs.state) { + case SIW_QP_STATE_IDLE: + case SIW_QP_STATE_RTR: + rv = siw_qp_nextstate_from_idle(qp, attrs, mask); + break; + + case SIW_QP_STATE_RTS: + drop_conn = siw_qp_nextstate_from_rts(qp, attrs); + break; + + case SIW_QP_STATE_TERMINATE: + siw_qp_nextstate_from_term(qp, attrs); + break; + + case SIW_QP_STATE_CLOSING: + siw_qp_nextstate_from_close(qp, attrs); + break; + default: + break; + } + if (drop_conn) + siw_qp_cm_drop(qp, 0); + + return rv; +} + +void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe) +{ + rreq->id = sqe->id; + rreq->opcode = sqe->opcode; + rreq->sge[0].laddr = sqe->sge[0].laddr; + rreq->sge[0].length = sqe->sge[0].length; + rreq->sge[0].lkey = sqe->sge[0].lkey; + rreq->sge[1].lkey = sqe->sge[1].lkey; + rreq->flags = sqe->flags | SIW_WQE_VALID; + rreq->num_sge = 1; +} + +/* + * Must be called with SQ locked. + * To avoid complete SQ starvation by constant inbound READ requests, + * the active IRQ will not be served after qp->irq_burst, if the + * SQ has pending work. 
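+ * After SIW_IRQ_MAXBURST_SQ_ACTIVE IRQ entries have been activated
+ * back to back while SQEs are pending, the burst counter is reset
+ * and the next pending SQE is scheduled instead (see the skip_irq
+ * path below).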
+ */ +int siw_activate_tx(struct siw_qp *qp) +{ + struct siw_sqe *irqe, *sqe; + struct siw_wqe *wqe = tx_wqe(qp); + int rv = 1; + + irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size]; + + if (irqe->flags & SIW_WQE_VALID) { + sqe = sq_get_next(qp); + + /* + * Avoid local WQE processing starvation in case + * of constant inbound READ request stream + */ + if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) { + qp->irq_burst = 0; + goto skip_irq; + } + memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE); + wqe->wr_status = SIW_WR_QUEUED; + + /* start READ RESPONSE */ + wqe->sqe.opcode = SIW_OP_READ_RESPONSE; + wqe->sqe.flags = 0; + if (irqe->num_sge) { + wqe->sqe.num_sge = 1; + wqe->sqe.sge[0].length = irqe->sge[0].length; + wqe->sqe.sge[0].laddr = irqe->sge[0].laddr; + wqe->sqe.sge[0].lkey = irqe->sge[0].lkey; + } else { + wqe->sqe.num_sge = 0; + } + + /* Retain original RREQ's message sequence number for + * potential error reporting cases. + */ + wqe->sqe.sge[1].length = irqe->sge[1].length; + + wqe->sqe.rkey = irqe->rkey; + wqe->sqe.raddr = irqe->raddr; + + wqe->processed = 0; + qp->irq_get++; + + /* mark current IRQ entry free */ + smp_store_mb(irqe->flags, 0); + + goto out; + } + sqe = sq_get_next(qp); + if (sqe) { +skip_irq: + memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE); + wqe->wr_status = SIW_WR_QUEUED; + + /* First copy SQE to kernel private memory */ + memcpy(&wqe->sqe, sqe, sizeof(*sqe)); + + if (wqe->sqe.opcode >= SIW_NUM_OPCODES) { + rv = -EINVAL; + goto out; + } + if (wqe->sqe.flags & SIW_WQE_INLINE) { + if (wqe->sqe.opcode != SIW_OP_SEND && + wqe->sqe.opcode != SIW_OP_WRITE) { + rv = -EINVAL; + goto out; + } + if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) { + rv = -EINVAL; + goto out; + } + wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1]; + wqe->sqe.sge[0].lkey = 0; + wqe->sqe.num_sge = 1; + } + if (wqe->sqe.flags & SIW_WQE_READ_FENCE) { + /* A READ cannot be fenced */ + if (unlikely(wqe->sqe.opcode == SIW_OP_READ || + wqe->sqe.opcode == + SIW_OP_READ_LOCAL_INV)) { + siw_dbg_qp(qp, "cannot fence read\n"); + rv = -EINVAL; + goto out; + } + spin_lock(&qp->orq_lock); + + if (!siw_orq_empty(qp)) { + qp->tx_ctx.orq_fence = 1; + rv = 0; + } + spin_unlock(&qp->orq_lock); + + } else if (wqe->sqe.opcode == SIW_OP_READ || + wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) { + struct siw_sqe *rreq; + + wqe->sqe.num_sge = 1; + + spin_lock(&qp->orq_lock); + + rreq = orq_get_free(qp); + if (rreq) { + /* + * Make an immediate copy in ORQ to be ready + * to process loopback READ reply + */ + siw_read_to_orq(rreq, &wqe->sqe); + qp->orq_put++; + } else { + qp->tx_ctx.orq_fence = 1; + rv = 0; + } + spin_unlock(&qp->orq_lock); + } + + /* Clear SQE, can be re-used by application */ + smp_store_mb(sqe->flags, 0); + qp->sq_get++; + } else { + rv = 0; + } +out: + if (unlikely(rv < 0)) { + siw_dbg_qp(qp, "error %d\n", rv); + wqe->wr_status = SIW_WR_IDLE; + } + return rv; +} + +/* + * Check if current CQ state qualifies for calling CQ completion + * handler. Must be called with CQ lock held. 
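+ * A CQ armed with SIW_NOTIFY_NEXT_COMPLETION fires on any completion,
+ * one armed with SIW_NOTIFY_SOLICITED only on solicited work. The CQ
+ * gets dis-armed before the handler is called, so it fires at most
+ * once per arming.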
+ */ +static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags) +{ + u64 cq_notify; + + if (!cq->base_cq.comp_handler) + return false; + + cq_notify = READ_ONCE(*cq->notify); + + if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) || + ((cq_notify & SIW_NOTIFY_SOLICITED) && + (flags & SIW_WQE_SOLICITED))) { + /* dis-arm CQ */ + smp_store_mb(*cq->notify, SIW_NOTIFY_NOT); + + return true; + } + return false; +} + +int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes, + enum siw_wc_status status) +{ + struct siw_cq *cq = qp->scq; + int rv = 0; + + if (cq) { + u32 sqe_flags = sqe->flags; + struct siw_cqe *cqe; + u32 idx; + unsigned long flags; + + spin_lock_irqsave(&cq->lock, flags); + + idx = cq->cq_put % cq->num_cqe; + cqe = &cq->queue[idx]; + + if (!READ_ONCE(cqe->flags)) { + bool notify; + + cqe->id = sqe->id; + cqe->opcode = sqe->opcode; + cqe->status = status; + cqe->imm_data = 0; + cqe->bytes = bytes; + + if (cq->kernel_verbs) + cqe->base_qp = qp->ib_qp; + else + cqe->qp_id = qp_id(qp); + + /* mark CQE valid for application */ + WRITE_ONCE(cqe->flags, SIW_WQE_VALID); + /* recycle SQE */ + smp_store_mb(sqe->flags, 0); + + cq->cq_put++; + notify = siw_cq_notify_now(cq, sqe_flags); + + spin_unlock_irqrestore(&cq->lock, flags); + + if (notify) { + siw_dbg_cq(cq, "Call completion handler\n"); + cq->base_cq.comp_handler(&cq->base_cq, + cq->base_cq.cq_context); + } + } else { + spin_unlock_irqrestore(&cq->lock, flags); + rv = -ENOMEM; + siw_cq_event(cq, IB_EVENT_CQ_ERR); + } + } else { + /* recycle SQE */ + smp_store_mb(sqe->flags, 0); + } + return rv; +} + +int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes, + u32 inval_stag, enum siw_wc_status status) +{ + struct siw_cq *cq = qp->rcq; + int rv = 0; + + if (cq) { + struct siw_cqe *cqe; + u32 idx; + unsigned long flags; + + spin_lock_irqsave(&cq->lock, flags); + + idx = cq->cq_put % cq->num_cqe; + cqe = &cq->queue[idx]; + + if (!READ_ONCE(cqe->flags)) { + bool notify; + u8 cqe_flags = SIW_WQE_VALID; + + cqe->id = rqe->id; + cqe->opcode = SIW_OP_RECEIVE; + cqe->status = status; + cqe->imm_data = 0; + cqe->bytes = bytes; + + if (cq->kernel_verbs) { + cqe->base_qp = qp->ib_qp; + if (inval_stag) { + cqe_flags |= SIW_WQE_REM_INVAL; + cqe->inval_stag = inval_stag; + } + } else { + cqe->qp_id = qp_id(qp); + } + /* mark CQE valid for application */ + WRITE_ONCE(cqe->flags, cqe_flags); + /* recycle RQE */ + smp_store_mb(rqe->flags, 0); + + cq->cq_put++; + notify = siw_cq_notify_now(cq, SIW_WQE_SIGNALLED); + + spin_unlock_irqrestore(&cq->lock, flags); + + if (notify) { + siw_dbg_cq(cq, "Call completion handler\n"); + cq->base_cq.comp_handler(&cq->base_cq, + cq->base_cq.cq_context); + } + } else { + spin_unlock_irqrestore(&cq->lock, flags); + rv = -ENOMEM; + siw_cq_event(cq, IB_EVENT_CQ_ERR); + } + } else { + /* recycle RQE */ + smp_store_mb(rqe->flags, 0); + } + return rv; +} + +/* + * siw_sq_flush() + * + * Flush SQ and ORRQ entries to CQ. + * + * Must be called with QP state write lock held. + * Therefore, SQ and ORQ lock must not be taken. 
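+ * Flush order: first complete pending ORQ entries, then an
+ * in-progress WQE (if any), then the remaining send queue entries.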
+ */ +void siw_sq_flush(struct siw_qp *qp) +{ + struct siw_sqe *sqe; + struct siw_wqe *wqe = tx_wqe(qp); + int async_event = 0; + + /* + * Start with completing any work currently on the ORQ + */ + while (qp->attrs.orq_size) { + sqe = &qp->orq[qp->orq_get % qp->attrs.orq_size]; + if (!READ_ONCE(sqe->flags)) + break; + + if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0) + break; + + WRITE_ONCE(sqe->flags, 0); + qp->orq_get++; + } + /* + * Flush an in-progress WQE if present + */ + if (wqe->wr_status != SIW_WR_IDLE) { + siw_dbg_qp(qp, "flush current SQE, type %d, status %d\n", + tx_type(wqe), wqe->wr_status); + + siw_wqe_put_mem(wqe, tx_type(wqe)); + + if (tx_type(wqe) != SIW_OP_READ_RESPONSE && + ((tx_type(wqe) != SIW_OP_READ && + tx_type(wqe) != SIW_OP_READ_LOCAL_INV) || + wqe->wr_status == SIW_WR_QUEUED)) + /* + * An in-progress Read Request is already in + * the ORQ + */ + siw_sqe_complete(qp, &wqe->sqe, wqe->bytes, + SIW_WC_WR_FLUSH_ERR); + + wqe->wr_status = SIW_WR_IDLE; + } + /* + * Flush the Send Queue + */ + while (qp->attrs.sq_size) { + sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size]; + if (!READ_ONCE(sqe->flags)) + break; + + async_event = 1; + if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0) + /* + * Shall IB_EVENT_SQ_DRAINED be supressed if work + * completion fails? + */ + break; + + WRITE_ONCE(sqe->flags, 0); + qp->sq_get++; + } + if (async_event) + siw_qp_event(qp, IB_EVENT_SQ_DRAINED); +} + +/* + * siw_rq_flush() + * + * Flush recv queue entries to CQ. Also + * takes care of pending active tagged and untagged + * inbound transfers, which have target memory + * referenced. + * + * Must be called with QP state write lock held. + * Therefore, RQ lock must not be taken. + */ +void siw_rq_flush(struct siw_qp *qp) +{ + struct siw_wqe *wqe = &qp->rx_untagged.wqe_active; + + /* + * Flush an in-progress untagged operation if present + */ + if (wqe->wr_status != SIW_WR_IDLE) { + siw_dbg_qp(qp, "flush current rqe, type %d, status %d\n", + rx_type(wqe), wqe->wr_status); + + siw_wqe_put_mem(wqe, rx_type(wqe)); + + if (rx_type(wqe) == SIW_OP_RECEIVE) { + siw_rqe_complete(qp, &wqe->rqe, wqe->bytes, + 0, SIW_WC_WR_FLUSH_ERR); + } else if (rx_type(wqe) != SIW_OP_READ && + rx_type(wqe) != SIW_OP_READ_RESPONSE && + rx_type(wqe) != SIW_OP_WRITE) { + siw_sqe_complete(qp, &wqe->sqe, 0, SIW_WC_WR_FLUSH_ERR); + } + wqe->wr_status = SIW_WR_IDLE; + } + wqe = &qp->rx_tagged.wqe_active; + + if (wqe->wr_status != SIW_WR_IDLE) { + siw_wqe_put_mem(wqe, rx_type(wqe)); + wqe->wr_status = SIW_WR_IDLE; + } + /* + * Flush the Receive Queue + */ + while (qp->attrs.rq_size) { + struct siw_rqe *rqe = + &qp->recvq[qp->rq_get % qp->attrs.rq_size]; + + if (!READ_ONCE(rqe->flags)) + break; + + if (siw_rqe_complete(qp, rqe, 0, 0, SIW_WC_WR_FLUSH_ERR) != 0) + break; + + WRITE_ONCE(rqe->flags, 0); + qp->rq_get++; + } +} + +int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp) +{ + int rv = xa_alloc(&sdev->qp_xa, &qp->ib_qp->qp_num, qp, xa_limit_32b, + GFP_KERNEL); + + if (!rv) { + kref_init(&qp->ref); + qp->sdev = sdev; + qp->qp_num = qp->ib_qp->qp_num; + siw_dbg_qp(qp, "new QP\n"); + } + return rv; +} + +void siw_free_qp(struct kref *ref) +{ + struct siw_qp *found, *qp = container_of(ref, struct siw_qp, ref); + struct siw_device *sdev = qp->sdev; + unsigned long flags; + + if (qp->cep) + siw_cep_put(qp->cep); + + found = xa_erase(&sdev->qp_xa, qp_id(qp)); + WARN_ON(found != qp); + spin_lock_irqsave(&sdev->lock, flags); + list_del(&qp->devq); + spin_unlock_irqrestore(&sdev->lock, flags); + + 
vfree(qp->sendq); + vfree(qp->recvq); + vfree(qp->irq); + vfree(qp->orq); + + siw_put_tx_cpu(qp->tx_cpu); + + atomic_dec(&sdev->num_qp); + siw_dbg_qp(qp, "free QP\n"); + kfree_rcu(qp, rcu); +} -- cgit v1.2.3 From b9be6f18cf9ed04dd8087cb9d69de6e90d8ceb08 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:30 +0200 Subject: rdma/siw: transmit path Broken up commit to add the Soft iWarp RDMA driver. Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_qp_tx.c | 1269 +++++++++++++++++++++++++++++++++ 1 file changed, 1269 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_qp_tx.c (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c new file mode 100644 index 000000000000..5e926fac51db --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c @@ -0,0 +1,1269 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "siw.h" +#include "siw_verbs.h" +#include "siw_mem.h" + +#define MAX_HDR_INLINE \ + (((uint32_t)(sizeof(struct siw_rreq_pkt) - \ + sizeof(struct iwarp_send))) & 0xF8) + +static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx) +{ + struct siw_pbl *pbl = mem->pbl; + u64 offset = addr - mem->va; + u64 paddr = siw_pbl_get_buffer(pbl, offset, NULL, idx); + + if (paddr) + return virt_to_page(paddr); + + return NULL; +} + +/* + * Copy short payload at provided destination payload address + */ +static int siw_try_1seg(struct siw_iwarp_tx *c_tx, u64 paddr) +{ + struct siw_wqe *wqe = &c_tx->wqe_active; + struct siw_sge *sge = &wqe->sqe.sge[0]; + u32 bytes = sge->length; + + if (bytes > MAX_HDR_INLINE || wqe->sqe.num_sge != 1) + return MAX_HDR_INLINE + 1; + + if (!bytes) + return 0; + + if (tx_flags(wqe) & SIW_WQE_INLINE) { + memcpy((void *)paddr, &wqe->sqe.sge[1], bytes); + } else { + struct siw_mem *mem = wqe->mem[0]; + + if (!mem->mem_obj) { + /* Kernel client using kva */ + memcpy((void *)paddr, (void *)sge->laddr, bytes); + } else if (c_tx->in_syscall) { + if (copy_from_user((void *)paddr, + (const void __user *)sge->laddr, + bytes)) + return -EFAULT; + } else { + unsigned int off = sge->laddr & ~PAGE_MASK; + struct page *p; + char *buffer; + int pbl_idx = 0; + + if (!mem->is_pbl) + p = siw_get_upage(mem->umem, sge->laddr); + else + p = siw_get_pblpage(mem, sge->laddr, &pbl_idx); + + if (unlikely(!p)) + return -EFAULT; + + buffer = kmap_atomic(p); + + if (likely(PAGE_SIZE - off >= bytes)) { + memcpy((void *)paddr, buffer + off, bytes); + kunmap_atomic(buffer); + } else { + unsigned long part = bytes - (PAGE_SIZE - off); + + memcpy((void *)paddr, buffer + off, part); + kunmap_atomic(buffer); + + if (!mem->is_pbl) + p = siw_get_upage(mem->umem, + sge->laddr + part); + else + p = siw_get_pblpage(mem, + sge->laddr + part, + &pbl_idx); + if (unlikely(!p)) + return -EFAULT; + + buffer = kmap_atomic(p); + memcpy((void *)(paddr + part), buffer, + bytes - part); + kunmap_atomic(buffer); + } + } + } + return (int)bytes; +} + +#define PKT_FRAGMENTED 1 +#define PKT_COMPLETE 0 + +/* + * siw_qp_prepare_tx() + * + * Prepare tx state for sending out one fpdu. Builds complete pkt + * if no user data or only immediate data are present. + * + * returns PKT_COMPLETE if complete pkt built, PKT_FRAGMENTED otherwise. 
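+ * A short payload of up to MAX_HDR_INLINE bytes is copied directly
+ * behind the header by siw_try_1seg() and padded up to a multiple of
+ * 4 bytes, e.g. a 7 byte payload gets one pad byte inserted before
+ * the trailing CRC.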
+ */ +static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx) +{ + struct siw_wqe *wqe = &c_tx->wqe_active; + char *crc = NULL; + int data = 0; + + switch (tx_type(wqe)) { + case SIW_OP_READ: + case SIW_OP_READ_LOCAL_INV: + memcpy(&c_tx->pkt.ctrl, + &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl, + sizeof(struct iwarp_ctrl)); + + c_tx->pkt.rreq.rsvd = 0; + c_tx->pkt.rreq.ddp_qn = htonl(RDMAP_UNTAGGED_QN_RDMA_READ); + c_tx->pkt.rreq.ddp_msn = + htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]); + c_tx->pkt.rreq.ddp_mo = 0; + c_tx->pkt.rreq.sink_stag = htonl(wqe->sqe.sge[0].lkey); + c_tx->pkt.rreq.sink_to = + cpu_to_be64(wqe->sqe.sge[0].laddr); + c_tx->pkt.rreq.source_stag = htonl(wqe->sqe.rkey); + c_tx->pkt.rreq.source_to = cpu_to_be64(wqe->sqe.raddr); + c_tx->pkt.rreq.read_size = htonl(wqe->sqe.sge[0].length); + + c_tx->ctrl_len = sizeof(struct iwarp_rdma_rreq); + crc = (char *)&c_tx->pkt.rreq_pkt.crc; + break; + + case SIW_OP_SEND: + if (tx_flags(wqe) & SIW_WQE_SOLICITED) + memcpy(&c_tx->pkt.ctrl, + &iwarp_pktinfo[RDMAP_SEND_SE].ctrl, + sizeof(struct iwarp_ctrl)); + else + memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_SEND].ctrl, + sizeof(struct iwarp_ctrl)); + + c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND; + c_tx->pkt.send.ddp_msn = + htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]); + c_tx->pkt.send.ddp_mo = 0; + + c_tx->pkt.send_inv.inval_stag = 0; + + c_tx->ctrl_len = sizeof(struct iwarp_send); + + crc = (char *)&c_tx->pkt.send_pkt.crc; + data = siw_try_1seg(c_tx, (u64)crc); + break; + + case SIW_OP_SEND_REMOTE_INV: + if (tx_flags(wqe) & SIW_WQE_SOLICITED) + memcpy(&c_tx->pkt.ctrl, + &iwarp_pktinfo[RDMAP_SEND_SE_INVAL].ctrl, + sizeof(struct iwarp_ctrl)); + else + memcpy(&c_tx->pkt.ctrl, + &iwarp_pktinfo[RDMAP_SEND_INVAL].ctrl, + sizeof(struct iwarp_ctrl)); + + c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND; + c_tx->pkt.send.ddp_msn = + htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]); + c_tx->pkt.send.ddp_mo = 0; + + c_tx->pkt.send_inv.inval_stag = cpu_to_be32(wqe->sqe.rkey); + + c_tx->ctrl_len = sizeof(struct iwarp_send_inv); + + crc = (char *)&c_tx->pkt.send_pkt.crc; + data = siw_try_1seg(c_tx, (u64)crc); + break; + + case SIW_OP_WRITE: + memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_RDMA_WRITE].ctrl, + sizeof(struct iwarp_ctrl)); + + c_tx->pkt.rwrite.sink_stag = htonl(wqe->sqe.rkey); + c_tx->pkt.rwrite.sink_to = cpu_to_be64(wqe->sqe.raddr); + c_tx->ctrl_len = sizeof(struct iwarp_rdma_write); + + crc = (char *)&c_tx->pkt.write_pkt.crc; + data = siw_try_1seg(c_tx, (u64)crc); + break; + + case SIW_OP_READ_RESPONSE: + memcpy(&c_tx->pkt.ctrl, + &iwarp_pktinfo[RDMAP_RDMA_READ_RESP].ctrl, + sizeof(struct iwarp_ctrl)); + + /* NBO */ + c_tx->pkt.rresp.sink_stag = cpu_to_be32(wqe->sqe.rkey); + c_tx->pkt.rresp.sink_to = cpu_to_be64(wqe->sqe.raddr); + + c_tx->ctrl_len = sizeof(struct iwarp_rdma_rresp); + + crc = (char *)&c_tx->pkt.write_pkt.crc; + data = siw_try_1seg(c_tx, (u64)crc); + break; + + default: + siw_dbg_qp(tx_qp(c_tx), "stale wqe type %d\n", tx_type(wqe)); + return -EOPNOTSUPP; + } + if (unlikely(data < 0)) + return data; + + c_tx->ctrl_sent = 0; + + if (data <= MAX_HDR_INLINE) { + if (data) { + wqe->processed = data; + + c_tx->pkt.ctrl.mpa_len = + htons(c_tx->ctrl_len + data - MPA_HDR_SIZE); + + /* Add pad, if needed */ + data += -(int)data & 0x3; + /* advance CRC location after payload */ + crc += data; + c_tx->ctrl_len += data; + + if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)) + c_tx->pkt.c_untagged.ddp_mo = 0; + else + c_tx->pkt.c_tagged.ddp_to = + 
cpu_to_be64(wqe->sqe.raddr); + } + + *(u32 *)crc = 0; + /* + * Do complete CRC if enabled and short packet + */ + if (c_tx->mpa_crc_hd) { + crypto_shash_init(c_tx->mpa_crc_hd); + if (crypto_shash_update(c_tx->mpa_crc_hd, + (u8 *)&c_tx->pkt, + c_tx->ctrl_len)) + return -EINVAL; + crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)crc); + } + c_tx->ctrl_len += MPA_CRC_SIZE; + + return PKT_COMPLETE; + } + c_tx->ctrl_len += MPA_CRC_SIZE; + c_tx->sge_idx = 0; + c_tx->sge_off = 0; + c_tx->pbl_idx = 0; + + /* + * Allow direct sending out of user buffer if WR is non signalled + * and payload is over threshold. + * Per RDMA verbs, the application should not change the send buffer + * until the work completed. In iWarp, work completion is only + * local delivery to TCP. TCP may reuse the buffer for + * retransmission. Changing unsent data also breaks the CRC, + * if applied. + */ + if (c_tx->zcopy_tx && wqe->bytes >= SENDPAGE_THRESH && + !(tx_flags(wqe) & SIW_WQE_SIGNALLED)) + c_tx->use_sendpage = 1; + else + c_tx->use_sendpage = 0; + + return PKT_FRAGMENTED; +} + +/* + * Send out one complete control type FPDU, or header of FPDU carrying + * data. Used for fixed sized packets like Read.Requests or zero length + * SENDs, WRITEs, READ.Responses, or header only. + */ +static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s, + int flags) +{ + struct msghdr msg = { .msg_flags = flags }; + struct kvec iov = { .iov_base = + (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent, + .iov_len = c_tx->ctrl_len - c_tx->ctrl_sent }; + + int rv = kernel_sendmsg(s, &msg, &iov, 1, + c_tx->ctrl_len - c_tx->ctrl_sent); + + if (rv >= 0) { + c_tx->ctrl_sent += rv; + + if (c_tx->ctrl_sent == c_tx->ctrl_len) + rv = 0; + else + rv = -EAGAIN; + } + return rv; +} + +/* + * 0copy TCP transmit interface: Use do_tcp_sendpages. + * + * Using sendpage to push page by page appears to be less efficient + * than using sendmsg, even if data are copied. + * + * A general performance limitation might be the extra four bytes + * trailer checksum segment to be pushed after user data. + */ +static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset, + size_t size) +{ + struct sock *sk = s->sk; + int i = 0, rv = 0, sent = 0, + flags = MSG_MORE | MSG_DONTWAIT | MSG_SENDPAGE_NOTLAST; + + while (size) { + size_t bytes = min_t(size_t, PAGE_SIZE - offset, size); + + if (size + offset <= PAGE_SIZE) + flags = MSG_MORE | MSG_DONTWAIT; + + tcp_rate_check_app_limited(sk); +try_page_again: + lock_sock(sk); + rv = do_tcp_sendpages(sk, page[i], offset, bytes, flags); + release_sock(sk); + + if (rv > 0) { + size -= rv; + sent += rv; + if (rv != bytes) { + offset += rv; + bytes -= rv; + goto try_page_again; + } + offset = 0; + } else { + if (rv == -EAGAIN || rv == 0) + break; + return rv; + } + i++; + } + return sent; +} + +/* + * siw_0copy_tx() + * + * Pushes list of pages to TCP socket. If pages from multiple + * SGE's, all referenced pages of each SGE are pushed in one + * shot. 
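+ * Returns the number of bytes pushed to the TCP socket, or a
+ * negative error code if transmission fails.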
+ */ +static int siw_0copy_tx(struct socket *s, struct page **page, + struct siw_sge *sge, unsigned int offset, + unsigned int size) +{ + int i = 0, sent = 0, rv; + int sge_bytes = min(sge->length - offset, size); + + offset = (sge->laddr + offset) & ~PAGE_MASK; + + while (sent != size) { + rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes); + if (rv >= 0) { + sent += rv; + if (size == sent || sge_bytes > rv) + break; + + i += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT; + sge++; + sge_bytes = min(sge->length, size - sent); + offset = sge->laddr & ~PAGE_MASK; + } else { + sent = rv; + break; + } + } + return sent; +} + +#define MAX_TRAILER (MPA_CRC_SIZE + 4) + +static void siw_unmap_pages(struct page **pages, int hdr_len, int num_maps) +{ + if (hdr_len) { + ++pages; + --num_maps; + } + while (num_maps-- > 0) { + kunmap(*pages); + pages++; + } +} + +/* + * siw_tx_hdt() tries to push a complete packet to TCP where all + * packet fragments are referenced by the elements of one iovec. + * For the data portion, each involved page must be referenced by + * one extra element. All sge's data can be non-aligned to page + * boundaries. Two more elements are referencing iWARP header + * and trailer: + * MAX_ARRAY = 64KB/PAGE_SIZE + 1 + (2 * (SIW_MAX_SGE - 1) + HDR + TRL + */ +#define MAX_ARRAY ((0xffff / PAGE_SIZE) + 1 + (2 * (SIW_MAX_SGE - 1) + 2)) + +/* + * Write out iov referencing hdr, data and trailer of current FPDU. + * Update transmit state dependent on write return status + */ +static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) +{ + struct siw_wqe *wqe = &c_tx->wqe_active; + struct siw_sge *sge = &wqe->sqe.sge[c_tx->sge_idx]; + struct kvec iov[MAX_ARRAY]; + struct page *page_array[MAX_ARRAY]; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; + + int seg = 0, do_crc = c_tx->do_crc, is_kva = 0, rv; + unsigned int data_len = c_tx->bytes_unsent, hdr_len = 0, trl_len = 0, + sge_off = c_tx->sge_off, sge_idx = c_tx->sge_idx, + pbl_idx = c_tx->pbl_idx; + + if (c_tx->state == SIW_SEND_HDR) { + if (c_tx->use_sendpage) { + rv = siw_tx_ctrl(c_tx, s, MSG_DONTWAIT | MSG_MORE); + if (rv) + goto done; + + c_tx->state = SIW_SEND_DATA; + } else { + iov[0].iov_base = + (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent; + iov[0].iov_len = hdr_len = + c_tx->ctrl_len - c_tx->ctrl_sent; + seg = 1; + } + } + + wqe->processed += data_len; + + while (data_len) { /* walk the list of SGE's */ + unsigned int sge_len = min(sge->length - sge_off, data_len); + unsigned int fp_off = (sge->laddr + sge_off) & ~PAGE_MASK; + struct siw_mem *mem; + + if (!(tx_flags(wqe) & SIW_WQE_INLINE)) { + mem = wqe->mem[sge_idx]; + if (!mem->mem_obj) + is_kva = 1; + } else { + is_kva = 1; + } + if (is_kva && !c_tx->use_sendpage) { + /* + * tx from kernel virtual address: either inline data + * or memory region with assigned kernel buffer + */ + iov[seg].iov_base = (void *)(sge->laddr + sge_off); + iov[seg].iov_len = sge_len; + + if (do_crc) + crypto_shash_update(c_tx->mpa_crc_hd, + iov[seg].iov_base, + sge_len); + sge_off += sge_len; + data_len -= sge_len; + seg++; + goto sge_done; + } + + while (sge_len) { + size_t plen = min((int)PAGE_SIZE - fp_off, sge_len); + + if (!is_kva) { + struct page *p; + + if (mem->is_pbl) + p = siw_get_pblpage( + mem, sge->laddr + sge_off, + &pbl_idx); + else + p = siw_get_upage(mem->umem, + sge->laddr + sge_off); + if (unlikely(!p)) { + if (hdr_len) + seg--; + if (!c_tx->use_sendpage && seg) { + siw_unmap_pages(page_array, + hdr_len, seg); + } + wqe->processed -= c_tx->bytes_unsent; + 
rv = -EFAULT; + goto done_crc; + } + page_array[seg] = p; + + if (!c_tx->use_sendpage) { + iov[seg].iov_base = kmap(p) + fp_off; + iov[seg].iov_len = plen; + if (do_crc) + crypto_shash_update( + c_tx->mpa_crc_hd, + iov[seg].iov_base, + plen); + } else if (do_crc) + crypto_shash_update( + c_tx->mpa_crc_hd, + page_address(p) + fp_off, + plen); + } else { + u64 pa = ((sge->laddr + sge_off) & PAGE_MASK); + + page_array[seg] = virt_to_page(pa); + if (do_crc) + crypto_shash_update( + c_tx->mpa_crc_hd, + (void *)(sge->laddr + sge_off), + plen); + } + + sge_len -= plen; + sge_off += plen; + data_len -= plen; + fp_off = 0; + + if (++seg > (int)MAX_ARRAY) { + siw_dbg_qp(tx_qp(c_tx), "to many fragments\n"); + if (!is_kva && !c_tx->use_sendpage) { + siw_unmap_pages(page_array, hdr_len, + seg - 1); + } + wqe->processed -= c_tx->bytes_unsent; + rv = -EMSGSIZE; + goto done_crc; + } + } +sge_done: + /* Update SGE variables at end of SGE */ + if (sge_off == sge->length && + (data_len != 0 || wqe->processed < wqe->bytes)) { + sge_idx++; + sge++; + sge_off = 0; + } + } + /* trailer */ + if (likely(c_tx->state != SIW_SEND_TRAILER)) { + iov[seg].iov_base = &c_tx->trailer.pad[4 - c_tx->pad]; + iov[seg].iov_len = trl_len = MAX_TRAILER - (4 - c_tx->pad); + } else { + iov[seg].iov_base = &c_tx->trailer.pad[c_tx->ctrl_sent]; + iov[seg].iov_len = trl_len = MAX_TRAILER - c_tx->ctrl_sent; + } + + if (c_tx->pad) { + *(u32 *)c_tx->trailer.pad = 0; + if (do_crc) + crypto_shash_update(c_tx->mpa_crc_hd, + (u8 *)&c_tx->trailer.crc - c_tx->pad, + c_tx->pad); + } + if (!c_tx->mpa_crc_hd) + c_tx->trailer.crc = 0; + else if (do_crc) + crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)&c_tx->trailer.crc); + + data_len = c_tx->bytes_unsent; + + if (c_tx->use_sendpage) { + rv = siw_0copy_tx(s, page_array, &wqe->sqe.sge[c_tx->sge_idx], + c_tx->sge_off, data_len); + if (rv == data_len) { + rv = kernel_sendmsg(s, &msg, &iov[seg], 1, trl_len); + if (rv > 0) + rv += data_len; + else + rv = data_len; + } + } else { + rv = kernel_sendmsg(s, &msg, iov, seg + 1, + hdr_len + data_len + trl_len); + if (!is_kva) + siw_unmap_pages(page_array, hdr_len, seg); + } + if (rv < (int)hdr_len) { + /* Not even complete hdr pushed or negative rv */ + wqe->processed -= data_len; + if (rv >= 0) { + c_tx->ctrl_sent += rv; + rv = -EAGAIN; + } + goto done_crc; + } + rv -= hdr_len; + + if (rv >= (int)data_len) { + /* all user data pushed to TCP or no data to push */ + if (data_len > 0 && wqe->processed < wqe->bytes) { + /* Save the current state for next tx */ + c_tx->sge_idx = sge_idx; + c_tx->sge_off = sge_off; + c_tx->pbl_idx = pbl_idx; + } + rv -= data_len; + + if (rv == trl_len) /* all pushed */ + rv = 0; + else { + c_tx->state = SIW_SEND_TRAILER; + c_tx->ctrl_len = MAX_TRAILER; + c_tx->ctrl_sent = rv + 4 - c_tx->pad; + c_tx->bytes_unsent = 0; + rv = -EAGAIN; + } + + } else if (data_len > 0) { + /* Maybe some user data pushed to TCP */ + c_tx->state = SIW_SEND_DATA; + wqe->processed -= data_len - rv; + + if (rv) { + /* + * Some bytes out. 
Recompute tx state based + * on old state and bytes pushed + */ + unsigned int sge_unsent; + + c_tx->bytes_unsent -= rv; + sge = &wqe->sqe.sge[c_tx->sge_idx]; + sge_unsent = sge->length - c_tx->sge_off; + + while (sge_unsent <= rv) { + rv -= sge_unsent; + c_tx->sge_idx++; + c_tx->sge_off = 0; + sge++; + sge_unsent = sge->length; + } + c_tx->sge_off += rv; + } + rv = -EAGAIN; + } +done_crc: + c_tx->do_crc = 0; +done: + return rv; +} + +static void siw_update_tcpseg(struct siw_iwarp_tx *c_tx, + struct socket *s) +{ + struct tcp_sock *tp = tcp_sk(s->sk); + + if (tp->gso_segs) { + if (c_tx->gso_seg_limit == 0) + c_tx->tcp_seglen = tp->mss_cache * tp->gso_segs; + else + c_tx->tcp_seglen = + tp->mss_cache * + min_t(u16, c_tx->gso_seg_limit, tp->gso_segs); + } else { + c_tx->tcp_seglen = tp->mss_cache; + } + /* Loopback may give odd numbers */ + c_tx->tcp_seglen &= 0xfffffff8; +} + +/* + * siw_prepare_fpdu() + * + * Prepares transmit context to send out one FPDU if FPDU will contain + * user data and user data are not immediate data. + * Computes maximum FPDU length to fill up TCP MSS if possible. + * + * @qp: QP from which to transmit + * @wqe: Current WQE causing transmission + * + * TODO: Take into account real available sendspace on socket + * to avoid header misalignment due to send pausing within + * fpdu transmission + */ +static void siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe) +{ + struct siw_iwarp_tx *c_tx = &qp->tx_ctx; + int data_len; + + c_tx->ctrl_len = + iwarp_pktinfo[__rdmap_get_opcode(&c_tx->pkt.ctrl)].hdr_len; + c_tx->ctrl_sent = 0; + + /* + * Update target buffer offset if any + */ + if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)) + /* Untagged message */ + c_tx->pkt.c_untagged.ddp_mo = cpu_to_be32(wqe->processed); + else /* Tagged message */ + c_tx->pkt.c_tagged.ddp_to = + cpu_to_be64(wqe->sqe.raddr + wqe->processed); + + data_len = wqe->bytes - wqe->processed; + if (data_len + c_tx->ctrl_len + MPA_CRC_SIZE > c_tx->tcp_seglen) { + /* Trim DDP payload to fit into current TCP segment */ + data_len = c_tx->tcp_seglen - (c_tx->ctrl_len + MPA_CRC_SIZE); + c_tx->pkt.ctrl.ddp_rdmap_ctrl &= ~DDP_FLAG_LAST; + c_tx->pad = 0; + } else { + c_tx->pkt.ctrl.ddp_rdmap_ctrl |= DDP_FLAG_LAST; + c_tx->pad = -data_len & 0x3; + } + c_tx->bytes_unsent = data_len; + + c_tx->pkt.ctrl.mpa_len = + htons(c_tx->ctrl_len + data_len - MPA_HDR_SIZE); + + /* + * Init MPA CRC computation + */ + if (c_tx->mpa_crc_hd) { + crypto_shash_init(c_tx->mpa_crc_hd); + crypto_shash_update(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt, + c_tx->ctrl_len); + c_tx->do_crc = 1; + } +} + +/* + * siw_check_sgl_tx() + * + * Check permissions for a list of SGE's (SGL). + * A successful check will have all memory referenced + * for transmission resolved and assigned to the WQE. 
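+ * Returns the total number of bytes referenced by the SGL, or a
+ * negative error code if an SGE fails the access check or the SGE
+ * count exceeds SIW_MAX_SGE.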
+ * + * @pd: Protection Domain SGL should belong to + * @wqe: WQE to be checked + * @perms: requested access permissions + * + */ + +static int siw_check_sgl_tx(struct ib_pd *pd, struct siw_wqe *wqe, + enum ib_access_flags perms) +{ + struct siw_sge *sge = &wqe->sqe.sge[0]; + int i, len, num_sge = wqe->sqe.num_sge; + + if (unlikely(num_sge > SIW_MAX_SGE)) + return -EINVAL; + + for (i = 0, len = 0; num_sge; num_sge--, i++, sge++) { + /* + * rdma verbs: do not check stag for a zero length sge + */ + if (sge->length) { + int rv = siw_check_sge(pd, sge, &wqe->mem[i], perms, 0, + sge->length); + + if (unlikely(rv != E_ACCESS_OK)) + return rv; + } + len += sge->length; + } + return len; +} + +/* + * siw_qp_sq_proc_tx() + * + * Process one WQE which needs transmission on the wire. + */ +static int siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe) +{ + struct siw_iwarp_tx *c_tx = &qp->tx_ctx; + struct socket *s = qp->attrs.sk; + int rv = 0, burst_len = qp->tx_ctx.burst; + enum rdmap_ecode ecode = RDMAP_ECODE_CATASTROPHIC_STREAM; + + if (unlikely(wqe->wr_status == SIW_WR_IDLE)) + return 0; + + if (!burst_len) + burst_len = SQ_USER_MAXBURST; + + if (wqe->wr_status == SIW_WR_QUEUED) { + if (!(wqe->sqe.flags & SIW_WQE_INLINE)) { + if (tx_type(wqe) == SIW_OP_READ_RESPONSE) + wqe->sqe.num_sge = 1; + + if (tx_type(wqe) != SIW_OP_READ && + tx_type(wqe) != SIW_OP_READ_LOCAL_INV) { + /* + * Reference memory to be tx'd w/o checking + * access for LOCAL_READ permission, since + * not defined in RDMA core. + */ + rv = siw_check_sgl_tx(qp->pd, wqe, 0); + if (rv < 0) { + if (tx_type(wqe) == + SIW_OP_READ_RESPONSE) + ecode = siw_rdmap_error(-rv); + rv = -EINVAL; + goto tx_error; + } + wqe->bytes = rv; + } else { + wqe->bytes = 0; + } + } else { + wqe->bytes = wqe->sqe.sge[0].length; + if (!qp->kernel_verbs) { + if (wqe->bytes > SIW_MAX_INLINE) { + rv = -EINVAL; + goto tx_error; + } + wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1]; + } + } + wqe->wr_status = SIW_WR_INPROGRESS; + wqe->processed = 0; + + siw_update_tcpseg(c_tx, s); + + rv = siw_qp_prepare_tx(c_tx); + if (rv == PKT_FRAGMENTED) { + c_tx->state = SIW_SEND_HDR; + siw_prepare_fpdu(qp, wqe); + } else if (rv == PKT_COMPLETE) { + c_tx->state = SIW_SEND_SHORT_FPDU; + } else { + goto tx_error; + } + } + +next_segment: + siw_dbg_qp(qp, "wr type %d, state %d, data %u, sent %u, id %llx\n", + tx_type(wqe), wqe->wr_status, wqe->bytes, wqe->processed, + wqe->sqe.id); + + if (--burst_len == 0) { + rv = -EINPROGRESS; + goto tx_done; + } + if (c_tx->state == SIW_SEND_SHORT_FPDU) { + enum siw_opcode tx_type = tx_type(wqe); + unsigned int msg_flags; + + if (siw_sq_empty(qp) || !siw_tcp_nagle || burst_len == 1) + /* + * End current TCP segment, if SQ runs empty, + * or siw_tcp_nagle is not set, or we bail out + * soon due to no burst credit left. + */ + msg_flags = MSG_DONTWAIT; + else + msg_flags = MSG_DONTWAIT | MSG_MORE; + + rv = siw_tx_ctrl(c_tx, s, msg_flags); + + if (!rv && tx_type != SIW_OP_READ && + tx_type != SIW_OP_READ_LOCAL_INV) + wqe->processed = wqe->bytes; + + goto tx_done; + + } else { + rv = siw_tx_hdt(c_tx, s); + } + if (!rv) { + /* + * One segment sent. Processing completed if last + * segment, Do next segment otherwise. 
+ */ + if (unlikely(c_tx->tx_suspend)) { + /* + * Verbs, 6.4.: Try stopping sending after a full + * DDP segment if the connection goes down + * (== peer halfclose) + */ + rv = -ECONNABORTED; + goto tx_done; + } + if (c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_LAST) { + siw_dbg_qp(qp, "WQE completed\n"); + goto tx_done; + } + c_tx->state = SIW_SEND_HDR; + + siw_update_tcpseg(c_tx, s); + + siw_prepare_fpdu(qp, wqe); + goto next_segment; + } +tx_done: + qp->tx_ctx.burst = burst_len; + return rv; + +tx_error: + if (ecode != RDMAP_ECODE_CATASTROPHIC_STREAM) + siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_REMOTE_PROTECTION, ecode, 1); + else + siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_CATASTROPHIC, + RDMAP_ECODE_UNSPECIFIED, 1); + return rv; +} + +static int siw_fastreg_mr(struct ib_pd *pd, struct siw_sqe *sqe) +{ + struct ib_mr *base_mr = (struct ib_mr *)sqe->base_mr; + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mem *mem = siw_mem_id2obj(sdev, sqe->rkey >> 8); + int rv = 0; + + siw_dbg_pd(pd, "STag 0x%08x\n", sqe->rkey); + + if (unlikely(!mem || !base_mr)) { + pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey); + return -EINVAL; + } + if (unlikely(base_mr->rkey >> 8 != sqe->rkey >> 8)) { + pr_warn("siw: fastreg: STag 0x%08x: bad MR\n", sqe->rkey); + rv = -EINVAL; + goto out; + } + if (unlikely(mem->pd != pd)) { + pr_warn("siw: fastreg: PD mismatch\n"); + rv = -EINVAL; + goto out; + } + if (unlikely(mem->stag_valid)) { + pr_warn("siw: fastreg: STag 0x%08x already valid\n", sqe->rkey); + rv = -EINVAL; + goto out; + } + /* Refresh STag since user may have changed key part */ + mem->stag = sqe->rkey; + mem->perms = sqe->access; + + siw_dbg_mem(mem, "STag now valid, MR va: 0x%016llx -> 0x%016llx\n", + mem->va, base_mr->iova); + mem->va = base_mr->iova; + mem->stag_valid = 1; +out: + siw_mem_put(mem); + return rv; +} + +static int siw_qp_sq_proc_local(struct siw_qp *qp, struct siw_wqe *wqe) +{ + int rv; + + switch (tx_type(wqe)) { + case SIW_OP_REG_MR: + rv = siw_fastreg_mr(qp->pd, &wqe->sqe); + break; + + case SIW_OP_INVAL_STAG: + rv = siw_invalidate_stag(qp->pd, wqe->sqe.rkey); + break; + + default: + rv = -EINVAL; + } + return rv; +} + +/* + * siw_qp_sq_process() + * + * Core TX path routine for RDMAP/DDP/MPA using a TCP kernel socket. + * Sends RDMAP payload for the current SQ WR @wqe of @qp in one or more + * MPA FPDUs, each containing a DDP segment. + * + * SQ processing may occur in user context as a result of posting + * new WQE's or from siw_sq_work_handler() context. Processing in + * user context is limited to non-kernel verbs users. + * + * SQ processing may get paused anytime, possibly in the middle of a WR + * or FPDU, if insufficient send space is available. SQ processing + * gets resumed from siw_sq_work_handler(), if send space becomes + * available again. + * + * Must be called with the QP state read-locked. + * + * Note: + * An outbound RREQ can be satisfied by the corresponding RRESP + * _before_ it gets assigned to the ORQ. This happens regularly + * in RDMA READ via loopback case. Since both outbound RREQ and + * inbound RRESP can be handled by the same CPU, locking the ORQ + * is dead-lock prone and thus not an option. With that, the + * RREQ gets assigned to the ORQ _before_ being sent - see + * siw_activate_tx() - and pulled back in case of send failure. 
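+ * A WQE paused with -EAGAIN is resumed later from
+ * siw_sq_work_handler() once send space becomes available again,
+ * while -EINPROGRESS (TX burst credit exhausted) reschedules SQ
+ * processing via siw_sq_start().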
+ */ +int siw_qp_sq_process(struct siw_qp *qp) +{ + struct siw_wqe *wqe = tx_wqe(qp); + enum siw_opcode tx_type; + unsigned long flags; + int rv = 0; + + siw_dbg_qp(qp, "enter for type %d\n", tx_type(wqe)); + +next_wqe: + /* + * Stop QP processing if SQ state changed + */ + if (unlikely(qp->tx_ctx.tx_suspend)) { + siw_dbg_qp(qp, "tx suspended\n"); + goto done; + } + tx_type = tx_type(wqe); + + if (tx_type <= SIW_OP_READ_RESPONSE) + rv = siw_qp_sq_proc_tx(qp, wqe); + else + rv = siw_qp_sq_proc_local(qp, wqe); + + if (!rv) { + /* + * WQE processing done + */ + switch (tx_type) { + case SIW_OP_SEND: + case SIW_OP_SEND_REMOTE_INV: + case SIW_OP_WRITE: + siw_wqe_put_mem(wqe, tx_type); + case SIW_OP_INVAL_STAG: + case SIW_OP_REG_MR: + if (tx_flags(wqe) & SIW_WQE_SIGNALLED) + siw_sqe_complete(qp, &wqe->sqe, wqe->bytes, + SIW_WC_SUCCESS); + break; + + case SIW_OP_READ: + case SIW_OP_READ_LOCAL_INV: + /* + * already enqueued to ORQ queue + */ + break; + + case SIW_OP_READ_RESPONSE: + siw_wqe_put_mem(wqe, tx_type); + break; + + default: + WARN(1, "undefined WQE type %d\n", tx_type); + rv = -EINVAL; + goto done; + } + + spin_lock_irqsave(&qp->sq_lock, flags); + wqe->wr_status = SIW_WR_IDLE; + rv = siw_activate_tx(qp); + spin_unlock_irqrestore(&qp->sq_lock, flags); + + if (rv <= 0) + goto done; + + goto next_wqe; + + } else if (rv == -EAGAIN) { + siw_dbg_qp(qp, "sq paused: hd/tr %d of %d, data %d\n", + qp->tx_ctx.ctrl_sent, qp->tx_ctx.ctrl_len, + qp->tx_ctx.bytes_unsent); + rv = 0; + goto done; + } else if (rv == -EINPROGRESS) { + rv = siw_sq_start(qp); + goto done; + } else { + /* + * WQE processing failed. + * Verbs 8.3.2: + * o It turns any WQE into a signalled WQE. + * o Local catastrophic error must be surfaced + * o QP must be moved into Terminate state: done by code + * doing socket state change processing + * + * o TODO: Termination message must be sent. + * o TODO: Implement more precise work completion errors, + * see enum ib_wc_status in ib_verbs.h + */ + siw_dbg_qp(qp, "wqe type %d processing failed: %d\n", + tx_type(wqe), rv); + + spin_lock_irqsave(&qp->sq_lock, flags); + /* + * RREQ may have already been completed by inbound RRESP! + */ + if (tx_type == SIW_OP_READ || + tx_type == SIW_OP_READ_LOCAL_INV) { + /* Cleanup pending entry in ORQ */ + qp->orq_put--; + qp->orq[qp->orq_put % qp->attrs.orq_size].flags = 0; + } + spin_unlock_irqrestore(&qp->sq_lock, flags); + /* + * immediately suspends further TX processing + */ + if (!qp->tx_ctx.tx_suspend) + siw_qp_cm_drop(qp, 0); + + switch (tx_type) { + case SIW_OP_SEND: + case SIW_OP_SEND_REMOTE_INV: + case SIW_OP_SEND_WITH_IMM: + case SIW_OP_WRITE: + case SIW_OP_READ: + case SIW_OP_READ_LOCAL_INV: + siw_wqe_put_mem(wqe, tx_type); + case SIW_OP_INVAL_STAG: + case SIW_OP_REG_MR: + siw_sqe_complete(qp, &wqe->sqe, wqe->bytes, + SIW_WC_LOC_QP_OP_ERR); + + siw_qp_event(qp, IB_EVENT_QP_FATAL); + + break; + + case SIW_OP_READ_RESPONSE: + siw_dbg_qp(qp, "proc. 
read.response failed: %d\n", rv); + + siw_qp_event(qp, IB_EVENT_QP_REQ_ERR); + + siw_wqe_put_mem(wqe, SIW_OP_READ_RESPONSE); + + break; + + default: + WARN(1, "undefined WQE type %d\n", tx_type); + rv = -EINVAL; + } + wqe->wr_status = SIW_WR_IDLE; + } +done: + return rv; +} + +static void siw_sq_resume(struct siw_qp *qp) +{ + if (down_read_trylock(&qp->state_lock)) { + if (likely(qp->attrs.state == SIW_QP_STATE_RTS && + !qp->tx_ctx.tx_suspend)) { + int rv = siw_qp_sq_process(qp); + + up_read(&qp->state_lock); + + if (unlikely(rv < 0)) { + siw_dbg_qp(qp, "SQ task failed: err %d\n", rv); + + if (!qp->tx_ctx.tx_suspend) + siw_qp_cm_drop(qp, 0); + } + } else { + up_read(&qp->state_lock); + } + } else { + siw_dbg_qp(qp, "Resume SQ while QP locked\n"); + } + siw_qp_put(qp); +} + +struct tx_task_t { + struct llist_head active; + wait_queue_head_t waiting; +}; + +static DEFINE_PER_CPU(struct tx_task_t, tx_task_g); + +void siw_stop_tx_thread(int nr_cpu) +{ + kthread_stop(siw_tx_thread[nr_cpu]); + wake_up(&per_cpu(tx_task_g, nr_cpu).waiting); +} + +int siw_run_sq(void *data) +{ + const int nr_cpu = (unsigned int)(long)data; + struct llist_node *active; + struct siw_qp *qp; + struct tx_task_t *tx_task = &per_cpu(tx_task_g, nr_cpu); + + init_llist_head(&tx_task->active); + init_waitqueue_head(&tx_task->waiting); + + pr_info("Started siw TX thread on CPU %u\n", nr_cpu); + + while (1) { + struct llist_node *fifo_list = NULL; + + wait_event_interruptible(tx_task->waiting, + !llist_empty(&tx_task->active) || + kthread_should_stop()); + + if (kthread_should_stop()) + break; + + active = llist_del_all(&tx_task->active); + /* + * llist_del_all returns a list with newest entry first. + * Re-order list for fairness among QP's. + */ + while (active) { + struct llist_node *tmp = active; + + active = llist_next(active); + tmp->next = fifo_list; + fifo_list = tmp; + } + while (fifo_list) { + qp = container_of(fifo_list, struct siw_qp, tx_list); + fifo_list = llist_next(fifo_list); + qp->tx_list.next = NULL; + + siw_sq_resume(qp); + } + } + active = llist_del_all(&tx_task->active); + if (active) { + llist_for_each_entry(qp, active, tx_list) { + qp->tx_list.next = NULL; + siw_sq_resume(qp); + } + } + pr_info("Stopped siw TX thread on CPU %u\n", nr_cpu); + + return 0; +} + +int siw_sq_start(struct siw_qp *qp) +{ + if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) + return 0; + + if (unlikely(!cpu_online(qp->tx_cpu))) { + siw_put_tx_cpu(qp->tx_cpu); + qp->tx_cpu = siw_get_tx_cpu(qp->sdev); + if (qp->tx_cpu < 0) { + pr_warn("siw: no tx cpu available\n"); + + return -EIO; + } + } + siw_qp_get(qp); + + llist_add(&qp->tx_list, &per_cpu(tx_task_g, qp->tx_cpu).active); + + wake_up(&per_cpu(tx_task_g, qp->tx_cpu).waiting); + + return 0; +} -- cgit v1.2.3 From 8b6a361b8c482f22ac99c3273285ff16b23fba91 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:31 +0200 Subject: rdma/siw: receive path Broken up commit to add the Soft iWarp RDMA driver. 
Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_qp_rx.c | 1456 +++++++++++++++++++++++++++++++++ 1 file changed, 1456 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_qp_rx.c (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c new file mode 100644 index 000000000000..682a290bc11e --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_qp_rx.c @@ -0,0 +1,1456 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include +#include +#include +#include +#include + +#include +#include + +#include "siw.h" +#include "siw_verbs.h" +#include "siw_mem.h" + +/* + * siw_rx_umem() + * + * Receive data of @len into target referenced by @dest_addr. + * + * @srx: Receive Context + * @umem: siw representation of target memory + * @dest_addr: user virtual address + * @len: number of bytes to place + */ +static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem, + u64 dest_addr, int len) +{ + int copied = 0; + + while (len) { + struct page *p; + int pg_off, bytes, rv; + void *dest; + + p = siw_get_upage(umem, dest_addr); + if (unlikely(!p)) { + pr_warn("siw: %s: [QP %u]: bogus addr: %p, %p\n", + __func__, qp_id(rx_qp(srx)), + (void *)dest_addr, (void *)umem->fp_addr); + /* siw internal error */ + srx->skb_copied += copied; + srx->skb_new -= copied; + + return -EFAULT; + } + pg_off = dest_addr & ~PAGE_MASK; + bytes = min(len, (int)PAGE_SIZE - pg_off); + + siw_dbg_qp(rx_qp(srx), "page %p, bytes=%u\n", p, bytes); + + dest = kmap_atomic(p); + rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off, + bytes); + + if (unlikely(rv)) { + kunmap_atomic(dest); + srx->skb_copied += copied; + srx->skb_new -= copied; + + pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n", + qp_id(rx_qp(srx)), __func__, len, p, rv); + + return -EFAULT; + } + if (srx->mpa_crc_hd) { + if (rx_qp(srx)->kernel_verbs) { + crypto_shash_update(srx->mpa_crc_hd, + (u8 *)(dest + pg_off), bytes); + kunmap_atomic(dest); + } else { + kunmap_atomic(dest); + /* + * Do CRC on original, not target buffer. + * Some user land applications may + * concurrently write the target buffer, + * which would yield a broken CRC. + * Walking the skb twice is very ineffcient. + * Folding the CRC into skb_copy_bits() + * would be much better, but is currently + * not supported. 
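+				 * siw_crc_skb() therefore updates the
+				 * CRC from the skb bytes just consumed,
+				 * not from the target buffer.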
+ */ + siw_crc_skb(srx, bytes); + } + } else { + kunmap_atomic(dest); + } + srx->skb_offset += bytes; + copied += bytes; + len -= bytes; + dest_addr += bytes; + pg_off = 0; + } + srx->skb_copied += copied; + srx->skb_new -= copied; + + return copied; +} + +static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len) +{ + int rv; + + siw_dbg_qp(rx_qp(srx), "kva: 0x%p, len: %u\n", kva, len); + + rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len); + if (unlikely(rv)) { + pr_warn("siw: [QP %u]: %s, len %d, kva 0x%p, rv %d\n", + qp_id(rx_qp(srx)), __func__, len, kva, rv); + + return rv; + } + if (srx->mpa_crc_hd) + crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len); + + srx->skb_offset += len; + srx->skb_copied += len; + srx->skb_new -= len; + + return len; +} + +static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx, + struct siw_mem *mem, u64 addr, int len) +{ + struct siw_pbl *pbl = mem->pbl; + u64 offset = addr - mem->va; + int copied = 0; + + while (len) { + int bytes; + u64 buf_addr = + siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx); + if (!buf_addr) + break; + + bytes = min(bytes, len); + if (siw_rx_kva(srx, (void *)buf_addr, bytes) == bytes) { + copied += bytes; + offset += bytes; + len -= bytes; + } else { + break; + } + } + return copied; +} + +/* + * siw_rresp_check_ntoh() + * + * Check incoming RRESP fragment header against expected + * header values and update expected values for potential next + * fragment. + * + * NOTE: This function must be called only if a RRESP DDP segment + * starts but not for fragmented consecutive pieces of an + * already started DDP segment. + */ +static int siw_rresp_check_ntoh(struct siw_rx_stream *srx, + struct siw_rx_fpdu *frx) +{ + struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp; + struct siw_wqe *wqe = &frx->wqe_active; + enum ddp_ecode ecode; + + u32 sink_stag = be32_to_cpu(rresp->sink_stag); + u64 sink_to = be64_to_cpu(rresp->sink_to); + + if (frx->first_ddp_seg) { + srx->ddp_stag = wqe->sqe.sge[0].lkey; + srx->ddp_to = wqe->sqe.sge[0].laddr; + frx->pbl_idx = 0; + } + /* Below checks extend beyond the semantics of DDP, and + * into RDMAP: + * We check if the read response matches exactly the + * read request which was send to the remote peer to + * trigger this read response. RFC5040/5041 do not + * always have a proper error code for the detected + * error cases. We choose 'base or bounds error' for + * cases where the inbound STag is valid, but offset + * or length do not match our response receive state. 
+ */ + if (unlikely(srx->ddp_stag != sink_stag)) { + pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n", + qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag); + ecode = DDP_ECODE_T_INVALID_STAG; + goto error; + } + if (unlikely(srx->ddp_to != sink_to)) { + pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n", + qp_id(rx_qp(srx)), (unsigned long long)sink_to, + (unsigned long long)srx->ddp_to); + ecode = DDP_ECODE_T_BASE_BOUNDS; + goto error; + } + if (unlikely(!frx->more_ddp_segs && + (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) { + pr_warn("siw: [QP %u]: rresp len: %d != %d\n", + qp_id(rx_qp(srx)), + wqe->processed + srx->fpdu_part_rem, wqe->bytes); + ecode = DDP_ECODE_T_BASE_BOUNDS; + goto error; + } + return 0; +error: + siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP, + DDP_ETYPE_TAGGED_BUF, ecode, 0); + return -EINVAL; +} + +/* + * siw_write_check_ntoh() + * + * Check incoming WRITE fragment header against expected + * header values and update expected values for potential next + * fragment + * + * NOTE: This function must be called only if a WRITE DDP segment + * starts but not for fragmented consecutive pieces of an + * already started DDP segment. + */ +static int siw_write_check_ntoh(struct siw_rx_stream *srx, + struct siw_rx_fpdu *frx) +{ + struct iwarp_rdma_write *write = &srx->hdr.rwrite; + enum ddp_ecode ecode; + + u32 sink_stag = be32_to_cpu(write->sink_stag); + u64 sink_to = be64_to_cpu(write->sink_to); + + if (frx->first_ddp_seg) { + srx->ddp_stag = sink_stag; + srx->ddp_to = sink_to; + frx->pbl_idx = 0; + } else { + if (unlikely(srx->ddp_stag != sink_stag)) { + pr_warn("siw: [QP %u]: write stag: %08x != %08x\n", + qp_id(rx_qp(srx)), sink_stag, + srx->ddp_stag); + ecode = DDP_ECODE_T_INVALID_STAG; + goto error; + } + if (unlikely(srx->ddp_to != sink_to)) { + pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n", + qp_id(rx_qp(srx)), + (unsigned long long)sink_to, + (unsigned long long)srx->ddp_to); + ecode = DDP_ECODE_T_BASE_BOUNDS; + goto error; + } + } + return 0; +error: + siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP, + DDP_ETYPE_TAGGED_BUF, ecode, 0); + return -EINVAL; +} + +/* + * siw_send_check_ntoh() + * + * Check incoming SEND fragment header against expected + * header values and update expected MSN if no next + * fragment expected + * + * NOTE: This function must be called only if a SEND DDP segment + * starts but not for fragmented consecutive pieces of an + * already started DDP segment. 
+ */ +static int siw_send_check_ntoh(struct siw_rx_stream *srx, + struct siw_rx_fpdu *frx) +{ + struct iwarp_send_inv *send = &srx->hdr.send_inv; + struct siw_wqe *wqe = &frx->wqe_active; + enum ddp_ecode ecode; + + u32 ddp_msn = be32_to_cpu(send->ddp_msn); + u32 ddp_mo = be32_to_cpu(send->ddp_mo); + u32 ddp_qn = be32_to_cpu(send->ddp_qn); + + if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) { + pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n", + qp_id(rx_qp(srx)), ddp_qn); + ecode = DDP_ECODE_UT_INVALID_QN; + goto error; + } + if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) { + pr_warn("siw: [QP %u]: send msn: %u != %u\n", + qp_id(rx_qp(srx)), ddp_msn, + srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]); + ecode = DDP_ECODE_UT_INVALID_MSN_RANGE; + goto error; + } + if (unlikely(ddp_mo != wqe->processed)) { + pr_warn("siw: [QP %u], send mo: %u != %u\n", + qp_id(rx_qp(srx)), ddp_mo, wqe->processed); + ecode = DDP_ECODE_UT_INVALID_MO; + goto error; + } + if (frx->first_ddp_seg) { + /* initialize user memory write position */ + frx->sge_idx = 0; + frx->sge_off = 0; + frx->pbl_idx = 0; + + /* only valid for SEND_INV and SEND_SE_INV operations */ + srx->inval_stag = be32_to_cpu(send->inval_stag); + } + if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) { + siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n", + wqe->bytes, wqe->processed, srx->fpdu_part_rem); + wqe->wc_status = SIW_WC_LOC_LEN_ERR; + ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF; + goto error; + } + return 0; +error: + siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP, + DDP_ETYPE_UNTAGGED_BUF, ecode, 0); + return -EINVAL; +} + +static struct siw_wqe *siw_rqe_get(struct siw_qp *qp) +{ + struct siw_rqe *rqe; + struct siw_srq *srq; + struct siw_wqe *wqe = NULL; + bool srq_event = false; + unsigned long flags; + + srq = qp->srq; + if (srq) { + spin_lock_irqsave(&srq->lock, flags); + if (unlikely(!srq->num_rqe)) + goto out; + + rqe = &srq->recvq[srq->rq_get % srq->num_rqe]; + } else { + if (unlikely(!qp->recvq)) + goto out; + + rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size]; + } + if (likely(rqe->flags == SIW_WQE_VALID)) { + int num_sge = rqe->num_sge; + + if (likely(num_sge <= SIW_MAX_SGE)) { + int i = 0; + + wqe = rx_wqe(&qp->rx_untagged); + rx_type(wqe) = SIW_OP_RECEIVE; + wqe->wr_status = SIW_WR_INPROGRESS; + wqe->bytes = 0; + wqe->processed = 0; + + wqe->rqe.id = rqe->id; + wqe->rqe.num_sge = num_sge; + + while (i < num_sge) { + wqe->rqe.sge[i].laddr = rqe->sge[i].laddr; + wqe->rqe.sge[i].lkey = rqe->sge[i].lkey; + wqe->rqe.sge[i].length = rqe->sge[i].length; + wqe->bytes += wqe->rqe.sge[i].length; + wqe->mem[i] = NULL; + i++; + } + /* can be re-used by appl */ + smp_store_mb(rqe->flags, 0); + } else { + siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge); + if (srq) + spin_unlock_irqrestore(&srq->lock, flags); + return NULL; + } + if (!srq) { + qp->rq_get++; + } else { + if (srq->armed) { + /* Test SRQ limit */ + u32 off = (srq->rq_get + srq->limit) % + srq->num_rqe; + struct siw_rqe *rqe2 = &srq->recvq[off]; + + if (!(rqe2->flags & SIW_WQE_VALID)) { + srq->armed = 0; + srq_event = true; + } + } + srq->rq_get++; + } + } +out: + if (srq) { + spin_unlock_irqrestore(&srq->lock, flags); + if (srq_event) + siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED); + } + return wqe; +} + +/* + * siw_proc_send: + * + * Process one incoming SEND and place data into memory referenced by + * receive wqe. 
+ * + * Function supports partially received sends (suspending/resuming + * current receive wqe processing) + * + * return value: + * 0: reached the end of a DDP segment + * -EAGAIN: to be called again to finish the DDP segment + */ +int siw_proc_send(struct siw_qp *qp) +{ + struct siw_rx_stream *srx = &qp->rx_stream; + struct siw_rx_fpdu *frx = &qp->rx_untagged; + struct siw_wqe *wqe; + u32 data_bytes; /* all data bytes available */ + u32 rcvd_bytes; /* sum of data bytes rcvd */ + int rv = 0; + + if (frx->first_ddp_seg) { + wqe = siw_rqe_get(qp); + if (unlikely(!wqe)) { + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_UNTAGGED_BUF, + DDP_ECODE_UT_INVALID_MSN_NOBUF, 0); + return -ENOENT; + } + } else { + wqe = rx_wqe(frx); + } + if (srx->state == SIW_GET_DATA_START) { + rv = siw_send_check_ntoh(srx, frx); + if (unlikely(rv)) { + siw_qp_event(qp, IB_EVENT_QP_FATAL); + return rv; + } + if (!srx->fpdu_part_rem) /* zero length SEND */ + return 0; + } + data_bytes = min(srx->fpdu_part_rem, srx->skb_new); + rcvd_bytes = 0; + + /* A zero length SEND will skip below loop */ + while (data_bytes) { + struct ib_pd *pd; + struct siw_mem **mem, *mem_p; + struct siw_sge *sge; + u32 sge_bytes; /* data bytes avail for SGE */ + + sge = &wqe->rqe.sge[frx->sge_idx]; + + if (!sge->length) { + /* just skip empty sge's */ + frx->sge_idx++; + frx->sge_off = 0; + frx->pbl_idx = 0; + continue; + } + sge_bytes = min(data_bytes, sge->length - frx->sge_off); + mem = &wqe->mem[frx->sge_idx]; + + /* + * check with QP's PD if no SRQ present, SRQ's PD otherwise + */ + pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd; + + rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE, + frx->sge_off, sge_bytes); + if (unlikely(rv)) { + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_CATASTROPHIC, + DDP_ECODE_CATASTROPHIC, 0); + + siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR); + break; + } + mem_p = *mem; + if (mem_p->mem_obj == NULL) + rv = siw_rx_kva(srx, + (void *)(sge->laddr + frx->sge_off), + sge_bytes); + else if (!mem_p->is_pbl) + rv = siw_rx_umem(srx, mem_p->umem, + sge->laddr + frx->sge_off, sge_bytes); + else + rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p, + sge->laddr + frx->sge_off, sge_bytes); + + if (unlikely(rv != sge_bytes)) { + wqe->processed += rcvd_bytes; + + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_CATASTROPHIC, + DDP_ECODE_CATASTROPHIC, 0); + return -EINVAL; + } + frx->sge_off += rv; + + if (frx->sge_off == sge->length) { + frx->sge_idx++; + frx->sge_off = 0; + frx->pbl_idx = 0; + } + data_bytes -= rv; + rcvd_bytes += rv; + + srx->fpdu_part_rem -= rv; + srx->fpdu_part_rcvd += rv; + } + wqe->processed += rcvd_bytes; + + if (!srx->fpdu_part_rem) + return 0; + + return (rv < 0) ? 
rv : -EAGAIN; +} + +/* + * siw_proc_write: + * + * Place incoming WRITE after referencing and checking target buffer + + * Function supports partially received WRITEs (suspending/resuming + * current receive processing) + * + * return value: + * 0: reached the end of a DDP segment + * -EAGAIN: to be called again to finish the DDP segment + */ +int siw_proc_write(struct siw_qp *qp) +{ + struct siw_rx_stream *srx = &qp->rx_stream; + struct siw_rx_fpdu *frx = &qp->rx_tagged; + struct siw_mem *mem; + int bytes, rv; + + if (srx->state == SIW_GET_DATA_START) { + if (!srx->fpdu_part_rem) /* zero length WRITE */ + return 0; + + rv = siw_write_check_ntoh(srx, frx); + if (unlikely(rv)) { + siw_qp_event(qp, IB_EVENT_QP_FATAL); + return rv; + } + } + bytes = min(srx->fpdu_part_rem, srx->skb_new); + + if (frx->first_ddp_seg) { + struct siw_wqe *wqe = rx_wqe(frx); + + rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8); + if (unlikely(!rx_mem(frx))) { + siw_dbg_qp(qp, + "sink stag not found/invalid, stag 0x%08x\n", + srx->ddp_stag); + + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_TAGGED_BUF, + DDP_ECODE_T_INVALID_STAG, 0); + return -EINVAL; + } + wqe->rqe.num_sge = 1; + rx_type(wqe) = SIW_OP_WRITE; + wqe->wr_status = SIW_WR_INPROGRESS; + } + mem = rx_mem(frx); + + /* + * Check if application re-registered memory with different + * key field of STag. + */ + if (unlikely(mem->stag != srx->ddp_stag)) { + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_TAGGED_BUF, + DDP_ECODE_T_INVALID_STAG, 0); + return -EINVAL; + } + rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd, + IB_ACCESS_REMOTE_WRITE, bytes); + if (unlikely(rv)) { + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv), + 0); + + siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR); + + return -EINVAL; + } + + if (mem->mem_obj == NULL) + rv = siw_rx_kva(srx, + (void *)(srx->ddp_to + srx->fpdu_part_rcvd), + bytes); + else if (!mem->is_pbl) + rv = siw_rx_umem(srx, mem->umem, + srx->ddp_to + srx->fpdu_part_rcvd, bytes); + else + rv = siw_rx_pbl(srx, &frx->pbl_idx, mem, + srx->ddp_to + srx->fpdu_part_rcvd, bytes); + + if (unlikely(rv != bytes)) { + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_CATASTROPHIC, + DDP_ECODE_CATASTROPHIC, 0); + return -EINVAL; + } + srx->fpdu_part_rem -= rv; + srx->fpdu_part_rcvd += rv; + + if (!srx->fpdu_part_rem) { + srx->ddp_to += srx->fpdu_part_rcvd; + return 0; + } + return -EAGAIN; +} + +/* + * Inbound RREQ's cannot carry user data. + */ +int siw_proc_rreq(struct siw_qp *qp) +{ + struct siw_rx_stream *srx = &qp->rx_stream; + + if (!srx->fpdu_part_rem) + return 0; + + pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp), + be16_to_cpu(srx->hdr.ctrl.mpa_len)); + + return -EPROTO; +} + +/* + * siw_init_rresp: + * + * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE. + * Put it at the tail of the IRQ, if there is another WQE currently in + * transmit processing. If not, make it the current WQE to be processed + * and schedule transmit processing. + * + * Can be called from softirq context and from process + * context (RREAD socket loopback case!) 
+ * + * return value: + * 0: success, + * failure code otherwise + */ + +static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx) +{ + struct siw_wqe *tx_work = tx_wqe(qp); + struct siw_sqe *resp; + + uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to), + laddr = be64_to_cpu(srx->hdr.rreq.source_to); + uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size), + lkey = be32_to_cpu(srx->hdr.rreq.source_stag), + rkey = be32_to_cpu(srx->hdr.rreq.sink_stag), + msn = be32_to_cpu(srx->hdr.rreq.ddp_msn); + + int run_sq = 1, rv = 0; + unsigned long flags; + + if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) { + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_UNTAGGED_BUF, + DDP_ECODE_UT_INVALID_MSN_RANGE, 0); + return -EPROTO; + } + spin_lock_irqsave(&qp->sq_lock, flags); + + if (tx_work->wr_status == SIW_WR_IDLE) { + /* + * immediately schedule READ response w/o + * consuming IRQ entry: IRQ must be empty. + */ + tx_work->processed = 0; + tx_work->mem[0] = NULL; + tx_work->wr_status = SIW_WR_QUEUED; + resp = &tx_work->sqe; + } else { + resp = irq_alloc_free(qp); + run_sq = 0; + } + if (likely(resp)) { + resp->opcode = SIW_OP_READ_RESPONSE; + + resp->sge[0].length = length; + resp->sge[0].laddr = laddr; + resp->sge[0].lkey = lkey; + + /* Keep aside message sequence number for potential + * error reporting during Read Response generation. + */ + resp->sge[1].length = msn; + + resp->raddr = raddr; + resp->rkey = rkey; + resp->num_sge = length ? 1 : 0; + + /* RRESP now valid as current TX wqe or placed into IRQ */ + smp_store_mb(resp->flags, SIW_WQE_VALID); + } else { + pr_warn("siw: [QP %u]: irq %d exceeded %d\n", qp_id(qp), + qp->irq_put % qp->attrs.irq_size, qp->attrs.irq_size); + + siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_REMOTE_OPERATION, + RDMAP_ECODE_CATASTROPHIC_STREAM, 0); + rv = -EPROTO; + } + + spin_unlock_irqrestore(&qp->sq_lock, flags); + + if (run_sq) + rv = siw_sq_start(qp); + + return rv; +} + +/* + * Only called at start of Read.Resonse processing. + * Transfer pending Read from tip of ORQ into currrent rx wqe, + * but keep ORQ entry valid until Read.Response processing done. + * No Queue locking needed. 
+ */ +static int siw_orqe_start_rx(struct siw_qp *qp) +{ + struct siw_sqe *orqe; + struct siw_wqe *wqe = NULL; + + /* make sure ORQ indices are current */ + smp_mb(); + + orqe = orq_get_current(qp); + if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) { + /* RRESP is a TAGGED RDMAP operation */ + wqe = rx_wqe(&qp->rx_tagged); + wqe->sqe.id = orqe->id; + wqe->sqe.opcode = orqe->opcode; + wqe->sqe.sge[0].laddr = orqe->sge[0].laddr; + wqe->sqe.sge[0].lkey = orqe->sge[0].lkey; + wqe->sqe.sge[0].length = orqe->sge[0].length; + wqe->sqe.flags = orqe->flags; + wqe->sqe.num_sge = 1; + wqe->bytes = orqe->sge[0].length; + wqe->processed = 0; + wqe->mem[0] = NULL; + /* make sure WQE is completely written before valid */ + smp_wmb(); + wqe->wr_status = SIW_WR_INPROGRESS; + + return 0; + } + return -EPROTO; +} + +/* + * siw_proc_rresp: + * + * Place incoming RRESP data into memory referenced by RREQ WQE + * which is at the tip of the ORQ + * + * Function supports partially received RRESP's (suspending/resuming + * current receive processing) + */ +int siw_proc_rresp(struct siw_qp *qp) +{ + struct siw_rx_stream *srx = &qp->rx_stream; + struct siw_rx_fpdu *frx = &qp->rx_tagged; + struct siw_wqe *wqe = rx_wqe(frx); + struct siw_mem **mem, *mem_p; + struct siw_sge *sge; + int bytes, rv; + + if (frx->first_ddp_seg) { + if (unlikely(wqe->wr_status != SIW_WR_IDLE)) { + pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n", + qp_id(qp), wqe->wr_status, wqe->sqe.opcode); + rv = -EPROTO; + goto error_term; + } + /* + * fetch pending RREQ from orq + */ + rv = siw_orqe_start_rx(qp); + if (rv) { + pr_warn("siw: [QP %u]: ORQ empty at idx %d\n", + qp_id(qp), qp->orq_get % qp->attrs.orq_size); + goto error_term; + } + rv = siw_rresp_check_ntoh(srx, frx); + if (unlikely(rv)) { + siw_qp_event(qp, IB_EVENT_QP_FATAL); + return rv; + } + } else { + if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) { + pr_warn("siw: [QP %u]: resume RRESP: status %d\n", + qp_id(qp), wqe->wr_status); + rv = -EPROTO; + goto error_term; + } + } + if (!srx->fpdu_part_rem) /* zero length RRESPONSE */ + return 0; + + sge = wqe->sqe.sge; /* there is only one */ + mem = &wqe->mem[0]; + + if (!(*mem)) { + /* + * check target memory which resolves memory on first fragment + */ + rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0, + wqe->bytes); + if (unlikely(rv)) { + siw_dbg_qp(qp, "target mem check: %d\n", rv); + wqe->wc_status = SIW_WC_LOC_PROT_ERR; + + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_TAGGED_BUF, + siw_tagged_error(-rv), 0); + + siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR); + + return -EINVAL; + } + } + mem_p = *mem; + + bytes = min(srx->fpdu_part_rem, srx->skb_new); + + if (mem_p->mem_obj == NULL) + rv = siw_rx_kva(srx, (void *)(sge->laddr + wqe->processed), + bytes); + else if (!mem_p->is_pbl) + rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed, + bytes); + else + rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p, + sge->laddr + wqe->processed, bytes); + if (rv != bytes) { + wqe->wc_status = SIW_WC_GENERAL_ERR; + rv = -EINVAL; + goto error_term; + } + srx->fpdu_part_rem -= rv; + srx->fpdu_part_rcvd += rv; + wqe->processed += rv; + + if (!srx->fpdu_part_rem) { + srx->ddp_to += srx->fpdu_part_rcvd; + return 0; + } + return -EAGAIN; + +error_term: + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC, + DDP_ECODE_CATASTROPHIC, 0); + return rv; +} + +int siw_proc_terminate(struct siw_qp *qp) +{ + struct siw_rx_stream *srx = &qp->rx_stream; + struct sk_buff *skb = srx->skb; + struct iwarp_terminate *term 
= &srx->hdr.terminate; + union iwarp_hdr term_info; + u8 *infop = (u8 *)&term_info; + enum rdma_opcode op; + u16 to_copy = sizeof(struct iwarp_ctrl); + + pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n", + __rdmap_term_layer(term), __rdmap_term_etype(term), + __rdmap_term_ecode(term)); + + if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE || + be32_to_cpu(term->ddp_msn) != + qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] || + be32_to_cpu(term->ddp_mo) != 0) { + pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n", + be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn), + be32_to_cpu(term->ddp_mo)); + return -ECONNRESET; + } + /* + * Receive remaining pieces of TERM if indicated + */ + if (!term->flag_m) + return -ECONNRESET; + + /* Do not take the effort to reassemble a network fragmented + * TERM message + */ + if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged)) + return -ECONNRESET; + + memset(infop, 0, sizeof(term_info)); + + skb_copy_bits(skb, srx->skb_offset, infop, to_copy); + + op = __rdmap_get_opcode(&term_info.ctrl); + if (op >= RDMAP_TERMINATE) + goto out; + + infop += to_copy; + srx->skb_offset += to_copy; + srx->skb_new -= to_copy; + srx->skb_copied += to_copy; + srx->fpdu_part_rcvd += to_copy; + srx->fpdu_part_rem -= to_copy; + + to_copy = iwarp_pktinfo[op].hdr_len - to_copy; + + /* Again, no network fragmented TERM's */ + if (to_copy + MPA_CRC_SIZE > srx->skb_new) + return -ECONNRESET; + + skb_copy_bits(skb, srx->skb_offset, infop, to_copy); + + if (term->flag_r) { + siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n", + op, be16_to_cpu(term_info.ctrl.mpa_len), + term->flag_m ? "valid" : "invalid"); + } else if (term->flag_d) { + siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n", + op, be16_to_cpu(term_info.ctrl.mpa_len), + term->flag_m ? "valid" : "invalid"); + } +out: + srx->skb_new -= to_copy; + srx->skb_offset += to_copy; + srx->skb_copied += to_copy; + srx->fpdu_part_rcvd += to_copy; + srx->fpdu_part_rem -= to_copy; + + return -ECONNRESET; +} + +static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx) +{ + struct sk_buff *skb = srx->skb; + u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad; + __wsum crc_in, crc_own = 0; + + siw_dbg_qp(qp, "expected %d, available %d, pad %u\n", + srx->fpdu_part_rem, srx->skb_new, srx->pad); + + if (srx->skb_new < srx->fpdu_part_rem) + return -EAGAIN; + + skb_copy_bits(skb, srx->skb_offset, tbuf, srx->fpdu_part_rem); + + if (srx->mpa_crc_hd && srx->pad) + crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad); + + srx->skb_new -= srx->fpdu_part_rem; + srx->skb_offset += srx->fpdu_part_rem; + srx->skb_copied += srx->fpdu_part_rem; + + if (!srx->mpa_crc_hd) + return 0; + + /* + * CRC32 is computed, transmitted and received directly in NBO, + * so there's never a reason to convert byte order. + */ + crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own); + crc_in = (__force __wsum)srx->trailer.crc; + + if (unlikely(crc_in != crc_own)) { + pr_warn("siw: crc error. 
in: %08x, own %08x, op %u\n", + crc_in, crc_own, qp->rx_stream.rdmap_op); + + siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, + LLP_ETYPE_MPA, + LLP_ECODE_RECEIVED_CRC, 0); + return -EINVAL; + } + return 0; +} + +#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged) + +static int siw_get_hdr(struct siw_rx_stream *srx) +{ + struct sk_buff *skb = srx->skb; + struct siw_qp *qp = rx_qp(srx); + struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl; + struct siw_rx_fpdu *frx; + u8 opcode; + int bytes; + + if (srx->fpdu_part_rcvd < MIN_DDP_HDR) { + /* + * copy a mimimum sized (tagged) DDP frame control part + */ + bytes = min_t(int, srx->skb_new, + MIN_DDP_HDR - srx->fpdu_part_rcvd); + + skb_copy_bits(skb, srx->skb_offset, + (char *)c_hdr + srx->fpdu_part_rcvd, bytes); + + srx->fpdu_part_rcvd += bytes; + + srx->skb_new -= bytes; + srx->skb_offset += bytes; + srx->skb_copied += bytes; + + if (srx->fpdu_part_rcvd < MIN_DDP_HDR) + return -EAGAIN; + + if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) { + enum ddp_etype etype; + enum ddp_ecode ecode; + + pr_warn("siw: received ddp version unsupported %d\n", + __ddp_get_version(c_hdr)); + + if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) { + etype = DDP_ETYPE_TAGGED_BUF; + ecode = DDP_ECODE_T_VERSION; + } else { + etype = DDP_ETYPE_UNTAGGED_BUF; + ecode = DDP_ECODE_UT_VERSION; + } + siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP, + etype, ecode, 0); + return -EINVAL; + } + if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) { + pr_warn("siw: received rdmap version unsupported %d\n", + __rdmap_get_version(c_hdr)); + + siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_REMOTE_OPERATION, + RDMAP_ECODE_VERSION, 0); + return -EINVAL; + } + opcode = __rdmap_get_opcode(c_hdr); + + if (opcode > RDMAP_TERMINATE) { + pr_warn("siw: received unknown packet type %u\n", + opcode); + + siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_REMOTE_OPERATION, + RDMAP_ECODE_OPCODE, 0); + return -EINVAL; + } + siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode); + } else { + opcode = __rdmap_get_opcode(c_hdr); + } + set_rx_fpdu_context(qp, opcode); + frx = qp->rx_fpdu; + + /* + * Figure out len of current hdr: variable length of + * iwarp hdr may force us to copy hdr information in + * two steps. Only tagged DDP messages are already + * completely received. + */ + if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) { + bytes = iwarp_pktinfo[opcode].hdr_len - MIN_DDP_HDR; + + if (srx->skb_new < bytes) + return -EAGAIN; + + skb_copy_bits(skb, srx->skb_offset, + (char *)c_hdr + srx->fpdu_part_rcvd, bytes); + + srx->fpdu_part_rcvd += bytes; + + srx->skb_new -= bytes; + srx->skb_offset += bytes; + srx->skb_copied += bytes; + } + + /* + * DDP/RDMAP header receive completed. Check if the current + * DDP segment starts a new RDMAP message or continues a previously + * started RDMAP message. + * + * Alternating reception of DDP segments (or FPDUs) from incomplete + * tagged and untagged RDMAP messages is supported, as long as + * the current tagged or untagged message gets eventually completed + * w/o intersection from another message of the same type + * (tagged/untagged). E.g., a WRITE can get intersected by a SEND, + * but not by a READ RESPONSE etc. 
+ */ + if (srx->mpa_crc_hd) { + /* + * Restart CRC computation + */ + crypto_shash_init(srx->mpa_crc_hd); + crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr, + srx->fpdu_part_rcvd); + } + if (frx->more_ddp_segs) { + frx->first_ddp_seg = 0; + if (frx->prev_rdmap_op != opcode) { + pr_warn("siw: packet intersection: %u : %u\n", + frx->prev_rdmap_op, opcode); + /* + * The last inbound RDMA operation of same type + * (tagged or untagged) is left unfinished. + * To complete it in error, make it the current + * operation again, even with the header already + * overwritten. For error handling, only the opcode + * and current rx context are relevant. + */ + set_rx_fpdu_context(qp, frx->prev_rdmap_op); + __rdmap_set_opcode(c_hdr, frx->prev_rdmap_op); + return -EPROTO; + } + } else { + frx->prev_rdmap_op = opcode; + frx->first_ddp_seg = 1; + } + frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1; + + return 0; +} + +static int siw_check_tx_fence(struct siw_qp *qp) +{ + struct siw_wqe *tx_waiting = tx_wqe(qp); + struct siw_sqe *rreq; + int resume_tx = 0, rv = 0; + unsigned long flags; + + spin_lock_irqsave(&qp->orq_lock, flags); + + rreq = orq_get_current(qp); + + /* free current orq entry */ + WRITE_ONCE(rreq->flags, 0); + + if (qp->tx_ctx.orq_fence) { + if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) { + pr_warn("siw: [QP %u]: fence resume: bad status %d\n", + qp_id(qp), tx_waiting->wr_status); + rv = -EPROTO; + goto out; + } + /* resume SQ processing */ + if (tx_waiting->sqe.opcode == SIW_OP_READ || + tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) { + rreq = orq_get_tail(qp); + if (unlikely(!rreq)) { + pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp)); + rv = -EPROTO; + goto out; + } + siw_read_to_orq(rreq, &tx_waiting->sqe); + + qp->orq_put++; + qp->tx_ctx.orq_fence = 0; + resume_tx = 1; + + } else if (siw_orq_empty(qp)) { + qp->tx_ctx.orq_fence = 0; + resume_tx = 1; + } else { + pr_warn("siw: [QP %u]: fence resume: orq idx: %d:%d\n", + qp_id(qp), qp->orq_get, qp->orq_put); + rv = -EPROTO; + } + } + qp->orq_get++; +out: + spin_unlock_irqrestore(&qp->orq_lock, flags); + + if (resume_tx) + rv = siw_sq_start(qp); + + return rv; +} + +/* + * siw_rdmap_complete() + * + * Complete processing of an RDMA message after receiving all + * DDP segmens or ABort processing after encountering error case. + * + * o SENDs + RRESPs will need for completion, + * o RREQs need for READ RESPONSE initialization + * o WRITEs need memory dereferencing + * + * TODO: Failed WRITEs need local error to be surfaced. + */ +static int siw_rdmap_complete(struct siw_qp *qp, int error) +{ + struct siw_rx_stream *srx = &qp->rx_stream; + struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu); + enum siw_wc_status wc_status = wqe->wc_status; + u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl); + int rv = 0; + + switch (opcode) { + case RDMAP_SEND_SE: + case RDMAP_SEND_SE_INVAL: + wqe->rqe.flags |= SIW_WQE_SOLICITED; + case RDMAP_SEND: + case RDMAP_SEND_INVAL: + if (wqe->wr_status == SIW_WR_IDLE) + break; + + srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++; + + if (error != 0 && wc_status == SIW_WC_SUCCESS) + wc_status = SIW_WC_GENERAL_ERR; + /* + * Handle STag invalidation request + */ + if (wc_status == SIW_WC_SUCCESS && + (opcode == RDMAP_SEND_INVAL || + opcode == RDMAP_SEND_SE_INVAL)) { + rv = siw_invalidate_stag(qp->pd, srx->inval_stag); + if (rv) { + siw_init_terminate( + qp, TERM_ERROR_LAYER_RDMAP, + rv == -EACCES ? 
+ RDMAP_ETYPE_REMOTE_PROTECTION : + RDMAP_ETYPE_REMOTE_OPERATION, + RDMAP_ECODE_CANNOT_INVALIDATE, 0); + + wc_status = SIW_WC_REM_INV_REQ_ERR; + } + rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed, + rv ? 0 : srx->inval_stag, + wc_status); + } else { + rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed, + 0, wc_status); + } + siw_wqe_put_mem(wqe, SIW_OP_RECEIVE); + break; + + case RDMAP_RDMA_READ_RESP: + if (wqe->wr_status == SIW_WR_IDLE) + break; + + if (error != 0) { + if ((srx->state == SIW_GET_HDR && + qp->rx_fpdu->first_ddp_seg) || error == -ENODATA) + /* possible RREQ in ORQ left untouched */ + break; + + if (wc_status == SIW_WC_SUCCESS) + wc_status = SIW_WC_GENERAL_ERR; + } else if (qp->kernel_verbs && + rx_type(wqe) == SIW_OP_READ_LOCAL_INV) { + /* + * Handle any STag invalidation request + */ + rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey); + if (rv) { + siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_CATASTROPHIC, + RDMAP_ECODE_UNSPECIFIED, 0); + + if (wc_status == SIW_WC_SUCCESS) { + wc_status = SIW_WC_GENERAL_ERR; + error = rv; + } + } + } + /* + * All errors turn the wqe into signalled. + */ + if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0) + rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed, + wc_status); + siw_wqe_put_mem(wqe, SIW_OP_READ); + + if (!error) + rv = siw_check_tx_fence(qp); + else + /* Disable current ORQ eleement */ + WRITE_ONCE(orq_get_current(qp)->flags, 0); + break; + + case RDMAP_RDMA_READ_REQ: + if (!error) { + rv = siw_init_rresp(qp, srx); + srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++; + } + break; + + case RDMAP_RDMA_WRITE: + if (wqe->wr_status == SIW_WR_IDLE) + break; + + /* + * Free References from memory object if + * attached to receive context (inbound WRITE). + * While a zero-length WRITE is allowed, + * no memory reference got created. + */ + if (rx_mem(&qp->rx_tagged)) { + siw_mem_put(rx_mem(&qp->rx_tagged)); + rx_mem(&qp->rx_tagged) = NULL; + } + break; + + default: + break; + } + wqe->wr_status = SIW_WR_IDLE; + + return rv; +} + +/* + * siw_tcp_rx_data() + * + * Main routine to consume inbound TCP payload + * + * @rd_desc: read descriptor + * @skb: socket buffer + * @off: offset in skb + * @len: skb->len - offset : payload in skb + */ +int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int off, size_t len) +{ + struct siw_qp *qp = rd_desc->arg.data; + struct siw_rx_stream *srx = &qp->rx_stream; + int rv; + + srx->skb = skb; + srx->skb_new = skb->len - off; + srx->skb_offset = off; + srx->skb_copied = 0; + + siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new); + + while (srx->skb_new) { + int run_completion = 1; + + if (unlikely(srx->rx_suspend)) { + /* Do not process any more data */ + srx->skb_copied += srx->skb_new; + break; + } + switch (srx->state) { + case SIW_GET_HDR: + rv = siw_get_hdr(srx); + if (!rv) { + srx->fpdu_part_rem = + be16_to_cpu(srx->hdr.ctrl.mpa_len) - + srx->fpdu_part_rcvd + MPA_HDR_SIZE; + + if (srx->fpdu_part_rem) + srx->pad = -srx->fpdu_part_rem & 0x3; + else + srx->pad = 0; + + srx->state = SIW_GET_DATA_START; + srx->fpdu_part_rcvd = 0; + } + break; + + case SIW_GET_DATA_MORE: + /* + * Another data fragment of the same DDP segment. + * Setting first_ddp_seg = 0 avoids repeating + * initializations that shall occur only once per + * DDP segment. + */ + qp->rx_fpdu->first_ddp_seg = 0; + /* Fall through */ + + case SIW_GET_DATA_START: + /* + * Headers will be checked by the opcode-specific + * data receive function below. 
+ */ + rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp); + if (!rv) { + int mpa_len = + be16_to_cpu(srx->hdr.ctrl.mpa_len) + + MPA_HDR_SIZE; + + srx->fpdu_part_rem = (-mpa_len & 0x3) + + MPA_CRC_SIZE; + srx->fpdu_part_rcvd = 0; + srx->state = SIW_GET_TRAILER; + } else { + if (unlikely(rv == -ECONNRESET)) + run_completion = 0; + else + srx->state = SIW_GET_DATA_MORE; + } + break; + + case SIW_GET_TRAILER: + /* + * read CRC + any padding + */ + rv = siw_get_trailer(qp, srx); + if (likely(!rv)) { + /* + * FPDU completed. + * complete RDMAP message if last fragment + */ + srx->state = SIW_GET_HDR; + srx->fpdu_part_rcvd = 0; + + if (!(srx->hdr.ctrl.ddp_rdmap_ctrl & + DDP_FLAG_LAST)) + /* more frags */ + break; + + rv = siw_rdmap_complete(qp, 0); + run_completion = 0; + } + break; + + default: + pr_warn("QP[%u]: RX out of state\n", qp_id(qp)); + rv = -EPROTO; + run_completion = 0; + } + if (unlikely(rv != 0 && rv != -EAGAIN)) { + if ((srx->state > SIW_GET_HDR || + qp->rx_fpdu->more_ddp_segs) && run_completion) + siw_rdmap_complete(qp, rv); + + siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv, + srx->state); + + siw_qp_cm_drop(qp, 1); + + break; + } + if (rv) { + siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n", + srx->state, srx->fpdu_part_rem); + break; + } + } + return srx->skb_copied; +} -- cgit v1.2.3 From b0fff7317bb4325ace221a24c4bfa274f0046ee4 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:32 +0200 Subject: rdma/siw: completion queue methods Broken up commit to add the Soft iWarp RDMA driver. Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_cq.c | 101 +++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_cq.c (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c new file mode 100644 index 000000000000..e2a0ee40d5b5 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_cq.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include +#include + +#include + +#include "siw.h" + +static int map_wc_opcode[SIW_NUM_OPCODES] = { + [SIW_OP_WRITE] = IB_WC_RDMA_WRITE, + [SIW_OP_SEND] = IB_WC_SEND, + [SIW_OP_SEND_WITH_IMM] = IB_WC_SEND, + [SIW_OP_READ] = IB_WC_RDMA_READ, + [SIW_OP_READ_LOCAL_INV] = IB_WC_RDMA_READ, + [SIW_OP_COMP_AND_SWAP] = IB_WC_COMP_SWAP, + [SIW_OP_FETCH_AND_ADD] = IB_WC_FETCH_ADD, + [SIW_OP_INVAL_STAG] = IB_WC_LOCAL_INV, + [SIW_OP_REG_MR] = IB_WC_REG_MR, + [SIW_OP_RECEIVE] = IB_WC_RECV, + [SIW_OP_READ_RESPONSE] = -1 /* not used */ +}; + +static struct { + enum siw_opcode siw; + enum ib_wc_status ib; +} map_cqe_status[SIW_NUM_WC_STATUS] = { + { SIW_WC_SUCCESS, IB_WC_SUCCESS }, + { SIW_WC_LOC_LEN_ERR, IB_WC_LOC_LEN_ERR }, + { SIW_WC_LOC_PROT_ERR, IB_WC_LOC_PROT_ERR }, + { SIW_WC_LOC_QP_OP_ERR, IB_WC_LOC_QP_OP_ERR }, + { SIW_WC_WR_FLUSH_ERR, IB_WC_WR_FLUSH_ERR }, + { SIW_WC_BAD_RESP_ERR, IB_WC_BAD_RESP_ERR }, + { SIW_WC_LOC_ACCESS_ERR, IB_WC_LOC_ACCESS_ERR }, + { SIW_WC_REM_ACCESS_ERR, IB_WC_REM_ACCESS_ERR }, + { SIW_WC_REM_INV_REQ_ERR, IB_WC_REM_INV_REQ_ERR }, + { SIW_WC_GENERAL_ERR, IB_WC_GENERAL_ERR } +}; + +/* + * Reap one CQE from the CQ. Only used by kernel clients + * during CQ normal operation. Might be called during CQ + * flush for user mapped CQE array as well. 
+ */ +int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc) +{ + struct siw_cqe *cqe; + unsigned long flags; + + spin_lock_irqsave(&cq->lock, flags); + + cqe = &cq->queue[cq->cq_get % cq->num_cqe]; + if (READ_ONCE(cqe->flags) & SIW_WQE_VALID) { + memset(wc, 0, sizeof(*wc)); + wc->wr_id = cqe->id; + wc->status = map_cqe_status[cqe->status].ib; + wc->opcode = map_wc_opcode[cqe->opcode]; + wc->byte_len = cqe->bytes; + + /* + * During CQ flush, also user land CQE's may get + * reaped here, which do not hold a QP reference + * and do not qualify for memory extension verbs. + */ + if (likely(cq->kernel_verbs)) { + if (cqe->flags & SIW_WQE_REM_INVAL) { + wc->ex.invalidate_rkey = cqe->inval_stag; + wc->wc_flags = IB_WC_WITH_INVALIDATE; + } + wc->qp = cqe->base_qp; + siw_dbg_cq(cq, "idx %u, type %d, flags %2x, id 0x%p\n", + cq->cq_get % cq->num_cqe, cqe->opcode, + cqe->flags, (void *)cqe->id); + } + WRITE_ONCE(cqe->flags, 0); + cq->cq_get++; + + spin_unlock_irqrestore(&cq->lock, flags); + + return 1; + } + spin_unlock_irqrestore(&cq->lock, flags); + + return 0; +} + +/* + * siw_cq_flush() + * + * Flush all CQ elements. + */ +void siw_cq_flush(struct siw_cq *cq) +{ + struct ib_wc wc; + + while (siw_reap_cqe(cq, &wc)) + ; +} -- cgit v1.2.3 From c0cf5bdde46c664d583518addc19d6dabb6a8ec9 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Thu, 20 Jun 2019 18:21:33 +0200 Subject: rdma/siw: addition to kernel build environment Broken up commit to add the Soft iWarp RDMA driver. Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- MAINTAINERS | 7 +++++++ drivers/infiniband/Kconfig | 1 + drivers/infiniband/sw/Makefile | 1 + drivers/infiniband/sw/siw/Kconfig | 17 +++++++++++++++++ drivers/infiniband/sw/siw/Makefile | 11 +++++++++++ 5 files changed, 37 insertions(+) create mode 100644 drivers/infiniband/sw/siw/Kconfig create mode 100644 drivers/infiniband/sw/siw/Makefile (limited to 'drivers/infiniband/sw') diff --git a/MAINTAINERS b/MAINTAINERS index 83a62b911692..6d2de0c1520e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14558,6 +14558,13 @@ M: Chris Boot S: Maintained F: drivers/leds/leds-net48xx.c +SOFT-IWARP DRIVER (siw) +M: Bernard Metzler +L: linux-rdma@vger.kernel.org +S: Supported +F: drivers/infiniband/sw/siw/ +F: include/uapi/rdma/siw-abi.h + SOFT-ROCE DRIVER (rxe) M: Moni Shoua L: linux-rdma@vger.kernel.org diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 42af4cd40ba2..f277cb7aea29 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -96,6 +96,7 @@ source "drivers/infiniband/hw/hfi1/Kconfig" source "drivers/infiniband/hw/qedr/Kconfig" source "drivers/infiniband/sw/rdmavt/Kconfig" source "drivers/infiniband/sw/rxe/Kconfig" +source "drivers/infiniband/sw/siw/Kconfig" endif source "drivers/infiniband/ulp/ipoib/Kconfig" diff --git a/drivers/infiniband/sw/Makefile b/drivers/infiniband/sw/Makefile index ab48a9b60844..68e0230f8f31 100644 --- a/drivers/infiniband/sw/Makefile +++ b/drivers/infiniband/sw/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_INFINIBAND_RDMAVT) += rdmavt/ obj-$(CONFIG_RDMA_RXE) += rxe/ +obj-$(CONFIG_RDMA_SIW) += siw/ diff --git a/drivers/infiniband/sw/siw/Kconfig b/drivers/infiniband/sw/siw/Kconfig new file mode 100644 index 000000000000..94f684174ce3 --- /dev/null +++ b/drivers/infiniband/sw/siw/Kconfig @@ -0,0 +1,17 @@ +config RDMA_SIW + tristate "Software RDMA over TCP/IP (iWARP) driver" + depends on INET && INFINIBAND && CRYPTO_CRC32 + help + This driver implements the iWARP RDMA 
transport over + the Linux TCP/IP network stack. It enables a system with a + standard Ethernet adapter to interoperate with a iWARP + adapter or with another system running the SIW driver. + (See also RXE which is a similar software driver for RoCE.) + + The driver interfaces with the Linux RDMA stack and + implements both a kernel and user space RDMA verbs API. + The user space verbs API requires a support + library named libsiw which is loaded by the generic user + space verbs API, libibverbs. To implement RDMA over + TCP/IP, the driver further interfaces with the Linux + in-kernel TCP socket layer. diff --git a/drivers/infiniband/sw/siw/Makefile b/drivers/infiniband/sw/siw/Makefile new file mode 100644 index 000000000000..f5f7e3867889 --- /dev/null +++ b/drivers/infiniband/sw/siw/Makefile @@ -0,0 +1,11 @@ +obj-$(CONFIG_RDMA_SIW) += siw.o + +siw-y := \ + siw_cm.o \ + siw_cq.o \ + siw_main.o \ + siw_mem.o \ + siw_qp.o \ + siw_qp_tx.o \ + siw_qp_rx.o \ + siw_verbs.o -- cgit v1.2.3 From d3e5397169175628696e92191dfc0e86d8e48db9 Mon Sep 17 00:00:00 2001 From: Maksym Planeta Date: Tue, 2 Jul 2019 15:49:28 +0200 Subject: ibverbs/rxe: Remove variable self-initialization In some cases (not in this particular one) variable self-initialization can lead to undefined behavior. In this case, it is just obscure code. Signed-off-by: Maksym Planeta Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 00eb99d3df86..116cafc9afcf 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -558,7 +558,7 @@ int rxe_completer(void *arg) { struct rxe_qp *qp = (struct rxe_qp *)arg; struct rxe_dev *rxe = to_rdev(qp->ibqp.device); - struct rxe_send_wqe *wqe = wqe; + struct rxe_send_wqe *wqe = NULL; struct sk_buff *skb = NULL; struct rxe_pkt_info *pkt = NULL; enum comp_state state; -- cgit v1.2.3 From 4c7d6dcd364843e408a60952ba914bb72bafc6cc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 8 Jul 2019 11:36:32 -0300 Subject: RDMA/siw: Fix DEFINE_PER_CPU compilation when ARCH_NEEDS_WEAK_PER_CPU The initializer for the variable cannot be inside the macro (and zero initialization isn't needed anyhow). 
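A minimal sketch of the declaration pattern the fix moves to, using a made-up symbol name rather than the siw one: DEFINE_PER_CPU() takes only the type and a kernel-wide unique name, the initializer cannot ride along inside the macro argument, and static per-CPU storage is zero-initialized anyway, so dropping ATOMIC_INIT(0) loses nothing.

#include <linux/percpu.h>
#include <linux/atomic.h>

/* illustrative name only; the driver itself uses siw_use_cnt */
static DEFINE_PER_CPU(atomic_t, example_use_cnt);

static void example_account_cpu(int cpu)
{
	/* per_cpu() picks the instance belonging to @cpu */
	atomic_inc(&per_cpu(example_use_cnt, cpu));
}

The compiler output quoted next is what the old in-macro initializer produced.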
include/linux/percpu-defs.h:92:33: warning: '__pcpu_unique_use_cnt' initialized and declared 'extern' extern __PCPU_DUMMY_ATTRS char __pcpu_unique_##name; \ ^~~~~~~~~~~~~~ include/linux/percpu-defs.h:115:2: note: in expansion of macro 'DEFINE_PER_CPU_SECTION' DEFINE_PER_CPU_SECTION(type, name, "") ^~~~~~~~~~~~~~~~~~~~~~ drivers/infiniband/sw/siw/siw_main.c:129:8: note: in expansion of macro 'DEFINE_PER_CPU' static DEFINE_PER_CPU(atomic_t, use_cnt = ATOMIC_INIT(0)); ^~~~~~~~~~~~~~ Also the rules for PER_CPU require the variable names to be globally unique, so prefix them with siw_ Fixes: b9be6f18cf9e ("rdma/siw: transmit path") Fixes: bdcf26bf9b3a ("rdma/siw: network and RDMA core interface") Reported-by: Stephen Rothwell Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_main.c | 8 ++++---- drivers/infiniband/sw/siw/siw_qp_tx.c | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index 3f5f3d27ebe5..fd2552a9091d 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -126,7 +126,7 @@ static int siw_dev_qualified(struct net_device *netdev) return 0; } -static DEFINE_PER_CPU(atomic_t, use_cnt = ATOMIC_INIT(0)); +static DEFINE_PER_CPU(atomic_t, siw_use_cnt); static struct { struct cpumask **tx_valid_cpus; @@ -215,7 +215,7 @@ int siw_get_tx_cpu(struct siw_device *sdev) if (!siw_tx_thread[cpu]) continue; - usage = atomic_read(&per_cpu(use_cnt, cpu)); + usage = atomic_read(&per_cpu(siw_use_cnt, cpu)); if (usage <= min_use) { tx_cpu = cpu; min_use = usage; @@ -226,7 +226,7 @@ int siw_get_tx_cpu(struct siw_device *sdev) out: if (tx_cpu >= 0) - atomic_inc(&per_cpu(use_cnt, tx_cpu)); + atomic_inc(&per_cpu(siw_use_cnt, tx_cpu)); else pr_warn("siw: no tx cpu found\n"); @@ -235,7 +235,7 @@ out: void siw_put_tx_cpu(int cpu) { - atomic_dec(&per_cpu(use_cnt, cpu)); + atomic_dec(&per_cpu(siw_use_cnt, cpu)); } static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id) diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c index 5e926fac51db..1c9fa8fa96e5 100644 --- a/drivers/infiniband/sw/siw/siw_qp_tx.c +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c @@ -1183,12 +1183,12 @@ struct tx_task_t { wait_queue_head_t waiting; }; -static DEFINE_PER_CPU(struct tx_task_t, tx_task_g); +static DEFINE_PER_CPU(struct tx_task_t, siw_tx_task_g); void siw_stop_tx_thread(int nr_cpu) { kthread_stop(siw_tx_thread[nr_cpu]); - wake_up(&per_cpu(tx_task_g, nr_cpu).waiting); + wake_up(&per_cpu(siw_tx_task_g, nr_cpu).waiting); } int siw_run_sq(void *data) @@ -1196,7 +1196,7 @@ int siw_run_sq(void *data) const int nr_cpu = (unsigned int)(long)data; struct llist_node *active; struct siw_qp *qp; - struct tx_task_t *tx_task = &per_cpu(tx_task_g, nr_cpu); + struct tx_task_t *tx_task = &per_cpu(siw_tx_task_g, nr_cpu); init_llist_head(&tx_task->active); init_waitqueue_head(&tx_task->waiting); @@ -1261,9 +1261,9 @@ int siw_sq_start(struct siw_qp *qp) } siw_qp_get(qp); - llist_add(&qp->tx_list, &per_cpu(tx_task_g, qp->tx_cpu).active); + llist_add(&qp->tx_list, &per_cpu(siw_tx_task_g, qp->tx_cpu).active); - wake_up(&per_cpu(tx_task_g, qp->tx_cpu).waiting); + wake_up(&per_cpu(siw_tx_task_g, qp->tx_cpu).waiting); return 0; } -- cgit v1.2.3 From f10ff380fd7dfba4a36d40f8dd00fe17da8a1a10 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 8 Jul 2019 12:17:48 -0300 Subject: RDMA/rvt: Do not use a kernel 
header in the ABI rvt was using ib_sge as part of it's ABI, which is not allowed. Introduce a new struct with the same layout and use it instead. Fixes: dabac6e460ce ("IB/hfi1: Move receive work queue struct into uapi directory") Reported-by: Stephen Rothwell Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/qp.c | 32 +++++++++++++++++++++++++++----- include/uapi/rdma/rvt-abi.h | 9 +++++++-- 2 files changed, 34 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 11b4d3c1efd4..0b0a241c57ff 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1847,8 +1847,11 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head); wqe->wr_id = wr->wr_id; wqe->num_sge = wr->num_sge; - for (i = 0; i < wr->num_sge; i++) - wqe->sg_list[i] = wr->sg_list[i]; + for (i = 0; i < wr->num_sge; i++) { + wqe->sg_list[i].addr = wr->sg_list[i].addr; + wqe->sg_list[i].length = wr->sg_list[i].length; + wqe->sg_list[i].lkey = wr->sg_list[i].lkey; + } /* * Make sure queue entry is written * before the head index. @@ -2250,8 +2253,11 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head); wqe->wr_id = wr->wr_id; wqe->num_sge = wr->num_sge; - for (i = 0; i < wr->num_sge; i++) - wqe->sg_list[i] = wr->sg_list[i]; + for (i = 0; i < wr->num_sge; i++) { + wqe->sg_list[i].addr = wr->sg_list[i].addr; + wqe->sg_list[i].length = wr->sg_list[i].length; + wqe->sg_list[i].lkey = wr->sg_list[i].lkey; + } /* Make sure queue entry is written before the head index. */ smp_store_release(&wq->head, next); spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags); @@ -2259,6 +2265,22 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, return 0; } +/* + * rvt used the internal kernel struct as part of its ABI, for now make sure + * the kernel struct does not change layout. FIXME: rvt should never cast the + * user struct to a kernel struct. + */ +static struct ib_sge *rvt_cast_sge(struct rvt_wqe_sge *sge) +{ + BUILD_BUG_ON(offsetof(struct ib_sge, addr) != + offsetof(struct rvt_wqe_sge, addr)); + BUILD_BUG_ON(offsetof(struct ib_sge, length) != + offsetof(struct rvt_wqe_sge, length)); + BUILD_BUG_ON(offsetof(struct ib_sge, lkey) != + offsetof(struct rvt_wqe_sge, lkey)); + return (struct ib_sge *)sge; +} + /* * Validate a RWQE and fill in the SGE state. * Return 1 if OK. @@ -2282,7 +2304,7 @@ static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) continue; /* Check LKEY */ ret = rvt_lkey_ok(rkt, pd, j ? 
&ss->sg_list[j - 1] : &ss->sge, - NULL, &wqe->sg_list[i], + NULL, rvt_cast_sge(&wqe->sg_list[i]), IB_ACCESS_LOCAL_WRITE); if (unlikely(ret <= 0)) goto bad_lkey; diff --git a/include/uapi/rdma/rvt-abi.h b/include/uapi/rdma/rvt-abi.h index d2e35d24f1a9..7328293c715c 100644 --- a/include/uapi/rdma/rvt-abi.h +++ b/include/uapi/rdma/rvt-abi.h @@ -10,11 +10,16 @@ #include #include -#include #ifndef RDMA_ATOMIC_UAPI #define RDMA_ATOMIC_UAPI(_type, _name) struct{ _type val; } _name #endif +struct rvt_wqe_sge { + __aligned_u64 addr; + __u32 length; + __u32 lkey; +}; + /* * This structure is used to contain the head pointer, tail pointer, * and completion queue entries as a single memory allocation so @@ -39,7 +44,7 @@ struct rvt_rwqe { __u64 wr_id; __u8 num_sge; __u8 padding[7]; - struct ib_sge sg_list[]; + struct rvt_wqe_sge sg_list[]; }; /* -- cgit v1.2.3 From bdce1290493caa3f8119f24b5dacc3fb7ca27389 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Thu, 27 Jun 2019 16:06:43 +0200 Subject: RDMA/rxe: Fill in wc byte_len with IB_WC_RECV_RDMA_WITH_IMM Calculate the correct byte_len on the receiving side when a work completion is generated with IB_WC_RECV_RDMA_WITH_IMM opcode. According to the IBA byte_len must indicate the number of written bytes, whereas it was always equal to zero for the IB_WC_RECV_RDMA_WITH_IMM opcode, even though data was transferred. Fixes: 8700e3e7c485 ("Soft RoCE driver") Signed-off-by: Konstantin Taranov Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_resp.c | 5 ++++- drivers/infiniband/sw/rxe/rxe_verbs.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index aca9f60f9b21..1cbfbd98eb22 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -431,6 +431,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp, qp->resp.va = reth_va(pkt); qp->resp.rkey = reth_rkey(pkt); qp->resp.resid = reth_len(pkt); + qp->resp.length = reth_len(pkt); } access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ : IB_ACCESS_REMOTE_WRITE; @@ -856,7 +857,9 @@ static enum resp_states do_complete(struct rxe_qp *qp, pkt->mask & RXE_WRITE_MASK) ? IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV; wc->vendor_err = 0; - wc->byte_len = wqe->dma.length - wqe->dma.resid; + wc->byte_len = (pkt->mask & RXE_IMMDT_MASK && + pkt->mask & RXE_WRITE_MASK) ? 
+ qp->resp.length : wqe->dma.length - wqe->dma.resid; /* fields after byte_len are different between kernel and user * space diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 6c997d39a418..5c4b2239129c 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -213,6 +213,7 @@ struct rxe_resp_info { struct rxe_mem *mr; u32 resid; u32 rkey; + u32 length; u64 atomic_orig; /* SRQ only */ -- cgit v1.2.3 From 4d2b8517ba1f3aba9a952ebf153ec972a127c80c Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 9 Jul 2019 16:05:53 -0700 Subject: IB/rdmavt: Fix variable shadowing issue in rvt_create_cq clang warns: drivers/infiniband/sw/rdmavt/cq.c:260:7: warning: variable 'err' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized] if (err) ^~~ drivers/infiniband/sw/rdmavt/cq.c:310:9: note: uninitialized use occurs here return err; ^~~ drivers/infiniband/sw/rdmavt/cq.c:260:3: note: remove the 'if' if its condition is always false if (err) ^~~~~~~~ drivers/infiniband/sw/rdmavt/cq.c:253:7: warning: variable 'err' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized] if (!cq->ip) { ^~~~~~~ drivers/infiniband/sw/rdmavt/cq.c:310:9: note: uninitialized use occurs here return err; ^~~ drivers/infiniband/sw/rdmavt/cq.c:253:3: note: remove the 'if' if its condition is always false if (!cq->ip) { ^~~~~~~~~~~~~~ drivers/infiniband/sw/rdmavt/cq.c:211:9: note: initialize the variable 'err' to silence this warning int err; ^ = 0 2 warnings generated. The function scoped err variable is uninitialized when the flow jumps into the if statement. The if scoped err variable shadows the function scoped err variable, preventing the err assignments within the if statement to be reflected at the function level, which will cause uninitialized use when the goto statements are taken. Just remove the if scoped err declaration so that there is only one copy of the err variable for this function. Fixes: 239b0e52d8aa ("IB/hfi1: Move rvt_cq_wc struct into uapi directory") Link: https://github.com/ClangBuiltLinux/linux/issues/594 Reviewed-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Acked-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/cq.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index fac87b13329d..a85571a4cf57 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -247,8 +247,6 @@ int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, * See rvt_mmap() for details. */ if (udata && udata->outlen >= sizeof(__u64)) { - int err; - cq->ip = rvt_create_mmap_info(rdi, sz, udata, u_wc); if (!cq->ip) { err = -ENOMEM; -- cgit v1.2.3 From 85de5d53366f02dd0f81c9be8f435b27fb82b1f7 Mon Sep 17 00:00:00 2001 From: Bernard Metzler Date: Wed, 10 Jul 2019 08:38:00 +0000 Subject: RDMA/siw: Remove unnecessary kthread create/destroy printouts There is already a warning if we cannot start any thread, and stopping those threads is not worth spamming the console. 
This also corrects a warning from gcc: drivers/infiniband/sw/siw/siw_main.c: In function 'siw_create_tx_threads': drivers/infiniband/sw/siw/siw_main.c:91:11: warning: variable 'rv' set but not used [-Wunused-but-set-variable] Reported-by: Hulk Robot Signed-off-by: YueHaibing Signed-off-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_main.c | 4 +--- drivers/infiniband/sw/siw/siw_qp_tx.c | 4 ---- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index fd2552a9091d..f55c4e80aea4 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -88,7 +88,7 @@ static void siw_device_cleanup(struct ib_device *base_dev) static int siw_create_tx_threads(void) { - int cpu, rv, assigned = 0; + int cpu, assigned = 0; for_each_online_cpu(cpu) { /* Skip HT cores */ @@ -99,9 +99,7 @@ static int siw_create_tx_threads(void) kthread_create(siw_run_sq, (unsigned long *)(long)cpu, "siw_tx/%d", cpu); if (IS_ERR(siw_tx_thread[cpu])) { - rv = PTR_ERR(siw_tx_thread[cpu]); siw_tx_thread[cpu] = NULL; - pr_info("Creating TX thread for CPU %d failed", cpu); continue; } kthread_bind(siw_tx_thread[cpu], cpu); diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c index 1c9fa8fa96e5..f0d949e2e318 100644 --- a/drivers/infiniband/sw/siw/siw_qp_tx.c +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c @@ -1201,8 +1201,6 @@ int siw_run_sq(void *data) init_llist_head(&tx_task->active); init_waitqueue_head(&tx_task->waiting); - pr_info("Started siw TX thread on CPU %u\n", nr_cpu); - while (1) { struct llist_node *fifo_list = NULL; @@ -1240,8 +1238,6 @@ int siw_run_sq(void *data) siw_sq_resume(qp); } } - pr_info("Stopped siw TX thread on CPU %u\n", nr_cpu); - return 0; } -- cgit v1.2.3 From 775a41e281cf08b1d9e0f2dd89c062430b289a10 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 10 Jul 2019 10:48:00 -0700 Subject: rdma/siw: Use proper enumerated type in map_cqe_status clang warns several times: drivers/infiniband/sw/siw/siw_cq.c:31:4: warning: implicit conversion from enumeration type 'enum siw_wc_status' to different enumeration type 'enum siw_opcode' [-Wenum-conversion] { SIW_WC_SUCCESS, IB_WC_SUCCESS }, ~ ^~~~~~~~~~~~~~ Fixes: b0fff7317bb4 ("rdma/siw: completion queue methods") Link: https://github.com/ClangBuiltLinux/linux/issues/596 Signed-off-by: Nathan Chancellor Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_cq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c index e2a0ee40d5b5..e381ae9b7d62 100644 --- a/drivers/infiniband/sw/siw/siw_cq.c +++ b/drivers/infiniband/sw/siw/siw_cq.c @@ -25,7 +25,7 @@ static int map_wc_opcode[SIW_NUM_OPCODES] = { }; static struct { - enum siw_opcode siw; + enum siw_wc_status siw; enum ib_wc_status ib; } map_cqe_status[SIW_NUM_WC_STATUS] = { { SIW_WC_SUCCESS, IB_WC_SUCCESS }, -- cgit v1.2.3 From c421651fa2295d1219c36674c7eb8c574542ceea Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Jul 2019 11:29:42 -0300 Subject: RDMA/siw: Add missing rtnl_lock around access to ifa ifa is protected by rcu or rtnl, add the missing locking. In this case we have to use rtnl since siw_listen_address() is sleeping. 
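As a short sketch of the pattern this fix introduces (the device lookup and the per-address work are stubbed out here), the IPv4 address list is walked under rtnl_lock() because the loop body may sleep, which rules out the RCU variant:

#include <linux/inetdevice.h>
#include <linux/rtnetlink.h>

static void example_walk_ipv4_addrs(struct in_device *in_dev)
{
	rtnl_lock();
	for_ifa(in_dev) {
		/* per-address work that may sleep, e.g. binding a listener */
		pr_debug("considering address %pI4\n", &ifa->ifa_address);
	}
	endfor_ifa(in_dev);
	rtnl_unlock();
}

The hunk below wraps the existing for_ifa()/endfor_ifa() walk in exactly this lock/unlock pair.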
Fixes: 6c52fdc244b5 ("rdma/siw: connection management") Reviewed-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_cm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 8e618cb7261f..c25be723c15b 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -1975,6 +1975,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) id, &s_laddr.sin_addr, ntohs(s_laddr.sin_port), &s_raddr->sin_addr, ntohs(s_raddr->sin_port)); + rtnl_lock(); for_ifa(in_dev) { if (ipv4_is_zeronet(s_laddr.sin_addr.s_addr) || @@ -1989,6 +1990,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) } } endfor_ifa(in_dev); + rtnl_unlock(); in_dev_put(in_dev); } else if (id->local_addr.ss_family == AF_INET6) { struct inet6_dev *in6_dev = in6_dev_get(dev); -- cgit v1.2.3 From b45305d777f2f9209dae5a3b8249ca03166a4df3 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 10 Jul 2019 15:39:30 +0200 Subject: rdma/siw: Add missing dependencies on LIBCRC32C and DMA_VIRT_OPS If LIBCRC32C and DMA_VIRT_OPS are not enabled: drivers/infiniband/sw/siw/siw_main.o: In function `siw_newlink': siw_main.c:(.text+0x35c): undefined reference to `dma_virt_ops' drivers/infiniband/sw/siw/siw_qp_rx.o: In function `siw_csum_update': siw_qp_rx.c:(.text+0x16): undefined reference to `crc32c' Fix the first issue by adding a select of DMA_VIRT_OPS. Fix the second issue by replacing the unneeded dependency on CRYPTO_CRC32 by a dependency on LIBCRC32C. Reported-by: noreply@ellerman.id.au (first issue) Fixes: c0cf5bdde46c ("rdma/siw: addition to kernel build environment") Signed-off-by: Geert Uytterhoeven Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/Kconfig b/drivers/infiniband/sw/siw/Kconfig index 94f684174ce3..b622fc62f2cd 100644 --- a/drivers/infiniband/sw/siw/Kconfig +++ b/drivers/infiniband/sw/siw/Kconfig @@ -1,6 +1,7 @@ config RDMA_SIW tristate "Software RDMA over TCP/IP (iWARP) driver" - depends on INET && INFINIBAND && CRYPTO_CRC32 + depends on INET && INFINIBAND && LIBCRC32C + select DMA_VIRT_OPS help This driver implements the iWARP RDMA transport over the Linux TCP/IP network stack. 
It enables a system with a -- cgit v1.2.3 From 855085d9686e107b77a1bdb935224a238a9fd8b9 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 11 Jul 2019 15:12:13 +0800 Subject: rdma/siw: Remove set but not used variable 's' Fixes gcc '-Wunused-but-set-variable' warning: drivers/infiniband/sw/siw/siw_cm.c: In function siw_cm_llp_state_change: drivers/infiniband/sw/siw/siw_cm.c:1278:17: warning: variable s set but not used [-Wunused-but-set-variable] Fixes: 6c52fdc244b5 ("rdma/siw: connection management") Reported-by: Hulk Robot Signed-off-by: YueHaibing Reviewed-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_cm.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index c25be723c15b..43f7f12e5f7f 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -1275,7 +1275,6 @@ static void siw_cm_llp_error_report(struct sock *sk) static void siw_cm_llp_state_change(struct sock *sk) { struct siw_cep *cep; - struct socket *s; void (*orig_state_change)(struct sock *s); read_lock(&sk->sk_callback_lock); @@ -1288,8 +1287,6 @@ static void siw_cm_llp_state_change(struct sock *sk) } orig_state_change = cep->sk_state_change; - s = sk->sk_socket; - siw_dbg_cep(cep, "state: %d\n", cep->state); switch (sk->sk_state) { -- cgit v1.2.3 From cea743f2ea814d3d54dfab667b68271d4f4e5fdf Mon Sep 17 00:00:00 2001 From: Gustavo A. R. Silva Date: Thu, 11 Jul 2019 11:12:18 -0500 Subject: RDMA/siw: Mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. This patch fixes the following warnings: drivers/infiniband/sw/siw/siw_qp_rx.c: In function ‘siw_rdmap_complete’: drivers/infiniband/sw/siw/siw_qp_rx.c:1214:18: warning: this statement may fall through [-Wimplicit-fallthrough=] wqe->rqe.flags |= SIW_WQE_SOLICITED; ~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~ drivers/infiniband/sw/siw/siw_qp_rx.c:1215:2: note: here case RDMAP_SEND: ^~~~ drivers/infiniband/sw/siw/siw_qp_tx.c: In function ‘siw_qp_sq_process’: drivers/infiniband/sw/siw/siw_qp_tx.c:1044:4: warning: this statement may fall through [-Wimplicit-fallthrough=] siw_wqe_put_mem(wqe, tx_type); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/infiniband/sw/siw/siw_qp_tx.c:1045:3: note: here case SIW_OP_INVAL_STAG: ^~~~ drivers/infiniband/sw/siw/siw_qp_tx.c:1128:4: warning: this statement may fall through [-Wimplicit-fallthrough=] siw_wqe_put_mem(wqe, tx_type); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/infiniband/sw/siw/siw_qp_tx.c:1129:3: note: here case SIW_OP_INVAL_STAG: ^~~~ Warning level 3 was used: -Wimplicit-fallthrough=3 This patch is part of the ongoing efforts to enable -Wimplicit-fallthrough. Signed-off-by: Gustavo A. R. 
Silva Reviewed-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_qp_rx.c | 2 ++ drivers/infiniband/sw/siw/siw_qp_tx.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c index 682a290bc11e..f87657a11657 100644 --- a/drivers/infiniband/sw/siw/siw_qp_rx.c +++ b/drivers/infiniband/sw/siw/siw_qp_rx.c @@ -1212,6 +1212,8 @@ static int siw_rdmap_complete(struct siw_qp *qp, int error) case RDMAP_SEND_SE: case RDMAP_SEND_SE_INVAL: wqe->rqe.flags |= SIW_WQE_SOLICITED; + /* Fall through */ + case RDMAP_SEND: case RDMAP_SEND_INVAL: if (wqe->wr_status == SIW_WR_IDLE) diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c index f0d949e2e318..43020d2040fc 100644 --- a/drivers/infiniband/sw/siw/siw_qp_tx.c +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c @@ -1042,6 +1042,8 @@ next_wqe: case SIW_OP_SEND_REMOTE_INV: case SIW_OP_WRITE: siw_wqe_put_mem(wqe, tx_type); + /* Fall through */ + case SIW_OP_INVAL_STAG: case SIW_OP_REG_MR: if (tx_flags(wqe) & SIW_WQE_SIGNALLED) @@ -1126,6 +1128,8 @@ next_wqe: case SIW_OP_READ: case SIW_OP_READ_LOCAL_INV: siw_wqe_put_mem(wqe, tx_type); + /* Fall through */ + case SIW_OP_INVAL_STAG: case SIW_OP_REG_MR: siw_sqe_complete(qp, &wqe->sqe, wqe->bytes, -- cgit v1.2.3 From 0b043644c0ca601cb19943a81aa1f1455dbe9461 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 12 Jul 2019 12:12:06 -0300 Subject: RMDA/siw: Require a 64 bit arch The new siw driver fails to build on i386 with drivers/infiniband/sw/siw/siw_qp.c:1025:3: error: invalid output size for constraint '+q' smp_store_mb(*cq->notify, SIW_NOTIFY_NOT); As it is using 64 bit values with the smp_store_mb. Since the entire scheme here seems questionable, and we are in the merge window, fix the compile failures by disabling 32 bit support on this driver. A proper fix will be reviewed post merge window. Fixes: c0cf5bdde46c ("rdma/siw: addition to kernel build environment") Reported-by: Arnd Bergmann Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/sw') diff --git a/drivers/infiniband/sw/siw/Kconfig b/drivers/infiniband/sw/siw/Kconfig index b622fc62f2cd..dace276aea14 100644 --- a/drivers/infiniband/sw/siw/Kconfig +++ b/drivers/infiniband/sw/siw/Kconfig @@ -1,6 +1,6 @@ config RDMA_SIW tristate "Software RDMA over TCP/IP (iWARP) driver" - depends on INET && INFINIBAND && LIBCRC32C + depends on INET && INFINIBAND && LIBCRC32C && 64BIT select DMA_VIRT_OPS help This driver implements the iWARP RDMA transport over -- cgit v1.2.3
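To make the build failure above concrete, here is a reduced, purely illustrative sketch of the kind of construct that cannot be compiled on i386 (the struct and names are invented, not siw's): the '+q' constraint error comes from the xchg-based smp_store_mb() implementation on 32-bit x86, which cannot take an 8-byte operand, and restricting the driver to 64BIT sidesteps that without reworking the notification word.

#include <linux/types.h>
#include <asm/barrier.h>

struct example_cq_ctrl {
	u64 notify;	/* 64-bit flag word shared with user space */
};

static void example_disarm(struct example_cq_ctrl *ctrl)
{
	/*
	 * Builds fine on 64-bit architectures; on i386 the
	 * xchg-backed smp_store_mb() rejects the 8-byte operand.
	 */
	smp_store_mb(ctrl->notify, 0);
}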