diff options
author | Linus Torvalds | 2018-01-31 12:05:10 -0800 |
---|---|---|
committer | Linus Torvalds | 2018-01-31 12:05:10 -0800 |
commit | 7b1cd95d65eb3b1e13f8a90eb757e0ea232c7899 (patch) | |
tree | cbc3ec5d45b04666c24f7c0b1df04a85d29c7d0f /drivers | |
parent | 2155e69a9d9acd42488ef994a4e1ff535438c128 (diff) | |
parent | e7996a9a77fc669387da43ff4823b91cc4872bd0 (diff) |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull RDMA subsystem updates from Jason Gunthorpe:
"Overall this cycle did not have any major excitement, and did not
require any shared branch with netdev.
Lots of driver updates, particularly of the scale-up and performance
variety. The largest body of core work was Parav's patches fixing and
restructing some of the core code to make way for future RDMA
containerization.
Summary:
- misc small driver fixups to
bnxt_re/hfi1/qib/hns/ocrdma/rdmavt/vmw_pvrdma/nes
- several major feature adds to bnxt_re driver: SRIOV VF RoCE
support, HugePages support, extended hardware stats support, and
SRQ support
- a notable number of fixes to the i40iw driver from debugging scale
up testing
- more work to enable the new hip08 chip in the hns driver
- misc small ULP fixups to srp/srpt//ipoib
- preparation for srp initiator and target to support the RDMA-CM
protocol for connections
- add RDMA-CM support to srp initiator, srp target is still a WIP
- fixes for a couple of places where ipoib could spam the dmesg log
- fix encode/decode of FDR/EDR data rates in the core
- many patches from Parav with ongoing work to clean up
inconsistencies and bugs in RoCE support around the rdma_cm
- mlx5 driver support for the userspace features 'thread domain',
'wallclock timestamps' and 'DV Direct Connected transport'. Support
for the firmware dual port rocee capability
- core support for more than 32 rdma devices in the char dev
allocation
- kernel doc updates from Randy Dunlap
- new netlink uAPI for inspecting RDMA objects similar in spirit to 'ss'
- one minor change to the kobject code acked by Greg KH"
* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (259 commits)
RDMA/nldev: Provide detailed QP information
RDMA/nldev: Provide global resource utilization
RDMA/core: Add resource tracking for create and destroy PDs
RDMA/core: Add resource tracking for create and destroy CQs
RDMA/core: Add resource tracking for create and destroy QPs
RDMA/restrack: Add general infrastructure to track RDMA resources
RDMA/core: Save kernel caller name when creating PD and CQ objects
RDMA/core: Use the MODNAME instead of the function name for pd callers
RDMA: Move enum ib_cq_creation_flags to uapi headers
IB/rxe: Change RDMA_RXE kconfig to use select
IB/qib: remove qib_keys.c
IB/mthca: remove mthca_user.h
RDMA/cm: Fix access to uninitialized variable
RDMA/cma: Use existing netif_is_bond_master function
IB/core: Avoid SGID attributes query while converting GID from OPA to IB
RDMA/mlx5: Avoid memory leak in case of XRCD dealloc failure
IB/umad: Fix use of unprotected device pointer
IB/iser: Combine substrings for three messages
IB/iser: Delete an unnecessary variable initialisation in iser_send_data_out()
IB/iser: Delete an error message for a failed memory allocation in iser_send_data_out()
...
Diffstat (limited to 'drivers')
164 files changed, 9682 insertions, 3905 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 504b926552c6..f69833db0a32 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -12,7 +12,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ multicast.o mad.o smi.o agent.o mad_rmpp.o \ - security.o nldev.o + security.o nldev.o restrack.o ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index f4e8185bccd3..a5b4cf030c11 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -243,8 +243,7 @@ void rdma_copy_addr(struct rdma_dev_addr *dev_addr, EXPORT_SYMBOL(rdma_copy_addr); int rdma_translate_ip(const struct sockaddr *addr, - struct rdma_dev_addr *dev_addr, - u16 *vlan_id) + struct rdma_dev_addr *dev_addr) { struct net_device *dev; @@ -266,9 +265,6 @@ int rdma_translate_ip(const struct sockaddr *addr, return -EADDRNOTAVAIL; rdma_copy_addr(dev_addr, dev, NULL); - dev_addr->bound_dev_if = dev->ifindex; - if (vlan_id) - *vlan_id = rdma_vlan_dev_vlan_id(dev); dev_put(dev); break; #if IS_ENABLED(CONFIG_IPV6) @@ -279,9 +275,6 @@ int rdma_translate_ip(const struct sockaddr *addr, &((const struct sockaddr_in6 *)addr)->sin6_addr, dev, 1)) { rdma_copy_addr(dev_addr, dev, NULL); - dev_addr->bound_dev_if = dev->ifindex; - if (vlan_id) - *vlan_id = rdma_vlan_dev_vlan_id(dev); break; } } @@ -481,7 +474,7 @@ static int addr_resolve_neigh(struct dst_entry *dst, if (dst->dev->flags & IFF_LOOPBACK) { int ret; - ret = rdma_translate_ip(dst_in, addr, NULL); + ret = rdma_translate_ip(dst_in, addr); if (!ret) memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); @@ -558,7 +551,7 @@ static int addr_resolve(struct sockaddr *src_in, } if (ndev->flags & IFF_LOOPBACK) { - ret = rdma_translate_ip(dst_in, addr, NULL); + ret = rdma_translate_ip(dst_in, addr); /* * Put the loopback device and get the translated * device instead. @@ -744,7 +737,6 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) EXPORT_SYMBOL(rdma_addr_cancel); struct resolve_cb_context { - struct rdma_dev_addr *addr; struct completion comp; int status; }; @@ -752,39 +744,31 @@ struct resolve_cb_context { static void resolve_cb(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context) { - if (!status) - memcpy(((struct resolve_cb_context *)context)->addr, - addr, sizeof(struct rdma_dev_addr)); ((struct resolve_cb_context *)context)->status = status; complete(&((struct resolve_cb_context *)context)->comp); } int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *dmac, u16 *vlan_id, int *if_index, + u8 *dmac, const struct net_device *ndev, int *hoplimit) { - int ret = 0; struct rdma_dev_addr dev_addr; struct resolve_cb_context ctx; - struct net_device *dev; - union { struct sockaddr _sockaddr; struct sockaddr_in _sockaddr_in; struct sockaddr_in6 _sockaddr_in6; } sgid_addr, dgid_addr; - + int ret; rdma_gid2ip(&sgid_addr._sockaddr, sgid); rdma_gid2ip(&dgid_addr._sockaddr, dgid); memset(&dev_addr, 0, sizeof(dev_addr)); - if (if_index) - dev_addr.bound_dev_if = *if_index; + dev_addr.bound_dev_if = ndev->ifindex; dev_addr.net = &init_net; - ctx.addr = &dev_addr; init_completion(&ctx.comp); ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, &dev_addr, 1000, resolve_cb, &ctx); @@ -798,42 +782,9 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, return ret; memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); - dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if); - if (!dev) - return -ENODEV; - if (if_index) - *if_index = dev_addr.bound_dev_if; - if (vlan_id) - *vlan_id = rdma_vlan_dev_vlan_id(dev); - if (hoplimit) - *hoplimit = dev_addr.hoplimit; - dev_put(dev); - return ret; -} -EXPORT_SYMBOL(rdma_addr_find_l2_eth_by_grh); - -int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id) -{ - int ret = 0; - struct rdma_dev_addr dev_addr; - union { - struct sockaddr _sockaddr; - struct sockaddr_in _sockaddr_in; - struct sockaddr_in6 _sockaddr_in6; - } gid_addr; - - rdma_gid2ip(&gid_addr._sockaddr, sgid); - - memset(&dev_addr, 0, sizeof(dev_addr)); - dev_addr.net = &init_net; - ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id); - if (ret) - return ret; - - memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN); - return ret; + *hoplimit = dev_addr.hoplimit; + return 0; } -EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid); static int netevent_callback(struct notifier_block *self, unsigned long event, void *ctx) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 77515638c55c..e9a409d7f4e2 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -573,27 +573,24 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, struct ib_gid_attr attr; if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) - goto next; + continue; if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid))) - goto next; + continue; memcpy(&attr, &table->data_vec[i].attr, sizeof(attr)); - if (filter(gid, &attr, context)) + if (filter(gid, &attr, context)) { found = true; - -next: - if (found) + if (index) + *index = i; break; + } } read_unlock_irqrestore(&table->rwlock, flags); if (!found) return -ENOENT; - - if (index) - *index = i; return 0; } @@ -824,12 +821,7 @@ static int gid_table_setup_one(struct ib_device *ib_dev) if (err) return err; - err = roce_rescan_device(ib_dev); - - if (err) { - gid_table_cleanup_one(ib_dev); - gid_table_release_one(ib_dev); - } + rdma_roce_rescan_device(ib_dev); return err; } @@ -883,7 +875,6 @@ int ib_find_gid_by_filter(struct ib_device *device, port_num, filter, context, index); } -EXPORT_SYMBOL(ib_find_gid_by_filter); int ib_get_cached_pkey(struct ib_device *device, u8 port_num, diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index f6b159d79977..e6749157fd86 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -452,13 +452,14 @@ static void cm_set_private_data(struct cm_id_private *cm_id_priv, cm_id_priv->private_data_len = private_data_len; } -static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, - struct ib_grh *grh, struct cm_av *av) +static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, + struct ib_grh *grh, struct cm_av *av) { av->port = port; av->pkey_index = wc->pkey_index; - ib_init_ah_from_wc(port->cm_dev->ib_device, port->port_num, wc, - grh, &av->ah_attr); + return ib_init_ah_attr_from_wc(port->cm_dev->ib_device, + port->port_num, wc, + grh, &av->ah_attr); } static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, @@ -494,8 +495,11 @@ static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, return ret; av->port = port; - ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path, - &av->ah_attr); + ret = ib_init_ah_attr_from_path(cm_dev->ib_device, port->port_num, path, + &av->ah_attr); + if (ret) + return ret; + av->timeout = path->packet_life_time + 1; spin_lock_irqsave(&cm.lock, flags); @@ -1560,6 +1564,35 @@ static u16 cm_get_bth_pkey(struct cm_work *work) return pkey; } +/** + * Convert OPA SGID to IB SGID + * ULPs (such as IPoIB) do not understand OPA GIDs and will + * reject them as the local_gid will not match the sgid. Therefore, + * change the pathrec's SGID to an IB SGID. + * + * @work: Work completion + * @path: Path record + */ +static void cm_opa_to_ib_sgid(struct cm_work *work, + struct sa_path_rec *path) +{ + struct ib_device *dev = work->port->cm_dev->ib_device; + u8 port_num = work->port->port_num; + + if (rdma_cap_opa_ah(dev, port_num) && + (ib_is_opa_gid(&path->sgid))) { + union ib_gid sgid; + + if (ib_get_cached_gid(dev, port_num, 0, &sgid, NULL)) { + dev_warn(&dev->dev, + "Error updating sgid in CM request\n"); + return; + } + + path->sgid = sgid; + } +} + static void cm_format_req_event(struct cm_work *work, struct cm_id_private *cm_id_priv, struct ib_cm_id *listen_id) @@ -1573,10 +1606,13 @@ static void cm_format_req_event(struct cm_work *work, param->bth_pkey = cm_get_bth_pkey(work); param->port = cm_id_priv->av.port->port_num; param->primary_path = &work->path[0]; - if (cm_req_has_alt_path(req_msg)) + cm_opa_to_ib_sgid(work, param->primary_path); + if (cm_req_has_alt_path(req_msg)) { param->alternate_path = &work->path[1]; - else + cm_opa_to_ib_sgid(work, param->alternate_path); + } else { param->alternate_path = NULL; + } param->remote_ca_guid = req_msg->local_ca_guid; param->remote_qkey = be32_to_cpu(req_msg->local_qkey); param->remote_qpn = be32_to_cpu(cm_req_get_local_qpn(req_msg)); @@ -1826,9 +1862,11 @@ static int cm_req_handler(struct cm_work *work) cm_id_priv = container_of(cm_id, struct cm_id_private, id); cm_id_priv->id.remote_id = req_msg->local_comm_id; - cm_init_av_for_response(work->port, work->mad_recv_wc->wc, - work->mad_recv_wc->recv_buf.grh, - &cm_id_priv->av); + ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + if (ret) + goto destroy; cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> id.local_id); if (IS_ERR(cm_id_priv->timewait_info)) { @@ -1841,9 +1879,10 @@ static int cm_req_handler(struct cm_work *work) listen_cm_id_priv = cm_match_req(work, cm_id_priv); if (!listen_cm_id_priv) { + pr_debug("%s: local_id %d, no listen_cm_id_priv\n", __func__, + be32_to_cpu(cm_id->local_id)); ret = -EINVAL; - kfree(cm_id_priv->timewait_info); - goto destroy; + goto free_timeinfo; } cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler; @@ -1861,56 +1900,50 @@ static int cm_req_handler(struct cm_work *work) work->port->port_num, grh->sgid_index, &gid, &gid_attr); - if (!ret) { - if (gid_attr.ndev) { - work->path[0].rec_type = - sa_conv_gid_to_pathrec_type(gid_attr.gid_type); - sa_path_set_ifindex(&work->path[0], - gid_attr.ndev->ifindex); - sa_path_set_ndev(&work->path[0], - dev_net(gid_attr.ndev)); - dev_put(gid_attr.ndev); - } else { - cm_path_set_rec_type(work->port->cm_dev->ib_device, - work->port->port_num, - &work->path[0], - &req_msg->primary_local_gid); - } - if (cm_req_has_alt_path(req_msg)) - work->path[1].rec_type = work->path[0].rec_type; - cm_format_paths_from_req(req_msg, &work->path[0], - &work->path[1]); - if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) - sa_path_set_dmac(&work->path[0], - cm_id_priv->av.ah_attr.roce.dmac); - work->path[0].hop_limit = grh->hop_limit; - ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av, - cm_id_priv); + if (ret) { + ib_send_cm_rej(cm_id, IB_CM_REJ_UNSUPPORTED, NULL, 0, NULL, 0); + goto rejected; + } + + if (gid_attr.ndev) { + work->path[0].rec_type = + sa_conv_gid_to_pathrec_type(gid_attr.gid_type); + sa_path_set_ifindex(&work->path[0], + gid_attr.ndev->ifindex); + sa_path_set_ndev(&work->path[0], + dev_net(gid_attr.ndev)); + dev_put(gid_attr.ndev); + } else { + cm_path_set_rec_type(work->port->cm_dev->ib_device, + work->port->port_num, + &work->path[0], + &req_msg->primary_local_gid); } + if (cm_req_has_alt_path(req_msg)) + work->path[1].rec_type = work->path[0].rec_type; + cm_format_paths_from_req(req_msg, &work->path[0], + &work->path[1]); + if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) + sa_path_set_dmac(&work->path[0], + cm_id_priv->av.ah_attr.roce.dmac); + work->path[0].hop_limit = grh->hop_limit; + ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av, + cm_id_priv); if (ret) { - int err = ib_get_cached_gid(work->port->cm_dev->ib_device, - work->port->port_num, 0, - &work->path[0].sgid, - &gid_attr); - if (!err && gid_attr.ndev) { - work->path[0].rec_type = - sa_conv_gid_to_pathrec_type(gid_attr.gid_type); - sa_path_set_ifindex(&work->path[0], - gid_attr.ndev->ifindex); - sa_path_set_ndev(&work->path[0], - dev_net(gid_attr.ndev)); - dev_put(gid_attr.ndev); - } else { - cm_path_set_rec_type(work->port->cm_dev->ib_device, - work->port->port_num, - &work->path[0], - &req_msg->primary_local_gid); - } - if (cm_req_has_alt_path(req_msg)) - work->path[1].rec_type = work->path[0].rec_type; - ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, - &work->path[0].sgid, sizeof work->path[0].sgid, - NULL, 0); + int err; + + err = ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, 0, + &work->path[0].sgid, + NULL); + if (err) + ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, + NULL, 0, NULL, 0); + else + ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, + &work->path[0].sgid, + sizeof(work->path[0].sgid), + NULL, 0); goto rejected; } if (cm_req_has_alt_path(req_msg)) { @@ -1919,7 +1952,7 @@ static int cm_req_handler(struct cm_work *work) if (ret) { ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID, &work->path[0].sgid, - sizeof work->path[0].sgid, NULL, 0); + sizeof(work->path[0].sgid), NULL, 0); goto rejected; } } @@ -1945,6 +1978,8 @@ static int cm_req_handler(struct cm_work *work) rejected: atomic_dec(&cm_id_priv->refcount); cm_deref_id(listen_cm_id_priv); +free_timeinfo: + kfree(cm_id_priv->timewait_info); destroy: ib_destroy_cm_id(cm_id); return ret; @@ -1997,6 +2032,8 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REQ_RCVD && cm_id->state != IB_CM_MRA_REQ_SENT) { + pr_debug("%s: local_comm_id %d, cm_id->state: %d\n", __func__, + be32_to_cpu(cm_id_priv->id.local_id), cm_id->state); ret = -EINVAL; goto out; } @@ -2063,6 +2100,8 @@ int ib_send_cm_rtu(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REP_RCVD && cm_id->state != IB_CM_MRA_REP_SENT) { + pr_debug("%s: local_id %d, cm_id->state %d\n", __func__, + be32_to_cpu(cm_id->local_id), cm_id->state); ret = -EINVAL; goto error; } @@ -2170,6 +2209,8 @@ static int cm_rep_handler(struct cm_work *work) cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0); if (!cm_id_priv) { cm_dup_rep_handler(work); + pr_debug("%s: remote_comm_id %d, no cm_id_priv\n", __func__, + be32_to_cpu(rep_msg->remote_comm_id)); return -EINVAL; } @@ -2183,6 +2224,10 @@ static int cm_rep_handler(struct cm_work *work) default: spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; + pr_debug("%s: cm_id_priv->id.state: %d, local_comm_id %d, remote_comm_id %d\n", + __func__, cm_id_priv->id.state, + be32_to_cpu(rep_msg->local_comm_id), + be32_to_cpu(rep_msg->remote_comm_id)); goto error; } @@ -2196,6 +2241,8 @@ static int cm_rep_handler(struct cm_work *work) spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; + pr_debug("%s: Failed to insert remote id %d\n", __func__, + be32_to_cpu(rep_msg->remote_comm_id)); goto error; } /* Check for a stale connection. */ @@ -2213,6 +2260,10 @@ static int cm_rep_handler(struct cm_work *work) IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, NULL, 0); ret = -EINVAL; + pr_debug("%s: Stale connection. local_comm_id %d, remote_comm_id %d\n", + __func__, be32_to_cpu(rep_msg->local_comm_id), + be32_to_cpu(rep_msg->remote_comm_id)); + if (cur_cm_id_priv) { cm_id = &cur_cm_id_priv->id; ib_send_cm_dreq(cm_id, NULL, 0); @@ -2359,6 +2410,8 @@ int ib_send_cm_dreq(struct ib_cm_id *cm_id, cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_ESTABLISHED) { + pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, + be32_to_cpu(cm_id->local_id), cm_id->state); ret = -EINVAL; goto out; } @@ -2428,6 +2481,8 @@ int ib_send_cm_drep(struct ib_cm_id *cm_id, if (cm_id->state != IB_CM_DREQ_RCVD) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); kfree(data); + pr_debug("%s: local_id %d, cm_idcm_id->state(%d) != IB_CM_DREQ_RCVD\n", + __func__, be32_to_cpu(cm_id->local_id), cm_id->state); return -EINVAL; } @@ -2493,6 +2548,9 @@ static int cm_dreq_handler(struct cm_work *work) atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. counter[CM_DREQ_COUNTER]); cm_issue_drep(work->port, work->mad_recv_wc); + pr_debug("%s: no cm_id_priv, local_comm_id %d, remote_comm_id %d\n", + __func__, be32_to_cpu(dreq_msg->local_comm_id), + be32_to_cpu(dreq_msg->remote_comm_id)); return -EINVAL; } @@ -2535,6 +2593,9 @@ static int cm_dreq_handler(struct cm_work *work) counter[CM_DREQ_COUNTER]); goto unlock; default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); goto unlock; } cm_id_priv->id.state = IB_CM_DREQ_RCVD; @@ -2638,6 +2699,8 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id, cm_enter_timewait(cm_id_priv); break; default: + pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, + be32_to_cpu(cm_id_priv->id.local_id), cm_id->state); ret = -EINVAL; goto out; } @@ -2748,6 +2811,9 @@ static int cm_rej_handler(struct cm_work *work) /* fall through */ default: spin_unlock_irq(&cm_id_priv->lock); + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); ret = -EINVAL; goto out; } @@ -2811,6 +2877,9 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id, } /* fall through */ default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); ret = -EINVAL; goto error1; } @@ -2912,6 +2981,9 @@ static int cm_mra_handler(struct cm_work *work) counter[CM_MRA_COUNTER]); /* fall through */ default: + pr_debug("%s local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); goto out; } @@ -3085,6 +3157,12 @@ static int cm_lap_handler(struct cm_work *work) if (!cm_id_priv) return -EINVAL; + ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + if (ret) + goto deref; + param = &work->cm_event.param.lap_rcvd; memset(&work->path[0], 0, sizeof(work->path[1])); cm_path_set_rec_type(work->port->cm_dev->ib_device, @@ -3131,9 +3209,6 @@ static int cm_lap_handler(struct cm_work *work) cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; cm_id_priv->tid = lap_msg->hdr.tid; - cm_init_av_for_response(work->port, work->mad_recv_wc->wc, - work->mad_recv_wc->recv_buf.grh, - &cm_id_priv->av); cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av, cm_id_priv); ret = atomic_inc_and_test(&cm_id_priv->work_count); @@ -3386,6 +3461,7 @@ static int cm_sidr_req_handler(struct cm_work *work) struct cm_id_private *cm_id_priv, *cur_cm_id_priv; struct cm_sidr_req_msg *sidr_req_msg; struct ib_wc *wc; + int ret; cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL); if (IS_ERR(cm_id)) @@ -3398,9 +3474,12 @@ static int cm_sidr_req_handler(struct cm_work *work) wc = work->mad_recv_wc->wc; cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid); cm_id_priv->av.dgid.global.interface_id = 0; - cm_init_av_for_response(work->port, work->mad_recv_wc->wc, - work->mad_recv_wc->recv_buf.grh, - &cm_id_priv->av); + ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + if (ret) + goto out; + cm_id_priv->id.remote_id = sidr_req_msg->request_id; cm_id_priv->tid = sidr_req_msg->hdr.tid; atomic_inc(&cm_id_priv->work_count); @@ -3692,6 +3771,7 @@ static void cm_work_handler(struct work_struct *_work) ret = cm_timewait_handler(work); break; default: + pr_debug("cm_event.event: 0x%x\n", work->cm_event.event); ret = -EINVAL; break; } @@ -3727,6 +3807,8 @@ static int cm_establish(struct ib_cm_id *cm_id) ret = -EISCONN; break; default: + pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, + be32_to_cpu(cm_id->local_id), cm_id->state); ret = -EINVAL; break; } @@ -3924,6 +4006,9 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); ret = -EINVAL; break; } @@ -3971,6 +4056,9 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); ret = -EINVAL; break; } @@ -4030,6 +4118,9 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); ret = -EINVAL; break; } diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 6294a7001d33..e66963ca58bd 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -601,7 +601,7 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a int ret; if (addr->sa_family != AF_IB) { - ret = rdma_translate_ip(addr, dev_addr, NULL); + ret = rdma_translate_ip(addr, dev_addr); } else { cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); ret = 0; @@ -612,11 +612,14 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a static inline int cma_validate_port(struct ib_device *device, u8 port, enum ib_gid_type gid_type, - union ib_gid *gid, int dev_type, - int bound_if_index) + union ib_gid *gid, + struct rdma_id_private *id_priv) { - int ret = -ENODEV; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + int bound_if_index = dev_addr->bound_dev_if; + int dev_type = dev_addr->dev_type; struct net_device *ndev = NULL; + int ret = -ENODEV; if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) return ret; @@ -624,11 +627,13 @@ static inline int cma_validate_port(struct ib_device *device, u8 port, if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) return ret; - if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) - ndev = dev_get_by_index(&init_net, bound_if_index); - else + if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { + ndev = dev_get_by_index(dev_addr->net, bound_if_index); + if (!ndev) + return ret; + } else { gid_type = IB_GID_TYPE_IB; - + } ret = ib_find_cached_gid_by_port(device, gid, gid_type, port, ndev, NULL); @@ -669,8 +674,7 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, rdma_protocol_ib(cma_dev->device, port) ? IB_GID_TYPE_IB : listen_id_priv->gid_type, gidp, - dev_addr->dev_type, - dev_addr->bound_dev_if); + id_priv); if (!ret) { id_priv->id.port_num = port; goto out; @@ -691,8 +695,7 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, rdma_protocol_ib(cma_dev->device, port) ? IB_GID_TYPE_IB : cma_dev->default_gid_type[port - 1], - gidp, dev_addr->dev_type, - dev_addr->bound_dev_if); + gidp, id_priv); if (!ret) { id_priv->id.port_num = port; goto out; @@ -2036,6 +2039,33 @@ __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) } EXPORT_SYMBOL(rdma_get_service_id); +void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid, + union ib_gid *dgid) +{ + struct rdma_addr *addr = &cm_id->route.addr; + + if (!cm_id->device) { + if (sgid) + memset(sgid, 0, sizeof(*sgid)); + if (dgid) + memset(dgid, 0, sizeof(*dgid)); + return; + } + + if (rdma_protocol_roce(cm_id->device, cm_id->port_num)) { + if (sgid) + rdma_ip2gid((struct sockaddr *)&addr->src_addr, sgid); + if (dgid) + rdma_ip2gid((struct sockaddr *)&addr->dst_addr, dgid); + } else { + if (sgid) + rdma_addr_get_sgid(&addr->dev_addr, sgid); + if (dgid) + rdma_addr_get_dgid(&addr->dev_addr, dgid); + } +} +EXPORT_SYMBOL(rdma_read_gids); + static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; @@ -2132,7 +2162,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; - ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr, NULL); + ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); @@ -2414,6 +2444,26 @@ out: kfree(work); } +static void cma_init_resolve_route_work(struct cma_work *work, + struct rdma_id_private *id_priv) +{ + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ROUTE_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; +} + +static void cma_init_resolve_addr_work(struct cma_work *work, + struct rdma_id_private *id_priv) +{ + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ADDR_QUERY; + work->new_state = RDMA_CM_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; +} + static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) { struct rdma_route *route = &id_priv->id.route; @@ -2424,11 +2474,7 @@ static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) if (!work) return -ENOMEM; - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ROUTE_RESOLVED; - work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + cma_init_resolve_route_work(work, id_priv); route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { @@ -2449,10 +2495,63 @@ err1: return ret; } -int rdma_set_ib_paths(struct rdma_cm_id *id, - struct sa_path_rec *path_rec, int num_paths) +static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, + unsigned long supported_gids, + enum ib_gid_type default_gid) +{ + if ((network_type == RDMA_NETWORK_IPV4 || + network_type == RDMA_NETWORK_IPV6) && + test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) + return IB_GID_TYPE_ROCE_UDP_ENCAP; + + return default_gid; +} + +/* + * cma_iboe_set_path_rec_l2_fields() is helper function which sets + * path record type based on GID type. + * It also sets up other L2 fields which includes destination mac address + * netdev ifindex, of the path record. + * It returns the netdev of the bound interface for this path record entry. + */ +static struct net_device * +cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv) +{ + struct rdma_route *route = &id_priv->id.route; + enum ib_gid_type gid_type = IB_GID_TYPE_ROCE; + struct rdma_addr *addr = &route->addr; + unsigned long supported_gids; + struct net_device *ndev; + + if (!addr->dev_addr.bound_dev_if) + return NULL; + + ndev = dev_get_by_index(addr->dev_addr.net, + addr->dev_addr.bound_dev_if); + if (!ndev) + return NULL; + + supported_gids = roce_gid_type_mask_support(id_priv->id.device, + id_priv->id.port_num); + gid_type = cma_route_gid_type(addr->dev_addr.network, + supported_gids, + id_priv->gid_type); + /* Use the hint from IP Stack to select GID Type */ + if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) + gid_type = ib_network_to_gid_type(addr->dev_addr.network); + route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); + + sa_path_set_ndev(route->path_rec, addr->dev_addr.net); + sa_path_set_ifindex(route->path_rec, ndev->ifindex); + sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); + return ndev; +} + +int rdma_set_ib_path(struct rdma_cm_id *id, + struct sa_path_rec *path_rec) { struct rdma_id_private *id_priv; + struct net_device *ndev; int ret; id_priv = container_of(id, struct rdma_id_private, id); @@ -2460,20 +2559,33 @@ int rdma_set_ib_paths(struct rdma_cm_id *id, RDMA_CM_ROUTE_RESOLVED)) return -EINVAL; - id->route.path_rec = kmemdup(path_rec, sizeof *path_rec * num_paths, + id->route.path_rec = kmemdup(path_rec, sizeof(*path_rec), GFP_KERNEL); if (!id->route.path_rec) { ret = -ENOMEM; goto err; } - id->route.num_paths = num_paths; + if (rdma_protocol_roce(id->device, id->port_num)) { + ndev = cma_iboe_set_path_rec_l2_fields(id_priv); + if (!ndev) { + ret = -ENODEV; + goto err_free; + } + dev_put(ndev); + } + + id->route.num_paths = 1; return 0; + +err_free: + kfree(id->route.path_rec); + id->route.path_rec = NULL; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); return ret; } -EXPORT_SYMBOL(rdma_set_ib_paths); +EXPORT_SYMBOL(rdma_set_ib_path); static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) { @@ -2483,11 +2595,7 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) if (!work) return -ENOMEM; - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ROUTE_RESOLVED; - work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; } @@ -2510,26 +2618,14 @@ static int iboe_tos_to_sl(struct net_device *ndev, int tos) return 0; } -static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, - unsigned long supported_gids, - enum ib_gid_type default_gid) -{ - if ((network_type == RDMA_NETWORK_IPV4 || - network_type == RDMA_NETWORK_IPV6) && - test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) - return IB_GID_TYPE_ROCE_UDP_ENCAP; - - return default_gid; -} - static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; struct rdma_addr *addr = &route->addr; struct cma_work *work; int ret; - struct net_device *ndev = NULL; - enum ib_gid_type gid_type = IB_GID_TYPE_IB; + struct net_device *ndev; + u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; u8 tos = id_priv->tos_set ? id_priv->tos : default_roce_tos; @@ -2539,9 +2635,6 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) if (!work) return -ENOMEM; - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; @@ -2550,42 +2643,17 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->num_paths = 1; - if (addr->dev_addr.bound_dev_if) { - unsigned long supported_gids; - - ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); - if (!ndev) { - ret = -ENODEV; - goto err2; - } - - supported_gids = roce_gid_type_mask_support(id_priv->id.device, - id_priv->id.port_num); - gid_type = cma_route_gid_type(addr->dev_addr.network, - supported_gids, - id_priv->gid_type); - route->path_rec->rec_type = - sa_conv_gid_to_pathrec_type(gid_type); - sa_path_set_ndev(route->path_rec, &init_net); - sa_path_set_ifindex(route->path_rec, ndev->ifindex); - } + ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { ret = -ENODEV; goto err2; } - sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); - rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); - /* Use the hint from IP Stack to select GID Type */ - if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) - gid_type = ib_network_to_gid_type(addr->dev_addr.network); - route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); - if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) /* TODO: get the hoplimit from the inet/inet6 device */ route->path_rec->hop_limit = addr->dev_addr.hoplimit; @@ -2607,11 +2675,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) goto err2; } - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ROUTE_RESOLVED; - work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; - work->event.status = 0; - + cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; @@ -2791,11 +2855,7 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv) rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - work->old_state = RDMA_CM_ADDR_QUERY; - work->new_state = RDMA_CM_ADDR_RESOLVED; - work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + cma_init_resolve_addr_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; err: @@ -2821,11 +2881,7 @@ static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - work->old_state = RDMA_CM_ADDR_QUERY; - work->new_state = RDMA_CM_ADDR_RESOLVED; - work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + cma_init_resolve_addr_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; err: @@ -3404,9 +3460,10 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, event.status = ret; break; } - ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num, - id_priv->id.route.path_rec, - &event.param.ud.ah_attr); + ib_init_ah_attr_from_path(id_priv->id.device, + id_priv->id.port_num, + id_priv->id.route.path_rec, + &event.param.ud.ah_attr); event.param.ud.qp_num = rep->qpn; event.param.ud.qkey = rep->qkey; event.event = RDMA_CM_EVENT_ESTABLISHED; @@ -3873,7 +3930,7 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct net_device *ndev = - dev_get_by_index(&init_net, dev_addr->bound_dev_if); + dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); enum ib_gid_type gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; @@ -4010,8 +4067,10 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { - mgid->raw[0] = (gid_type == IB_GID_TYPE_IB) ? 0xff : 0; - mgid->raw[1] = (gid_type == IB_GID_TYPE_IB) ? 0x0e : 0; + mgid->raw[0] = + (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0xff; + mgid->raw[1] = + (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0x0e; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; @@ -4061,7 +4120,7 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); if (dev_addr->bound_dev_if) - ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!ndev) { err = -ENODEV; goto out2; @@ -4179,7 +4238,7 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) struct net_device *ndev = NULL; if (dev_addr->bound_dev_if) - ndev = dev_get_by_index(&init_net, + ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (ndev) { cma_igmp_send(ndev, @@ -4235,7 +4294,7 @@ static int cma_netdev_callback(struct notifier_block *self, unsigned long event, if (event != NETDEV_BONDING_FAILOVER) return NOTIFY_DONE; - if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING)) + if (!netif_is_bond_master(ndev)) return NOTIFY_DONE; mutex_lock(&lock); @@ -4432,7 +4491,7 @@ static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) RDMA_NL_RDMA_CM_ATTR_SRC_ADDR)) goto out; if (ibnl_put_attr(skb, nlh, - rdma_addr_size(cma_src_addr(id_priv)), + rdma_addr_size(cma_dst_addr(id_priv)), cma_dst_addr(id_priv), RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) goto out; @@ -4444,6 +4503,7 @@ static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) id_stats->qp_type = id->qp_type; i_id++; + nlmsg_end(skb, nlh); } cb->args[1] = 0; diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c index 31dfee0c8295..eee38b40be99 100644 --- a/drivers/infiniband/core/cma_configfs.c +++ b/drivers/infiniband/core/cma_configfs.c @@ -295,7 +295,7 @@ static struct config_group *make_cma_dev(struct config_group *group, goto fail; } - strncpy(cma_dev_group->name, name, sizeof(cma_dev_group->name)); + strlcpy(cma_dev_group->name, name, sizeof(cma_dev_group->name)); config_group_init_type_name(&cma_dev_group->ports_group, "ports", &cma_ports_group_type); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 66f0268f37a6..c4560d84dfae 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -40,8 +40,12 @@ #include <rdma/ib_verbs.h> #include <rdma/opa_addr.h> #include <rdma/ib_mad.h> +#include <rdma/restrack.h> #include "mad_priv.h" +/* Total number of ports combined across all struct ib_devices's */ +#define RDMA_MAX_PORTS 1024 + struct pkey_index_qp_list { struct list_head pkey_index_list; u16 pkey_index; @@ -137,7 +141,6 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, int roce_gid_mgmt_init(void); void roce_gid_mgmt_cleanup(void); -int roce_rescan_device(struct ib_device *ib_dev); unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port); int ib_cache_setup_one(struct ib_device *device); @@ -191,13 +194,6 @@ void ib_sa_cleanup(void); int rdma_nl_init(void); void rdma_nl_exit(void); -/** - * Check if there are any listeners to the netlink group - * @group: the netlink group ID - * Returns 0 on success or a negative for no listeners. - */ -int ibnl_chk_listeners(unsigned int group); - int ib_nl_handle_resolve_resp(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); @@ -213,11 +209,6 @@ int ib_get_cached_subnet_prefix(struct ib_device *device, u64 *sn_pfx); #ifdef CONFIG_SECURITY_INFINIBAND -int ib_security_pkey_access(struct ib_device *dev, - u8 port_num, - u16 pkey_index, - void *sec); - void ib_security_destroy_port_pkey_list(struct ib_device *device); void ib_security_cache_change(struct ib_device *device, @@ -240,14 +231,6 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent); int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index); #else -static inline int ib_security_pkey_access(struct ib_device *dev, - u8 port_num, - u16 pkey_index, - void *sec) -{ - return 0; -} - static inline void ib_security_destroy_port_pkey_list(struct ib_device *device) { } @@ -318,4 +301,31 @@ struct ib_device *ib_device_get_by_index(u32 ifindex); /* RDMA device netlink */ void nldev_init(void); void nldev_exit(void); + +static inline struct ib_qp *_ib_create_qp(struct ib_device *dev, + struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + struct ib_qp *qp; + + qp = dev->create_qp(pd, attr, udata); + if (IS_ERR(qp)) + return qp; + + qp->device = dev; + qp->pd = pd; + /* + * We don't track XRC QPs for now, because they don't have PD + * and more importantly they are created internaly by driver, + * see mlx5 create_dev_resources() as an example. + */ + if (attr->qp_type < IB_QPT_XRC_INI) { + qp->res.type = RDMA_RESTRACK_QP; + rdma_restrack_add(&qp->res); + } else + qp->res.valid = false; + + return qp; +} #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index f2ae75fa3128..bc79ca8215d7 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -25,9 +25,10 @@ #define IB_POLL_FLAGS \ (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) -static int __ib_process_cq(struct ib_cq *cq, int budget) +static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *poll_wc) { int i, n, completed = 0; + struct ib_wc *wcs = poll_wc ? : cq->wc; /* * budget might be (-1) if the caller does not @@ -35,9 +36,9 @@ static int __ib_process_cq(struct ib_cq *cq, int budget) * minimum here. */ while ((n = ib_poll_cq(cq, min_t(u32, IB_POLL_BATCH, - budget - completed), cq->wc)) > 0) { + budget - completed), wcs)) > 0) { for (i = 0; i < n; i++) { - struct ib_wc *wc = &cq->wc[i]; + struct ib_wc *wc = &wcs[i]; if (wc->wr_cqe) wc->wr_cqe->done(cq, wc); @@ -60,18 +61,20 @@ static int __ib_process_cq(struct ib_cq *cq, int budget) * @cq: CQ to process * @budget: number of CQEs to poll for * - * This function is used to process all outstanding CQ entries on a - * %IB_POLL_DIRECT CQ. It does not offload CQ processing to a different - * context and does not ask for completion interrupts from the HCA. + * This function is used to process all outstanding CQ entries. + * It does not offload CQ processing to a different context and does + * not ask for completion interrupts from the HCA. + * Using direct processing on CQ with non IB_POLL_DIRECT type may trigger + * concurrent processing. * * Note: do not pass -1 as %budget unless it is guaranteed that the number * of completions that will be processed is small. */ int ib_process_cq_direct(struct ib_cq *cq, int budget) { - WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT); + struct ib_wc wcs[IB_POLL_BATCH]; - return __ib_process_cq(cq, budget); + return __ib_process_cq(cq, budget, wcs); } EXPORT_SYMBOL(ib_process_cq_direct); @@ -85,7 +88,7 @@ static int ib_poll_handler(struct irq_poll *iop, int budget) struct ib_cq *cq = container_of(iop, struct ib_cq, iop); int completed; - completed = __ib_process_cq(cq, budget); + completed = __ib_process_cq(cq, budget, NULL); if (completed < budget) { irq_poll_complete(&cq->iop); if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) @@ -105,7 +108,7 @@ static void ib_cq_poll_work(struct work_struct *work) struct ib_cq *cq = container_of(work, struct ib_cq, work); int completed; - completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE); + completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, NULL); if (completed >= IB_POLL_BUDGET_WORKQUEUE || ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) queue_work(ib_comp_wq, &cq->work); @@ -117,20 +120,22 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) } /** - * ib_alloc_cq - allocate a completion queue + * __ib_alloc_cq - allocate a completion queue * @dev: device to allocate the CQ for * @private: driver private data, accessible from cq->cq_context * @nr_cqe: number of CQEs to allocate * @comp_vector: HCA completion vectors for this CQ * @poll_ctx: context to poll the CQ from. + * @caller: module owner name. * * This is the proper interface to allocate a CQ for in-kernel users. A * CQ allocated with this interface will automatically be polled from the * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id * to use this CQ abstraction. */ -struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, - int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx) +struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, + int nr_cqe, int comp_vector, + enum ib_poll_context poll_ctx, const char *caller) { struct ib_cq_init_attr cq_attr = { .cqe = nr_cqe, @@ -154,6 +159,10 @@ struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, if (!cq->wc) goto out_destroy_cq; + cq->res.type = RDMA_RESTRACK_CQ; + cq->res.kern_name = caller; + rdma_restrack_add(&cq->res); + switch (cq->poll_ctx) { case IB_POLL_DIRECT: cq->comp_handler = ib_cq_completion_direct; @@ -178,11 +187,12 @@ struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, out_free_wc: kfree(cq->wc); + rdma_restrack_del(&cq->res); out_destroy_cq: cq->device->destroy_cq(cq); return ERR_PTR(ret); } -EXPORT_SYMBOL(ib_alloc_cq); +EXPORT_SYMBOL(__ib_alloc_cq); /** * ib_free_cq - free a completion queue @@ -209,6 +219,7 @@ void ib_free_cq(struct ib_cq *cq) } kfree(cq->wc); + rdma_restrack_del(&cq->res); ret = cq->device->destroy_cq(cq); WARN_ON_ONCE(ret); } diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 465520627e4b..e8010e73a1cf 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -263,6 +263,8 @@ struct ib_device *ib_alloc_device(size_t size) if (!device) return NULL; + rdma_restrack_init(&device->res); + device->dev.class = &ib_class; device_initialize(&device->dev); @@ -288,7 +290,7 @@ void ib_dealloc_device(struct ib_device *device) { WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && device->reg_state != IB_DEV_UNINITIALIZED); - kobject_put(&device->dev.kobj); + put_device(&device->dev); } EXPORT_SYMBOL(ib_dealloc_device); @@ -462,7 +464,6 @@ int ib_register_device(struct ib_device *device, struct ib_udata uhw = {.outlen = 0, .inlen = 0}; struct device *parent = device->dev.parent; - WARN_ON_ONCE(!parent); WARN_ON_ONCE(device->dma_device); if (device->dev.dma_ops) { /* @@ -471,16 +472,25 @@ int ib_register_device(struct ib_device *device, * into device->dev. */ device->dma_device = &device->dev; - if (!device->dev.dma_mask) - device->dev.dma_mask = parent->dma_mask; - if (!device->dev.coherent_dma_mask) - device->dev.coherent_dma_mask = - parent->coherent_dma_mask; + if (!device->dev.dma_mask) { + if (parent) + device->dev.dma_mask = parent->dma_mask; + else + WARN_ON_ONCE(true); + } + if (!device->dev.coherent_dma_mask) { + if (parent) + device->dev.coherent_dma_mask = + parent->coherent_dma_mask; + else + WARN_ON_ONCE(true); + } } else { /* * The caller did not provide custom DMA operations. Use the * DMA mapping operations of the parent device. */ + WARN_ON_ONCE(!parent); device->dma_device = parent; } @@ -588,6 +598,8 @@ void ib_unregister_device(struct ib_device *device) } up_read(&lists_rwsem); + rdma_restrack_clean(&device->res); + ib_device_unregister_rdmacg(device); ib_device_unregister_sysfs(device); @@ -1033,32 +1045,22 @@ EXPORT_SYMBOL(ib_modify_port); /** * ib_find_gid - Returns the port number and GID table index where - * a specified GID value occurs. + * a specified GID value occurs. Its searches only for IB link layer. * @device: The device to query. * @gid: The GID value to search for. - * @gid_type: Type of GID. * @ndev: The ndev related to the GID to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, - enum ib_gid_type gid_type, struct net_device *ndev, - u8 *port_num, u16 *index) + struct net_device *ndev, u8 *port_num, u16 *index) { union ib_gid tmp_gid; int ret, port, i; for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { - if (rdma_cap_roce_gid_table(device, port)) { - if (!ib_find_cached_gid_by_port(device, gid, gid_type, port, - ndev, index)) { - *port_num = port; - return 0; - } - } - - if (gid_type != IB_GID_TYPE_IB) + if (rdma_cap_roce_gid_table(device, port)) continue; for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index 84d2615b5d4b..a0a9ed719031 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -388,13 +388,11 @@ int ib_flush_fmr_pool(struct ib_fmr_pool *pool) EXPORT_SYMBOL(ib_flush_fmr_pool); /** - * ib_fmr_pool_map_phys - - * @pool:FMR pool to allocate FMR from - * @page_list:List of pages to map - * @list_len:Number of pages in @page_list - * @io_virtual_address:I/O virtual address for new FMR - * - * Map an FMR from an FMR pool. + * ib_fmr_pool_map_phys - Map an FMR from an FMR pool. + * @pool_handle: FMR pool to allocate FMR from + * @page_list: List of pages to map + * @list_len: Number of pages in @page_list + * @io_virtual_address: I/O virtual address for new FMR */ struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, u64 *page_list, diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index 3c4faadb8cdd..81528f64061a 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -654,6 +654,7 @@ int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) } skb_num++; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + ret = -EINVAL; for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { hlist_for_each_entry(map_info, &iwpm_hash_bucket[i], hlist_node) { diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index cb91245e9163..c50596f7f98a 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -49,7 +49,6 @@ #include "smi.h" #include "opa_smi.h" #include "agent.h" -#include "core_priv.h" static int mad_sendq_size = IB_MAD_QP_SEND_SIZE; static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 1fb72c356e36..3ccaae18ad75 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -41,8 +41,6 @@ #include <linux/module.h> #include "core_priv.h" -#include "core_priv.h" - static DEFINE_MUTEX(rdma_nl_mutex); static struct sock *nls; static struct { @@ -83,15 +81,13 @@ static bool is_nl_valid(unsigned int type, unsigned int op) if (!is_nl_msg_valid(type, op)) return false; - cb_table = rdma_nl_types[type].cb_table; -#ifdef CONFIG_MODULES - if (!cb_table) { + if (!rdma_nl_types[type].cb_table) { mutex_unlock(&rdma_nl_mutex); request_module("rdma-netlink-subsys-%d", type); mutex_lock(&rdma_nl_mutex); - cb_table = rdma_nl_types[type].cb_table; } -#endif + + cb_table = rdma_nl_types[type].cb_table; if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit)) return false; diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 0dcd1aa6f683..fa8655e3b3ed 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -31,6 +31,8 @@ */ #include <linux/module.h> +#include <linux/pid.h> +#include <linux/pid_namespace.h> #include <net/netlink.h> #include <rdma/rdma_netlink.h> @@ -52,16 +54,42 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_SUMMARY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME] = { .type = NLA_NUL_STRING, + .len = 16 }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_QP] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_QP_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_LQPN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_RQPN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_RQ_PSN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_SQ_PSN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, + .len = TASK_COMM_LEN }, }; -static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) +static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) { - char fw[IB_FW_VERSION_NAME_MAX]; - if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) return -EMSGSIZE; + + return 0; +} + +static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) +{ + char fw[IB_FW_VERSION_NAME_MAX]; + + if (fill_nldev_handle(msg, device)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, rdma_end_port(device))) return -EMSGSIZE; @@ -92,10 +120,9 @@ static int fill_port_info(struct sk_buff *msg, struct ib_port_attr attr; int ret; - if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) - return -EMSGSIZE; - if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) + if (fill_nldev_handle(msg, device)) return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) return -EMSGSIZE; @@ -126,6 +153,137 @@ static int fill_port_info(struct sk_buff *msg, return 0; } +static int fill_res_info_entry(struct sk_buff *msg, + const char *name, u64 curr) +{ + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME, name)) + goto err; + if (nla_put_u64_64bit(msg, + RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR, curr, 0)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); + return -EMSGSIZE; +} + +static int fill_res_info(struct sk_buff *msg, struct ib_device *device) +{ + static const char * const names[RDMA_RESTRACK_MAX] = { + [RDMA_RESTRACK_PD] = "pd", + [RDMA_RESTRACK_CQ] = "cq", + [RDMA_RESTRACK_QP] = "qp", + }; + + struct rdma_restrack_root *res = &device->res; + struct nlattr *table_attr; + int ret, i, curr; + + if (fill_nldev_handle(msg, device)) + return -EMSGSIZE; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY); + if (!table_attr) + return -EMSGSIZE; + + for (i = 0; i < RDMA_RESTRACK_MAX; i++) { + if (!names[i]) + continue; + curr = rdma_restrack_count(res, i, task_active_pid_ns(current)); + ret = fill_res_info_entry(msg, names[i], curr); + if (ret) + goto err; + } + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return ret; +} + +static int fill_res_qp_entry(struct sk_buff *msg, + struct ib_qp *qp, uint32_t port) +{ + struct rdma_restrack_entry *res = &qp->res; + struct ib_qp_init_attr qp_init_attr; + struct nlattr *entry_attr; + struct ib_qp_attr qp_attr; + int ret; + + ret = ib_query_qp(qp, &qp_attr, 0, &qp_init_attr); + if (ret) + return ret; + + if (port && port != qp_attr.port_num) + return 0; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); + if (!entry_attr) + goto out; + + /* In create_qp() port is not set yet */ + if (qp_attr.port_num && + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp_attr.port_num)) + goto err; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num)) + goto err; + if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQPN, + qp_attr.dest_qp_num)) + goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQ_PSN, + qp_attr.rq_psn)) + goto err; + } + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SQ_PSN, qp_attr.sq_psn)) + goto err; + + if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC || + qp->qp_type == IB_QPT_XRC_INI || qp->qp_type == IB_QPT_XRC_TGT) { + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE, + qp_attr.path_mig_state)) + goto err; + } + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, qp->qp_type)) + goto err; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state)) + goto err; + + /* + * Existence of task means that it is user QP and netlink + * user is invited to go and read /proc/PID/comm to get name + * of the task file and res->task_com should be NULL. + */ + if (rdma_is_kernel_res(res)) { + if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME, res->kern_name)) + goto err; + } else { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID, task_pid_vnr(res->task))) + goto err; + } + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); +out: + return -EMSGSIZE; +} + static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -321,6 +479,213 @@ out: return skb->len; } +static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + goto err; + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), + 0, 0); + + ret = fill_res_info(msg, device); + if (ret) + goto err_free; + + nlmsg_end(msg, nlh); + put_device(&device->dev); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_free: + nlmsg_free(msg); +err: + put_device(&device->dev); + return ret; +} + +static int _nldev_res_get_dumpit(struct ib_device *device, + struct sk_buff *skb, + struct netlink_callback *cb, + unsigned int idx) +{ + int start = cb->args[0]; + struct nlmsghdr *nlh; + + if (idx < start) + return 0; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), + 0, NLM_F_MULTI); + + if (fill_res_info(skb, device)) { + nlmsg_cancel(skb, nlh); + goto out; + } + + nlmsg_end(skb, nlh); + + idx++; + +out: + cb->args[0] = idx; + return skb->len; +} + +static int nldev_res_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return ib_enum_all_devs(_nldev_res_get_dumpit, skb, cb); +} + +static int nldev_res_get_qp_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct rdma_restrack_entry *res; + int err, ret = 0, idx = 0; + struct nlattr *table_attr; + struct ib_device *device; + int start = cb->args[0]; + struct ib_qp *qp = NULL; + struct nlmsghdr *nlh; + u32 index, port = 0; + + err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NULL); + /* + * Right now, we are expecting the device index to get QP information, + * but it is possible to extend this code to return all devices in + * one shot by checking the existence of RDMA_NLDEV_ATTR_DEV_INDEX. + * if it doesn't exist, we will iterate over all devices. + * + * But it is not needed for now. + */ + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + /* + * If no PORT_INDEX is supplied, we will return all QPs from that device + */ + if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err_index; + } + } + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_QP_GET), + 0, NLM_F_MULTI); + + if (fill_nldev_handle(skb, device)) { + ret = -EMSGSIZE; + goto err; + } + + table_attr = nla_nest_start(skb, RDMA_NLDEV_ATTR_RES_QP); + if (!table_attr) { + ret = -EMSGSIZE; + goto err; + } + + down_read(&device->res.rwsem); + hash_for_each_possible(device->res.hash, res, node, RDMA_RESTRACK_QP) { + if (idx < start) + goto next; + + if ((rdma_is_kernel_res(res) && + task_active_pid_ns(current) != &init_pid_ns) || + (!rdma_is_kernel_res(res) && + task_active_pid_ns(current) != task_active_pid_ns(res->task))) + /* + * 1. Kernel QPs should be visible in init namspace only + * 2. Present only QPs visible in the current namespace + */ + goto next; + + if (!rdma_restrack_get(res)) + /* + * Resource is under release now, but we are not + * relesing lock now, so it will be released in + * our next pass, once we will get ->next pointer. + */ + goto next; + + qp = container_of(res, struct ib_qp, res); + + up_read(&device->res.rwsem); + ret = fill_res_qp_entry(skb, qp, port); + down_read(&device->res.rwsem); + /* + * Return resource back, but it won't be released till + * the &device->res.rwsem will be released for write. + */ + rdma_restrack_put(res); + + if (ret == -EMSGSIZE) + /* + * There is a chance to optimize here. + * It can be done by using list_prepare_entry + * and list_for_each_entry_continue afterwards. + */ + break; + if (ret) + goto res_err; +next: idx++; + } + up_read(&device->res.rwsem); + + nla_nest_end(skb, table_attr); + nlmsg_end(skb, nlh); + cb->args[0] = idx; + + /* + * No more QPs to fill, cancel the message and + * return 0 to mark end of dumpit. + */ + if (!qp) + goto err; + + put_device(&device->dev); + return skb->len; + +res_err: + nla_nest_cancel(skb, table_attr); + up_read(&device->res.rwsem); + +err: + nlmsg_cancel(skb, nlh); + +err_index: + put_device(&device->dev); + return ret; +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -330,6 +695,23 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_port_get_doit, .dump = nldev_port_get_dumpit, }, + [RDMA_NLDEV_CMD_RES_GET] = { + .doit = nldev_res_get_doit, + .dump = nldev_res_get_dumpit, + }, + [RDMA_NLDEV_CMD_RES_QP_GET] = { + .dump = nldev_res_get_qp_dumpit, + /* + * .doit is not implemented yet for two reasons: + * 1. It is not needed yet. + * 2. There is a need to provide identifier, while it is easy + * for the QPs (device index + port index + LQPN), it is not + * the case for the rest of resources (PD and CQ). Because it + * is better to provide similar interface for all resources, + * let's wait till we will have other resources implemented + * too. + */ + }, }; void __init nldev_init(void) diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c new file mode 100644 index 000000000000..857637bf46da --- /dev/null +++ b/drivers/infiniband/core/restrack.c @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* + * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved. + */ + +#include <rdma/ib_verbs.h> +#include <rdma/restrack.h> +#include <linux/mutex.h> +#include <linux/sched/task.h> +#include <linux/uaccess.h> +#include <linux/pid_namespace.h> + +void rdma_restrack_init(struct rdma_restrack_root *res) +{ + init_rwsem(&res->rwsem); +} + +void rdma_restrack_clean(struct rdma_restrack_root *res) +{ + WARN_ON_ONCE(!hash_empty(res->hash)); +} + +int rdma_restrack_count(struct rdma_restrack_root *res, + enum rdma_restrack_type type, + struct pid_namespace *ns) +{ + struct rdma_restrack_entry *e; + u32 cnt = 0; + + down_read(&res->rwsem); + hash_for_each_possible(res->hash, e, node, type) { + if (ns == &init_pid_ns || + (!rdma_is_kernel_res(e) && + ns == task_active_pid_ns(e->task))) + cnt++; + } + up_read(&res->rwsem); + return cnt; +} +EXPORT_SYMBOL(rdma_restrack_count); + +static void set_kern_name(struct rdma_restrack_entry *res) +{ + enum rdma_restrack_type type = res->type; + struct ib_qp *qp; + + if (type != RDMA_RESTRACK_QP) + /* PD and CQ types already have this name embedded in */ + return; + + qp = container_of(res, struct ib_qp, res); + if (!qp->pd) { + WARN_ONCE(true, "XRC QPs are not supported\n"); + /* Survive, despite the programmer's error */ + res->kern_name = " "; + return; + } + + res->kern_name = qp->pd->res.kern_name; +} + +static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) +{ + enum rdma_restrack_type type = res->type; + struct ib_device *dev; + struct ib_xrcd *xrcd; + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + + switch (type) { + case RDMA_RESTRACK_PD: + pd = container_of(res, struct ib_pd, res); + dev = pd->device; + break; + case RDMA_RESTRACK_CQ: + cq = container_of(res, struct ib_cq, res); + dev = cq->device; + break; + case RDMA_RESTRACK_QP: + qp = container_of(res, struct ib_qp, res); + dev = qp->device; + break; + case RDMA_RESTRACK_XRCD: + xrcd = container_of(res, struct ib_xrcd, res); + dev = xrcd->device; + break; + default: + WARN_ONCE(true, "Wrong resource tracking type %u\n", type); + return NULL; + } + + return dev; +} + +void rdma_restrack_add(struct rdma_restrack_entry *res) +{ + struct ib_device *dev = res_to_dev(res); + + if (!dev) + return; + + if (!uaccess_kernel()) { + get_task_struct(current); + res->task = current; + res->kern_name = NULL; + } else { + set_kern_name(res); + res->task = NULL; + } + + kref_init(&res->kref); + init_completion(&res->comp); + res->valid = true; + + down_write(&dev->res.rwsem); + hash_add(dev->res.hash, &res->node, res->type); + up_write(&dev->res.rwsem); +} +EXPORT_SYMBOL(rdma_restrack_add); + +int __must_check rdma_restrack_get(struct rdma_restrack_entry *res) +{ + return kref_get_unless_zero(&res->kref); +} +EXPORT_SYMBOL(rdma_restrack_get); + +static void restrack_release(struct kref *kref) +{ + struct rdma_restrack_entry *res; + + res = container_of(kref, struct rdma_restrack_entry, kref); + complete(&res->comp); +} + +int rdma_restrack_put(struct rdma_restrack_entry *res) +{ + return kref_put(&res->kref, restrack_release); +} +EXPORT_SYMBOL(rdma_restrack_put); + +void rdma_restrack_del(struct rdma_restrack_entry *res) +{ + struct ib_device *dev; + + if (!res->valid) + return; + + dev = res_to_dev(res); + if (!dev) + return; + + rdma_restrack_put(res); + + wait_for_completion(&res->comp); + + down_write(&dev->res.rwsem); + hash_del(&res->node); + res->valid = false; + if (res->task) + put_task_struct(res->task); + up_write(&dev->res.rwsem); +} +EXPORT_SYMBOL(rdma_restrack_del); diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 90e3889b7fbe..5a52ec77940a 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -410,15 +410,18 @@ static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, rtnl_unlock(); } -/* This function will rescan all of the network devices in the system - * and add their gids, as needed, to the relevant RoCE devices. */ -int roce_rescan_device(struct ib_device *ib_dev) +/** + * rdma_roce_rescan_device - Rescan all of the network devices in the system + * and add their gids, as needed, to the relevant RoCE devices. + * + * @device: the rdma device + */ +void rdma_roce_rescan_device(struct ib_device *ib_dev) { ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, enum_all_gids_of_dev_cb, NULL); - - return 0; } +EXPORT_SYMBOL(rdma_roce_rescan_device); static void callback_for_addr_gid_device_scan(struct ib_device *device, u8 port, diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index ab5e1024fea9..8cf15d4a8ac4 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1227,9 +1227,9 @@ static u8 get_src_path_mask(struct ib_device *device, u8 port_num) return src_path_mask; } -int ib_init_ah_from_path(struct ib_device *device, u8 port_num, - struct sa_path_rec *rec, - struct rdma_ah_attr *ah_attr) +int ib_init_ah_attr_from_path(struct ib_device *device, u8 port_num, + struct sa_path_rec *rec, + struct rdma_ah_attr *ah_attr) { int ret; u16 gid_index; @@ -1341,10 +1341,11 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, return 0; } -EXPORT_SYMBOL(ib_init_ah_from_path); +EXPORT_SYMBOL(ib_init_ah_attr_from_path); static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask) { + struct rdma_ah_attr ah_attr; unsigned long flags; spin_lock_irqsave(&query->port->ah_lock, flags); @@ -1356,6 +1357,15 @@ static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask) query->sm_ah = query->port->sm_ah; spin_unlock_irqrestore(&query->port->ah_lock, flags); + /* + * Always check if sm_ah has valid dlid assigned, + * before querying for class port info + */ + if ((rdma_query_ah(query->sm_ah->ah, &ah_attr) < 0) || + !rdma_is_valid_unicast_lid(&ah_attr)) { + kref_put(&query->sm_ah->ref, free_sm_ah); + return -EAGAIN; + } query->mad_buf = ib_create_send_mad(query->port->agent, 1, query->sm_ah->pkey_index, 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA, diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 59b2f96d986a..b61dda6b04fc 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -653,12 +653,11 @@ int ib_security_modify_qp(struct ib_qp *qp, } return ret; } -EXPORT_SYMBOL(ib_security_modify_qp); -int ib_security_pkey_access(struct ib_device *dev, - u8 port_num, - u16 pkey_index, - void *sec) +static int ib_security_pkey_access(struct ib_device *dev, + u8 port_num, + u16 pkey_index, + void *sec) { u64 subnet_prefix; u16 pkey; @@ -678,7 +677,6 @@ int ib_security_pkey_access(struct ib_device *dev, return security_ib_pkey_access(sec, subnet_prefix, pkey); } -EXPORT_SYMBOL(ib_security_pkey_access); static int ib_mad_agent_security_change(struct notifier_block *nb, unsigned long event, diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index e30d86fa1855..8ae1308eecc7 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1276,7 +1276,6 @@ int ib_device_register_sysfs(struct ib_device *device, int ret; int i; - WARN_ON_ONCE(!device->dev.parent); ret = dev_set_name(class_dev, "%s", device->name); if (ret) return ret; diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index f7adae0adc19..8ae636bb09e5 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -53,6 +53,8 @@ #include <rdma/ib_user_cm.h> #include <rdma/ib_marshall.h> +#include "core_priv.h" + MODULE_AUTHOR("Libor Michalek"); MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access"); MODULE_LICENSE("Dual BSD/GPL"); @@ -104,10 +106,13 @@ struct ib_ucm_event { enum { IB_UCM_MAJOR = 231, IB_UCM_BASE_MINOR = 224, - IB_UCM_MAX_DEVICES = 32 + IB_UCM_MAX_DEVICES = RDMA_MAX_PORTS, + IB_UCM_NUM_FIXED_MINOR = 32, + IB_UCM_NUM_DYNAMIC_MINOR = IB_UCM_MAX_DEVICES - IB_UCM_NUM_FIXED_MINOR, }; #define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR) +static dev_t dynamic_ucm_dev; static void ib_ucm_add_one(struct ib_device *device); static void ib_ucm_remove_one(struct ib_device *device, void *client_data); @@ -1199,7 +1204,6 @@ static int ib_ucm_close(struct inode *inode, struct file *filp) return 0; } -static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES); static void ib_ucm_release_dev(struct device *dev) { struct ib_ucm_device *ucm_dev; @@ -1210,10 +1214,7 @@ static void ib_ucm_release_dev(struct device *dev) static void ib_ucm_free_dev(struct ib_ucm_device *ucm_dev) { - if (ucm_dev->devnum < IB_UCM_MAX_DEVICES) - clear_bit(ucm_dev->devnum, dev_map); - else - clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, overflow_map); + clear_bit(ucm_dev->devnum, dev_map); } static const struct file_operations ucm_fops = { @@ -1235,27 +1236,6 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); -static dev_t overflow_maj; -static int find_overflow_devnum(void) -{ - int ret; - - if (!overflow_maj) { - ret = alloc_chrdev_region(&overflow_maj, 0, IB_UCM_MAX_DEVICES, - "infiniband_cm"); - if (ret) { - pr_err("ucm: couldn't register dynamic device number\n"); - return ret; - } - } - - ret = find_first_zero_bit(overflow_map, IB_UCM_MAX_DEVICES); - if (ret >= IB_UCM_MAX_DEVICES) - return -1; - - return ret; -} - static void ib_ucm_add_one(struct ib_device *device) { int devnum; @@ -1274,19 +1254,14 @@ static void ib_ucm_add_one(struct ib_device *device) ucm_dev->dev.release = ib_ucm_release_dev; devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES); - if (devnum >= IB_UCM_MAX_DEVICES) { - devnum = find_overflow_devnum(); - if (devnum < 0) - goto err; - - ucm_dev->devnum = devnum + IB_UCM_MAX_DEVICES; - base = devnum + overflow_maj; - set_bit(devnum, overflow_map); - } else { - ucm_dev->devnum = devnum; - base = devnum + IB_UCM_BASE_DEV; - set_bit(devnum, dev_map); - } + if (devnum >= IB_UCM_MAX_DEVICES) + goto err; + ucm_dev->devnum = devnum; + set_bit(devnum, dev_map); + if (devnum >= IB_UCM_NUM_FIXED_MINOR) + base = dynamic_ucm_dev + devnum - IB_UCM_NUM_FIXED_MINOR; + else + base = IB_UCM_BASE_DEV + devnum; cdev_init(&ucm_dev->cdev, &ucm_fops); ucm_dev->cdev.owner = THIS_MODULE; @@ -1334,13 +1309,20 @@ static int __init ib_ucm_init(void) { int ret; - ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES, + ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR, "infiniband_cm"); if (ret) { pr_err("ucm: couldn't register device number\n"); goto error1; } + ret = alloc_chrdev_region(&dynamic_ucm_dev, 0, IB_UCM_NUM_DYNAMIC_MINOR, + "infiniband_cm"); + if (ret) { + pr_err("ucm: couldn't register dynamic device number\n"); + goto err_alloc; + } + ret = class_create_file(&cm_class, &class_attr_abi_version.attr); if (ret) { pr_err("ucm: couldn't create abi_version attribute\n"); @@ -1357,7 +1339,9 @@ static int __init ib_ucm_init(void) error3: class_remove_file(&cm_class, &class_attr_abi_version.attr); error2: - unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES); + unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR); +err_alloc: + unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR); error1: return ret; } @@ -1366,9 +1350,8 @@ static void __exit ib_ucm_cleanup(void) { ib_unregister_client(&ucm_client); class_remove_file(&cm_class, &class_attr_abi_version.attr); - unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES); - if (overflow_maj) - unregister_chrdev_region(overflow_maj, IB_UCM_MAX_DEVICES); + unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR); + unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR); idr_destroy(&ctx_id_table); } diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index e4be89d1f3d8..6ba4231f2b07 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -904,13 +904,14 @@ static ssize_t ucma_query_path(struct ucma_context *ctx, resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY | IB_PATH_BIDIRECTIONAL; - if (rec->rec_type == SA_PATH_REC_TYPE_IB) { - ib_sa_pack_path(rec, &resp->path_data[i].path_rec); - } else { + if (rec->rec_type == SA_PATH_REC_TYPE_OPA) { struct sa_path_rec ib; sa_convert_path_opa_to_ib(&ib, rec); ib_sa_pack_path(&ib, &resp->path_data[i].path_rec); + + } else { + ib_sa_pack_path(rec, &resp->path_data[i].path_rec); } } @@ -943,8 +944,8 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, } else { addr->sib_family = AF_IB; addr->sib_pkey = (__force __be16) resp.pkey; - rdma_addr_get_sgid(&ctx->cm_id->route.addr.dev_addr, - (union ib_gid *) &addr->sib_addr); + rdma_read_gids(ctx->cm_id, (union ib_gid *)&addr->sib_addr, + NULL); addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) &ctx->cm_id->route.addr.src_addr); } @@ -956,8 +957,8 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, } else { addr->sib_family = AF_IB; addr->sib_pkey = (__force __be16) resp.pkey; - rdma_addr_get_dgid(&ctx->cm_id->route.addr.dev_addr, - (union ib_gid *) &addr->sib_addr); + rdma_read_gids(ctx->cm_id, NULL, + (union ib_gid *)&addr->sib_addr); addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr); } @@ -1231,9 +1232,9 @@ static int ucma_set_ib_path(struct ucma_context *ctx, struct sa_path_rec opa; sa_convert_path_ib_to_opa(&opa, &sa_path); - ret = rdma_set_ib_paths(ctx->cm_id, &opa, 1); + ret = rdma_set_ib_path(ctx->cm_id, &opa); } else { - ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1); + ret = rdma_set_ib_path(ctx->cm_id, &sa_path); } if (ret) return ret; diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 130606c3b07c..9a4e899d94b3 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -352,7 +352,7 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, return -EINVAL; } - ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length, + ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length, offset + ib_umem_offset(umem)); if (ret < 0) diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 0c32d10f23ff..78c77962422e 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -55,16 +55,21 @@ #include <rdma/ib_mad.h> #include <rdma/ib_user_mad.h> +#include "core_priv.h" + MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand userspace MAD packet access"); MODULE_LICENSE("Dual BSD/GPL"); enum { - IB_UMAD_MAX_PORTS = 64, + IB_UMAD_MAX_PORTS = RDMA_MAX_PORTS, IB_UMAD_MAX_AGENTS = 32, IB_UMAD_MAJOR = 231, - IB_UMAD_MINOR_BASE = 0 + IB_UMAD_MINOR_BASE = 0, + IB_UMAD_NUM_FIXED_MINOR = 64, + IB_UMAD_NUM_DYNAMIC_MINOR = IB_UMAD_MAX_PORTS - IB_UMAD_NUM_FIXED_MINOR, + IB_ISSM_MINOR_BASE = IB_UMAD_NUM_FIXED_MINOR, }; /* @@ -127,9 +132,12 @@ struct ib_umad_packet { static struct class *umad_class; -static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE); +static const dev_t base_umad_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE); +static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) + + IB_UMAD_NUM_FIXED_MINOR; +static dev_t dynamic_umad_dev; +static dev_t dynamic_issm_dev; -static DEFINE_SPINLOCK(port_lock); static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); static void ib_umad_add_one(struct ib_device *device); @@ -233,8 +241,7 @@ static void recv_handler(struct ib_mad_agent *agent, * On OPA devices it is okay to lose the upper 16 bits of LID as this * information is obtained elsewhere. Mask off the upper 16 bits. */ - if (agent->device->port_immutable[agent->port_num].core_cap_flags & - RDMA_CORE_PORT_INTEL_OPA) + if (rdma_cap_opa_mad(agent->device, agent->port_num)) packet->mad.hdr.lid = ib_lid_be16(0xFFFF & mad_recv_wc->wc->slid); else @@ -246,10 +253,14 @@ static void recv_handler(struct ib_mad_agent *agent, if (packet->mad.hdr.grh_present) { struct rdma_ah_attr ah_attr; const struct ib_global_route *grh; + int ret; - ib_init_ah_from_wc(agent->device, agent->port_num, - mad_recv_wc->wc, mad_recv_wc->recv_buf.grh, - &ah_attr); + ret = ib_init_ah_attr_from_wc(agent->device, agent->port_num, + mad_recv_wc->wc, + mad_recv_wc->recv_buf.grh, + &ah_attr); + if (ret) + goto err2; grh = rdma_ah_read_grh(&ah_attr); packet->mad.hdr.gid_index = grh->sgid_index; @@ -500,7 +511,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, } memset(&ah_attr, 0, sizeof ah_attr); - ah_attr.type = rdma_ah_find_type(file->port->ib_dev, + ah_attr.type = rdma_ah_find_type(agent->device, file->port->port_num); rdma_ah_set_dlid(&ah_attr, be16_to_cpu(packet->mad.hdr.lid)); rdma_ah_set_sl(&ah_attr, packet->mad.hdr.sl); @@ -1139,54 +1150,26 @@ static DEVICE_ATTR(port, S_IRUGO, show_port, NULL); static CLASS_ATTR_STRING(abi_version, S_IRUGO, __stringify(IB_USER_MAD_ABI_VERSION)); -static dev_t overflow_maj; -static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS); -static int find_overflow_devnum(struct ib_device *device) -{ - int ret; - - if (!overflow_maj) { - ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2, - "infiniband_mad"); - if (ret) { - dev_err(&device->dev, - "couldn't register dynamic device number\n"); - return ret; - } - } - - ret = find_first_zero_bit(overflow_map, IB_UMAD_MAX_PORTS); - if (ret >= IB_UMAD_MAX_PORTS) - return -1; - - return ret; -} - static int ib_umad_init_port(struct ib_device *device, int port_num, struct ib_umad_device *umad_dev, struct ib_umad_port *port) { int devnum; - dev_t base; + dev_t base_umad; + dev_t base_issm; - spin_lock(&port_lock); devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); - if (devnum >= IB_UMAD_MAX_PORTS) { - spin_unlock(&port_lock); - devnum = find_overflow_devnum(device); - if (devnum < 0) - return -1; - - spin_lock(&port_lock); - port->dev_num = devnum + IB_UMAD_MAX_PORTS; - base = devnum + overflow_maj; - set_bit(devnum, overflow_map); + if (devnum >= IB_UMAD_MAX_PORTS) + return -1; + port->dev_num = devnum; + set_bit(devnum, dev_map); + if (devnum >= IB_UMAD_NUM_FIXED_MINOR) { + base_umad = dynamic_umad_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; + base_issm = dynamic_issm_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; } else { - port->dev_num = devnum; - base = devnum + base_dev; - set_bit(devnum, dev_map); + base_umad = devnum + base_umad_dev; + base_issm = devnum + base_issm_dev; } - spin_unlock(&port_lock); port->ib_dev = device; port->port_num = port_num; @@ -1198,7 +1181,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, port->cdev.owner = THIS_MODULE; cdev_set_parent(&port->cdev, &umad_dev->kobj); kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num); - if (cdev_add(&port->cdev, base, 1)) + if (cdev_add(&port->cdev, base_umad, 1)) goto err_cdev; port->dev = device_create(umad_class, device->dev.parent, @@ -1212,12 +1195,11 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, if (device_create_file(port->dev, &dev_attr_port)) goto err_dev; - base += IB_UMAD_MAX_PORTS; cdev_init(&port->sm_cdev, &umad_sm_fops); port->sm_cdev.owner = THIS_MODULE; cdev_set_parent(&port->sm_cdev, &umad_dev->kobj); kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num); - if (cdev_add(&port->sm_cdev, base, 1)) + if (cdev_add(&port->sm_cdev, base_issm, 1)) goto err_sm_cdev; port->sm_dev = device_create(umad_class, device->dev.parent, @@ -1244,10 +1226,7 @@ err_dev: err_cdev: cdev_del(&port->cdev); - if (port->dev_num < IB_UMAD_MAX_PORTS) - clear_bit(devnum, dev_map); - else - clear_bit(devnum, overflow_map); + clear_bit(devnum, dev_map); return -1; } @@ -1281,11 +1260,7 @@ static void ib_umad_kill_port(struct ib_umad_port *port) } mutex_unlock(&port->file_mutex); - - if (port->dev_num < IB_UMAD_MAX_PORTS) - clear_bit(port->dev_num, dev_map); - else - clear_bit(port->dev_num - IB_UMAD_MAX_PORTS, overflow_map); + clear_bit(port->dev_num, dev_map); } static void ib_umad_add_one(struct ib_device *device) @@ -1361,13 +1336,23 @@ static int __init ib_umad_init(void) { int ret; - ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2, + ret = register_chrdev_region(base_umad_dev, + IB_UMAD_NUM_FIXED_MINOR * 2, "infiniband_mad"); if (ret) { pr_err("couldn't register device number\n"); goto out; } + ret = alloc_chrdev_region(&dynamic_umad_dev, 0, + IB_UMAD_NUM_DYNAMIC_MINOR * 2, + "infiniband_mad"); + if (ret) { + pr_err("couldn't register dynamic device number\n"); + goto out_alloc; + } + dynamic_issm_dev = dynamic_umad_dev + IB_UMAD_NUM_DYNAMIC_MINOR; + umad_class = class_create(THIS_MODULE, "infiniband_mad"); if (IS_ERR(umad_class)) { ret = PTR_ERR(umad_class); @@ -1395,7 +1380,12 @@ out_class: class_destroy(umad_class); out_chrdev: - unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2); + unregister_chrdev_region(dynamic_umad_dev, + IB_UMAD_NUM_DYNAMIC_MINOR * 2); + +out_alloc: + unregister_chrdev_region(base_umad_dev, + IB_UMAD_NUM_FIXED_MINOR * 2); out: return ret; @@ -1405,9 +1395,10 @@ static void __exit ib_umad_cleanup(void) { ib_unregister_client(&umad_client); class_destroy(umad_class); - unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2); - if (overflow_maj) - unregister_chrdev_region(overflow_maj, IB_UMAD_MAX_PORTS * 2); + unregister_chrdev_region(base_umad_dev, + IB_UMAD_NUM_FIXED_MINOR * 2); + unregister_chrdev_region(dynamic_umad_dev, + IB_UMAD_NUM_DYNAMIC_MINOR * 2); } module_init(ib_umad_init); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 840b24096690..256934d1f64f 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -340,6 +340,8 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, uobj->object = pd; memset(&resp, 0, sizeof resp); resp.pd_handle = uobj->id; + pd->res.type = RDMA_RESTRACK_PD; + rdma_restrack_add(&pd->res); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) { ret = -EFAULT; @@ -1033,6 +1035,8 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, goto err_cb; uobj_alloc_commit(&obj->uobject); + cq->res.type = RDMA_RESTRACK_CQ; + rdma_restrack_add(&cq->res); return obj; @@ -1145,10 +1149,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file, min(ucore->inlen, sizeof(cmd)), ib_uverbs_ex_create_cq_cb, NULL); - if (IS_ERR(obj)) - return PTR_ERR(obj); - - return 0; + return PTR_ERR_OR_ZERO(obj); } ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, @@ -1199,7 +1200,7 @@ static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest, tmp.opcode = wc->opcode; tmp.vendor_err = wc->vendor_err; tmp.byte_len = wc->byte_len; - tmp.ex.imm_data = (__u32 __force) wc->ex.imm_data; + tmp.ex.imm_data = wc->ex.imm_data; tmp.qp_num = wc->qp->qp_num; tmp.src_qp = wc->src_qp; tmp.wc_flags = wc->wc_flags; @@ -1517,7 +1518,7 @@ static int create_qp(struct ib_uverbs_file *file, if (cmd->qp_type == IB_QPT_XRC_TGT) qp = ib_create_qp(pd, &attr); else - qp = device->create_qp(pd, &attr, uhw); + qp = _ib_create_qp(device, pd, &attr, uhw); if (IS_ERR(qp)) { ret = PTR_ERR(qp); @@ -1530,7 +1531,6 @@ static int create_qp(struct ib_uverbs_file *file, goto err_cb; qp->real_qp = qp; - qp->device = device; qp->pd = pd; qp->send_cq = attr.send_cq; qp->recv_cq = attr.recv_cq; diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 71ff2644e053..d96dc1d17be1 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -243,16 +243,13 @@ static long ib_uverbs_cmd_verbs(struct ib_device *ib_dev, size_t ctx_size; uintptr_t data[UVERBS_OPTIMIZE_USING_STACK_SZ / sizeof(uintptr_t)]; - if (hdr->reserved) - return -EINVAL; - object_spec = uverbs_get_object(ib_dev, hdr->object_id); if (!object_spec) - return -EOPNOTSUPP; + return -EPROTONOSUPPORT; method_spec = uverbs_get_method(object_spec, hdr->method_id); if (!method_spec) - return -EOPNOTSUPP; + return -EPROTONOSUPPORT; if ((method_spec->flags & UVERBS_ACTION_FLAG_CREATE_ROOT) ^ !file->ucontext) return -EINVAL; @@ -305,6 +302,16 @@ static long ib_uverbs_cmd_verbs(struct ib_device *ib_dev, err = uverbs_handle_method(buf, ctx->uattrs, hdr->num_attrs, ib_dev, file, method_spec, ctx->uverbs_attr_bundle); + + /* + * EPROTONOSUPPORT is ONLY to be returned if the ioctl framework can + * not invoke the method because the request is not supported. No + * other cases should return this code. + */ + if (unlikely(err == -EPROTONOSUPPORT)) { + WARN_ON_ONCE(err == -EPROTONOSUPPORT); + err = -EINVAL; + } out: if (ctx != (void *)data) kfree(ctx); @@ -341,7 +348,7 @@ long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } if (hdr.reserved) { - err = -EOPNOTSUPP; + err = -EPROTONOSUPPORT; goto out; } diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 5f216ffb465a..5b811bf574d6 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -62,14 +62,16 @@ MODULE_LICENSE("Dual BSD/GPL"); enum { IB_UVERBS_MAJOR = 231, IB_UVERBS_BASE_MINOR = 192, - IB_UVERBS_MAX_DEVICES = 32 + IB_UVERBS_MAX_DEVICES = RDMA_MAX_PORTS, + IB_UVERBS_NUM_FIXED_MINOR = 32, + IB_UVERBS_NUM_DYNAMIC_MINOR = IB_UVERBS_MAX_DEVICES - IB_UVERBS_NUM_FIXED_MINOR, }; #define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR) +static dev_t dynamic_uverbs_dev; static struct class *uverbs_class; -static DEFINE_SPINLOCK(map_lock); static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, @@ -1005,34 +1007,6 @@ static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL); static CLASS_ATTR_STRING(abi_version, S_IRUGO, __stringify(IB_USER_VERBS_ABI_VERSION)); -static dev_t overflow_maj; -static DECLARE_BITMAP(overflow_map, IB_UVERBS_MAX_DEVICES); - -/* - * If we have more than IB_UVERBS_MAX_DEVICES, dynamically overflow by - * requesting a new major number and doubling the number of max devices we - * support. It's stupid, but simple. - */ -static int find_overflow_devnum(void) -{ - int ret; - - if (!overflow_maj) { - ret = alloc_chrdev_region(&overflow_maj, 0, IB_UVERBS_MAX_DEVICES, - "infiniband_verbs"); - if (ret) { - pr_err("user_verbs: couldn't register dynamic device number\n"); - return ret; - } - } - - ret = find_first_zero_bit(overflow_map, IB_UVERBS_MAX_DEVICES); - if (ret >= IB_UVERBS_MAX_DEVICES) - return -1; - - return ret; -} - static void ib_uverbs_add_one(struct ib_device *device) { int devnum; @@ -1062,24 +1036,15 @@ static void ib_uverbs_add_one(struct ib_device *device) INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list); INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list); - spin_lock(&map_lock); devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); - if (devnum >= IB_UVERBS_MAX_DEVICES) { - spin_unlock(&map_lock); - devnum = find_overflow_devnum(); - if (devnum < 0) - goto err; - - spin_lock(&map_lock); - uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES; - base = devnum + overflow_maj; - set_bit(devnum, overflow_map); - } else { - uverbs_dev->devnum = devnum; - base = devnum + IB_UVERBS_BASE_DEV; - set_bit(devnum, dev_map); - } - spin_unlock(&map_lock); + if (devnum >= IB_UVERBS_MAX_DEVICES) + goto err; + uverbs_dev->devnum = devnum; + set_bit(devnum, dev_map); + if (devnum >= IB_UVERBS_NUM_FIXED_MINOR) + base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR; + else + base = IB_UVERBS_BASE_DEV + devnum; rcu_assign_pointer(uverbs_dev->ib_dev, device); uverbs_dev->num_comp_vectors = device->num_comp_vectors; @@ -1124,10 +1089,7 @@ err_class: err_cdev: cdev_del(&uverbs_dev->cdev); - if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES) - clear_bit(devnum, dev_map); - else - clear_bit(devnum, overflow_map); + clear_bit(devnum, dev_map); err: if (atomic_dec_and_test(&uverbs_dev->refcount)) @@ -1219,11 +1181,7 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) dev_set_drvdata(uverbs_dev->dev, NULL); device_destroy(uverbs_class, uverbs_dev->cdev.dev); cdev_del(&uverbs_dev->cdev); - - if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES) - clear_bit(uverbs_dev->devnum, dev_map); - else - clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map); + clear_bit(uverbs_dev->devnum, dev_map); if (device->disassociate_ucontext) { /* We disassociate HW resources and immediately return. @@ -1265,13 +1223,22 @@ static int __init ib_uverbs_init(void) { int ret; - ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES, + ret = register_chrdev_region(IB_UVERBS_BASE_DEV, + IB_UVERBS_NUM_FIXED_MINOR, "infiniband_verbs"); if (ret) { pr_err("user_verbs: couldn't register device number\n"); goto out; } + ret = alloc_chrdev_region(&dynamic_uverbs_dev, 0, + IB_UVERBS_NUM_DYNAMIC_MINOR, + "infiniband_verbs"); + if (ret) { + pr_err("couldn't register dynamic device number\n"); + goto out_alloc; + } + uverbs_class = class_create(THIS_MODULE, "infiniband_verbs"); if (IS_ERR(uverbs_class)) { ret = PTR_ERR(uverbs_class); @@ -1299,7 +1266,12 @@ out_class: class_destroy(uverbs_class); out_chrdev: - unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES); + unregister_chrdev_region(dynamic_uverbs_dev, + IB_UVERBS_NUM_DYNAMIC_MINOR); + +out_alloc: + unregister_chrdev_region(IB_UVERBS_BASE_DEV, + IB_UVERBS_NUM_FIXED_MINOR); out: return ret; @@ -1309,9 +1281,10 @@ static void __exit ib_uverbs_cleanup(void) { ib_unregister_client(&uverbs_client); class_destroy(uverbs_class); - unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES); - if (overflow_maj) - unregister_chrdev_region(overflow_maj, IB_UVERBS_MAX_DEVICES); + unregister_chrdev_region(IB_UVERBS_BASE_DEV, + IB_UVERBS_NUM_FIXED_MINOR); + unregister_chrdev_region(dynamic_uverbs_dev, + IB_UVERBS_NUM_DYNAMIC_MINOR); } module_init(ib_uverbs_init); diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index c3ee5d9b336d..b571176babbe 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -35,6 +35,7 @@ #include <rdma/ib_verbs.h> #include <linux/bug.h> #include <linux/file.h> +#include <rdma/restrack.h> #include "rdma_core.h" #include "uverbs.h" @@ -319,6 +320,8 @@ static int uverbs_create_cq_handler(struct ib_device *ib_dev, obj->uobject.object = cq; obj->uobject.user_handle = user_handle; atomic_set(&cq->usecnt, 0); + cq->res.type = RDMA_RESTRACK_CQ; + rdma_restrack_add(&cq->res); ret = uverbs_copy_to(attrs, CREATE_CQ_RESP_CQE, &cq->cqe); if (ret) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index e36d27ed4daa..16ebc6372c31 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -124,16 +124,24 @@ EXPORT_SYMBOL(ib_wc_status_msg); __attribute_const__ int ib_rate_to_mult(enum ib_rate rate) { switch (rate) { - case IB_RATE_2_5_GBPS: return 1; - case IB_RATE_5_GBPS: return 2; - case IB_RATE_10_GBPS: return 4; - case IB_RATE_20_GBPS: return 8; - case IB_RATE_30_GBPS: return 12; - case IB_RATE_40_GBPS: return 16; - case IB_RATE_60_GBPS: return 24; - case IB_RATE_80_GBPS: return 32; - case IB_RATE_120_GBPS: return 48; - default: return -1; + case IB_RATE_2_5_GBPS: return 1; + case IB_RATE_5_GBPS: return 2; + case IB_RATE_10_GBPS: return 4; + case IB_RATE_20_GBPS: return 8; + case IB_RATE_30_GBPS: return 12; + case IB_RATE_40_GBPS: return 16; + case IB_RATE_60_GBPS: return 24; + case IB_RATE_80_GBPS: return 32; + case IB_RATE_120_GBPS: return 48; + case IB_RATE_14_GBPS: return 6; + case IB_RATE_56_GBPS: return 22; + case IB_RATE_112_GBPS: return 45; + case IB_RATE_168_GBPS: return 67; + case IB_RATE_25_GBPS: return 10; + case IB_RATE_100_GBPS: return 40; + case IB_RATE_200_GBPS: return 80; + case IB_RATE_300_GBPS: return 120; + default: return -1; } } EXPORT_SYMBOL(ib_rate_to_mult); @@ -141,16 +149,24 @@ EXPORT_SYMBOL(ib_rate_to_mult); __attribute_const__ enum ib_rate mult_to_ib_rate(int mult) { switch (mult) { - case 1: return IB_RATE_2_5_GBPS; - case 2: return IB_RATE_5_GBPS; - case 4: return IB_RATE_10_GBPS; - case 8: return IB_RATE_20_GBPS; - case 12: return IB_RATE_30_GBPS; - case 16: return IB_RATE_40_GBPS; - case 24: return IB_RATE_60_GBPS; - case 32: return IB_RATE_80_GBPS; - case 48: return IB_RATE_120_GBPS; - default: return IB_RATE_PORT_CURRENT; + case 1: return IB_RATE_2_5_GBPS; + case 2: return IB_RATE_5_GBPS; + case 4: return IB_RATE_10_GBPS; + case 8: return IB_RATE_20_GBPS; + case 12: return IB_RATE_30_GBPS; + case 16: return IB_RATE_40_GBPS; + case 24: return IB_RATE_60_GBPS; + case 32: return IB_RATE_80_GBPS; + case 48: return IB_RATE_120_GBPS; + case 6: return IB_RATE_14_GBPS; + case 22: return IB_RATE_56_GBPS; + case 45: return IB_RATE_112_GBPS; + case 67: return IB_RATE_168_GBPS; + case 10: return IB_RATE_25_GBPS; + case 40: return IB_RATE_100_GBPS; + case 80: return IB_RATE_200_GBPS; + case 120: return IB_RATE_300_GBPS; + default: return IB_RATE_PORT_CURRENT; } } EXPORT_SYMBOL(mult_to_ib_rate); @@ -247,6 +263,10 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; } + pd->res.type = RDMA_RESTRACK_PD; + pd->res.kern_name = caller; + rdma_restrack_add(&pd->res); + if (mr_access_flags) { struct ib_mr *mr; @@ -296,6 +316,7 @@ void ib_dealloc_pd(struct ib_pd *pd) requires the caller to guarantee we can't race here. */ WARN_ON(atomic_read(&pd->usecnt)); + rdma_restrack_del(&pd->res); /* Making delalloc_pd a void return is a WIP, no driver should return an error here. */ ret = pd->device->dealloc_pd(pd); @@ -421,8 +442,7 @@ static bool find_gid_index(const union ib_gid *gid, const struct ib_gid_attr *gid_attr, void *context) { - struct find_gid_index_context *ctx = - (struct find_gid_index_context *)context; + struct find_gid_index_context *ctx = context; if (ctx->gid_type != gid_attr->gid_type) return false; @@ -481,8 +501,53 @@ int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, } EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr); +/* Resolve destination mac address and hop limit for unicast destination + * GID entry, considering the source GID entry as well. + * ah_attribute must have have valid port_num, sgid_index. + */ +static int ib_resolve_unicast_gid_dmac(struct ib_device *device, + struct rdma_ah_attr *ah_attr) +{ + struct ib_gid_attr sgid_attr; + struct ib_global_route *grh; + int hop_limit = 0xff; + union ib_gid sgid; + int ret; + + grh = rdma_ah_retrieve_grh(ah_attr); + + ret = ib_query_gid(device, + rdma_ah_get_port_num(ah_attr), + grh->sgid_index, + &sgid, &sgid_attr); + if (ret || !sgid_attr.ndev) { + if (!ret) + ret = -ENXIO; + return ret; + } + + /* If destination is link local and source GID is RoCEv1, + * IP stack is not used. + */ + if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw) && + sgid_attr.gid_type == IB_GID_TYPE_ROCE) { + rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw, + ah_attr->roce.dmac); + goto done; + } + + ret = rdma_addr_find_l2_eth_by_grh(&sgid, &grh->dgid, + ah_attr->roce.dmac, + sgid_attr.ndev, &hop_limit); +done: + dev_put(sgid_attr.ndev); + + grh->hop_limit = hop_limit; + return ret; +} + /* - * This function creates ah from the incoming packet. + * This function initializes address handle attributes from the incoming packet. * Incoming packet has dgid of the receiver node on which this code is * getting executed and, sgid contains the GID of the sender. * @@ -490,13 +555,10 @@ EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr); * as sgid and, sgid is used as dgid because sgid contains destinations * GID whom to respond to. * - * This is why when calling rdma_addr_find_l2_eth_by_grh() function, the - * position of arguments dgid and sgid do not match the order of the - * parameters. */ -int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, - const struct ib_wc *wc, const struct ib_grh *grh, - struct rdma_ah_attr *ah_attr) +int ib_init_ah_attr_from_wc(struct ib_device *device, u8 port_num, + const struct ib_wc *wc, const struct ib_grh *grh, + struct rdma_ah_attr *ah_attr) { u32 flow_class; u16 gid_index; @@ -523,57 +585,33 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, if (ret) return ret; + rdma_ah_set_sl(ah_attr, wc->sl); + rdma_ah_set_port_num(ah_attr, port_num); + if (rdma_protocol_roce(device, port_num)) { - int if_index = 0; u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ? wc->vlan_id : 0xffff; - struct net_device *idev; - struct net_device *resolved_dev; if (!(wc->wc_flags & IB_WC_GRH)) return -EPROTOTYPE; - if (!device->get_netdev) - return -EOPNOTSUPP; - - idev = device->get_netdev(device, port_num); - if (!idev) - return -ENODEV; - - ret = rdma_addr_find_l2_eth_by_grh(&dgid, &sgid, - ah_attr->roce.dmac, - wc->wc_flags & IB_WC_WITH_VLAN ? - NULL : &vlan_id, - &if_index, &hoplimit); - if (ret) { - dev_put(idev); - return ret; - } - - resolved_dev = dev_get_by_index(&init_net, if_index); - rcu_read_lock(); - if (resolved_dev != idev && !rdma_is_upper_dev_rcu(idev, - resolved_dev)) - ret = -EHOSTUNREACH; - rcu_read_unlock(); - dev_put(idev); - dev_put(resolved_dev); + ret = get_sgid_index_from_eth(device, port_num, + vlan_id, &dgid, + gid_type, &gid_index); if (ret) return ret; - ret = get_sgid_index_from_eth(device, port_num, vlan_id, - &dgid, gid_type, &gid_index); - if (ret) - return ret; - } - - rdma_ah_set_dlid(ah_attr, wc->slid); - rdma_ah_set_sl(ah_attr, wc->sl); - rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits); - rdma_ah_set_port_num(ah_attr, port_num); + flow_class = be32_to_cpu(grh->version_tclass_flow); + rdma_ah_set_grh(ah_attr, &sgid, + flow_class & 0xFFFFF, + (u8)gid_index, hoplimit, + (flow_class >> 20) & 0xFF); + return ib_resolve_unicast_gid_dmac(device, ah_attr); + } else { + rdma_ah_set_dlid(ah_attr, wc->slid); + rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits); - if (wc->wc_flags & IB_WC_GRH) { - if (!rdma_cap_eth_ah(device, port_num)) { + if (wc->wc_flags & IB_WC_GRH) { if (dgid.global.interface_id != cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) { ret = ib_find_cached_gid_by_port(device, &dgid, IB_GID_TYPE_IB, @@ -584,18 +622,17 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, } else { gid_index = 0; } - } - - flow_class = be32_to_cpu(grh->version_tclass_flow); - rdma_ah_set_grh(ah_attr, &sgid, - flow_class & 0xFFFFF, - (u8)gid_index, hoplimit, - (flow_class >> 20) & 0xFF); + flow_class = be32_to_cpu(grh->version_tclass_flow); + rdma_ah_set_grh(ah_attr, &sgid, + flow_class & 0xFFFFF, + (u8)gid_index, hoplimit, + (flow_class >> 20) & 0xFF); + } + return 0; } - return 0; } -EXPORT_SYMBOL(ib_init_ah_from_wc); +EXPORT_SYMBOL(ib_init_ah_attr_from_wc); struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, const struct ib_grh *grh, u8 port_num) @@ -603,7 +640,7 @@ struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, struct rdma_ah_attr ah_attr; int ret; - ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr); + ret = ib_init_ah_attr_from_wc(pd->device, port_num, wc, grh, &ah_attr); if (ret) return ERR_PTR(ret); @@ -850,7 +887,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, if (qp_init_attr->cap.max_rdma_ctxs) rdma_rw_init_qp(device, qp_init_attr); - qp = device->create_qp(pd, qp_init_attr, NULL); + qp = _ib_create_qp(device, pd, qp_init_attr, NULL); if (IS_ERR(qp)) return qp; @@ -860,7 +897,6 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, return ERR_PTR(ret); } - qp->device = device; qp->real_qp = qp; qp->uobject = NULL; qp->qp_type = qp_init_attr->qp_type; @@ -890,7 +926,6 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, atomic_inc(&qp_init_attr->srq->usecnt); } - qp->pd = pd; qp->send_cq = qp_init_attr->send_cq; qp->xrcd = NULL; @@ -1269,16 +1304,8 @@ static int ib_resolve_eth_dmac(struct ib_device *device, if (!rdma_is_port_valid(device, rdma_ah_get_port_num(ah_attr))) return -EINVAL; - if (ah_attr->type != RDMA_AH_ATTR_TYPE_ROCE) - return 0; - grh = rdma_ah_retrieve_grh(ah_attr); - if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw)) { - rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw, - ah_attr->roce.dmac); - return 0; - } if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) { if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) { __be32 addr = 0; @@ -1290,40 +1317,52 @@ static int ib_resolve_eth_dmac(struct ib_device *device, (char *)ah_attr->roce.dmac); } } else { - union ib_gid sgid; - struct ib_gid_attr sgid_attr; - int ifindex; - int hop_limit; - - ret = ib_query_gid(device, - rdma_ah_get_port_num(ah_attr), - grh->sgid_index, - &sgid, &sgid_attr); - - if (ret || !sgid_attr.ndev) { - if (!ret) - ret = -ENXIO; - goto out; - } - - ifindex = sgid_attr.ndev->ifindex; + ret = ib_resolve_unicast_gid_dmac(device, ah_attr); + } + return ret; +} - ret = - rdma_addr_find_l2_eth_by_grh(&sgid, &grh->dgid, - ah_attr->roce.dmac, - NULL, &ifindex, &hop_limit); +/** + * IB core internal function to perform QP attributes modification. + */ +static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + u8 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; + int ret; - dev_put(sgid_attr.ndev); + if (rdma_ib_or_roce(qp->device, port)) { + if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) { + pr_warn("%s: %s rq_psn overflow, masking to 24 bits\n", + __func__, qp->device->name); + attr->rq_psn &= 0xffffff; + } - grh->hop_limit = hop_limit; + if (attr_mask & IB_QP_SQ_PSN && attr->sq_psn & ~0xffffff) { + pr_warn("%s: %s sq_psn overflow, masking to 24 bits\n", + __func__, qp->device->name); + attr->sq_psn &= 0xffffff; + } } -out: + + ret = ib_security_modify_qp(qp, attr, attr_mask, udata); + if (!ret && (attr_mask & IB_QP_PORT)) + qp->port = attr->port_num; + return ret; } +static bool is_qp_type_connected(const struct ib_qp *qp) +{ + return (qp->qp_type == IB_QPT_UC || + qp->qp_type == IB_QPT_RC || + qp->qp_type == IB_QPT_XRC_INI || + qp->qp_type == IB_QPT_XRC_TGT); +} + /** * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. - * @qp: The QP to modify. + * @ib_qp: The QP to modify. * @attr: On input, specifies the QP attributes to modify. On output, * the current values of selected QP attributes are returned. * @attr_mask: A bit-mask used to specify which attributes of the QP @@ -1332,21 +1371,20 @@ out: * are being modified. * It returns 0 on success and returns appropriate error code on error. */ -int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr, +int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { + struct ib_qp *qp = ib_qp->real_qp; int ret; - if (attr_mask & IB_QP_AV) { + if (attr_mask & IB_QP_AV && + attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE && + is_qp_type_connected(qp)) { ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); if (ret) return ret; } - ret = ib_security_modify_qp(qp, attr, attr_mask, udata); - if (!ret && (attr_mask & IB_QP_PORT)) - qp->port = attr->port_num; - - return ret; + return _ib_modify_qp(qp, attr, attr_mask, udata); } EXPORT_SYMBOL(ib_modify_qp_with_udata); @@ -1409,7 +1447,7 @@ int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask) { - return ib_modify_qp_with_udata(qp, qp_attr, qp_attr_mask, NULL); + return _ib_modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); } EXPORT_SYMBOL(ib_modify_qp); @@ -1503,6 +1541,7 @@ int ib_destroy_qp(struct ib_qp *qp) if (!qp->uobject) rdma_rw_cleanup_mrs(qp); + rdma_restrack_del(&qp->res); ret = qp->device->destroy_qp(qp); if (!ret) { if (pd) @@ -1545,6 +1584,8 @@ struct ib_cq *ib_create_cq(struct ib_device *device, cq->event_handler = event_handler; cq->cq_context = cq_context; atomic_set(&cq->usecnt, 0); + cq->res.type = RDMA_RESTRACK_CQ; + rdma_restrack_add(&cq->res); } return cq; @@ -1563,6 +1604,7 @@ int ib_destroy_cq(struct ib_cq *cq) if (atomic_read(&cq->usecnt)) return -EBUSY; + rdma_restrack_del(&cq->res); return cq->device->destroy_cq(cq); } EXPORT_SYMBOL(ib_destroy_cq); @@ -1747,7 +1789,7 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) } EXPORT_SYMBOL(ib_detach_mcast); -struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device) +struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller) { struct ib_xrcd *xrcd; @@ -1765,7 +1807,7 @@ struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device) return xrcd; } -EXPORT_SYMBOL(ib_alloc_xrcd); +EXPORT_SYMBOL(__ib_alloc_xrcd); int ib_dealloc_xrcd(struct ib_xrcd *xrcd) { @@ -1790,11 +1832,11 @@ EXPORT_SYMBOL(ib_dealloc_xrcd); * ib_create_wq - Creates a WQ associated with the specified protection * domain. * @pd: The protection domain associated with the WQ. - * @wq_init_attr: A list of initial attributes required to create the + * @wq_attr: A list of initial attributes required to create the * WQ. If WQ creation succeeds, then the attributes are updated to * the actual capabilities of the created WQ. * - * wq_init_attr->max_wr and wq_init_attr->max_sge determine + * wq_attr->max_wr and wq_attr->max_sge determine * the requested size of the WQ, and set to the actual values allocated * on return. * If ib_create_wq() succeeds, then max_wr and max_sge will always be @@ -2156,16 +2198,16 @@ static void __ib_drain_sq(struct ib_qp *qp) struct ib_send_wr swr = {}, *bad_swr; int ret; - swr.wr_cqe = &sdrain.cqe; - sdrain.cqe.done = ib_drain_qp_done; - init_completion(&sdrain.done); - ret = ib_modify_qp(qp, &attr, IB_QP_STATE); if (ret) { WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); return; } + swr.wr_cqe = &sdrain.cqe; + sdrain.cqe.done = ib_drain_qp_done; + init_completion(&sdrain.done); + ret = ib_post_send(qp, &swr, &bad_swr); if (ret) { WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); @@ -2190,16 +2232,16 @@ static void __ib_drain_rq(struct ib_qp *qp) struct ib_recv_wr rwr = {}, *bad_rwr; int ret; - rwr.wr_cqe = &rdrain.cqe; - rdrain.cqe.done = ib_drain_qp_done; - init_completion(&rdrain.done); - ret = ib_modify_qp(qp, &attr, IB_QP_STATE); if (ret) { WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); return; } + rwr.wr_cqe = &rdrain.cqe; + rdrain.cqe.done = ib_drain_qp_done; + init_completion(&rdrain.done); + ret = ib_post_recv(qp, &rwr, &bad_rwr); if (ret) { WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index ecbac91b2e14..ca32057e886f 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -43,20 +43,41 @@ #define ROCE_DRV_MODULE_VERSION "1.0.0" #define BNXT_RE_DESC "Broadcom NetXtreme-C/E RoCE Driver" - -#define BNXT_RE_PAGE_SIZE_4K BIT(12) -#define BNXT_RE_PAGE_SIZE_8K BIT(13) -#define BNXT_RE_PAGE_SIZE_64K BIT(16) -#define BNXT_RE_PAGE_SIZE_2M BIT(21) -#define BNXT_RE_PAGE_SIZE_8M BIT(23) -#define BNXT_RE_PAGE_SIZE_1G BIT(30) - -#define BNXT_RE_MAX_MR_SIZE BIT(30) +#define BNXT_RE_PAGE_SHIFT_4K (12) +#define BNXT_RE_PAGE_SHIFT_8K (13) +#define BNXT_RE_PAGE_SHIFT_64K (16) +#define BNXT_RE_PAGE_SHIFT_2M (21) +#define BNXT_RE_PAGE_SHIFT_8M (23) +#define BNXT_RE_PAGE_SHIFT_1G (30) + +#define BNXT_RE_PAGE_SIZE_4K BIT(BNXT_RE_PAGE_SHIFT_4K) +#define BNXT_RE_PAGE_SIZE_8K BIT(BNXT_RE_PAGE_SHIFT_8K) +#define BNXT_RE_PAGE_SIZE_64K BIT(BNXT_RE_PAGE_SHIFT_64K) +#define BNXT_RE_PAGE_SIZE_2M BIT(BNXT_RE_PAGE_SHIFT_2M) +#define BNXT_RE_PAGE_SIZE_8M BIT(BNXT_RE_PAGE_SHIFT_8M) +#define BNXT_RE_PAGE_SIZE_1G BIT(BNXT_RE_PAGE_SHIFT_1G) + +#define BNXT_RE_MAX_MR_SIZE_LOW BIT(BNXT_RE_PAGE_SHIFT_1G) +#define BNXT_RE_MAX_MR_SIZE_HIGH BIT(39) +#define BNXT_RE_MAX_MR_SIZE BNXT_RE_MAX_MR_SIZE_HIGH #define BNXT_RE_MAX_QPC_COUNT (64 * 1024) #define BNXT_RE_MAX_MRW_COUNT (64 * 1024) #define BNXT_RE_MAX_SRQC_COUNT (64 * 1024) #define BNXT_RE_MAX_CQ_COUNT (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT_64K (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT_256K (256 * 1024) + +/* Number of MRs to reserve for PF, leaving remainder for VFs */ +#define BNXT_RE_RESVD_MR_FOR_PF (32 * 1024) +#define BNXT_RE_MAX_GID_PER_VF 128 + +/* + * Percentage of resources of each type reserved for PF. + * Remaining resources are divided equally among VFs. + * [0, 100] + */ +#define BNXT_RE_PCT_RSVD_FOR_PF 50 #define BNXT_RE_UD_QP_HW_STALL 0x400000 @@ -100,6 +121,7 @@ struct bnxt_re_dev { #define BNXT_RE_FLAG_RCFW_CHANNEL_EN 4 #define BNXT_RE_FLAG_QOS_WORK_REG 5 #define BNXT_RE_FLAG_TASK_IN_PROG 6 +#define BNXT_RE_FLAG_ISSUE_ROCE_STATS 29 struct net_device *netdev; unsigned int version, major, minor; struct bnxt_en_dev *en_dev; @@ -145,6 +167,9 @@ struct bnxt_re_dev { struct bnxt_re_ah *sqp_ah; struct bnxt_re_sqp_entries sqp_tbl[1024]; atomic_t nq_alloc_cnt; + u32 is_virtfn; + u32 num_vfs; + struct bnxt_qplib_roce_stats stats; }; #define to_bnxt_re_dev(ptr, member) \ diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c index 7b28219eba46..77416bc61e6e 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.c +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c @@ -58,16 +58,55 @@ #include "hw_counters.h" static const char * const bnxt_re_stat_name[] = { - [BNXT_RE_ACTIVE_QP] = "active_qps", - [BNXT_RE_ACTIVE_SRQ] = "active_srqs", - [BNXT_RE_ACTIVE_CQ] = "active_cqs", - [BNXT_RE_ACTIVE_MR] = "active_mrs", - [BNXT_RE_ACTIVE_MW] = "active_mws", - [BNXT_RE_RX_PKTS] = "rx_pkts", - [BNXT_RE_RX_BYTES] = "rx_bytes", - [BNXT_RE_TX_PKTS] = "tx_pkts", - [BNXT_RE_TX_BYTES] = "tx_bytes", - [BNXT_RE_RECOVERABLE_ERRORS] = "recoverable_errors" + [BNXT_RE_ACTIVE_QP] = "active_qps", + [BNXT_RE_ACTIVE_SRQ] = "active_srqs", + [BNXT_RE_ACTIVE_CQ] = "active_cqs", + [BNXT_RE_ACTIVE_MR] = "active_mrs", + [BNXT_RE_ACTIVE_MW] = "active_mws", + [BNXT_RE_RX_PKTS] = "rx_pkts", + [BNXT_RE_RX_BYTES] = "rx_bytes", + [BNXT_RE_TX_PKTS] = "tx_pkts", + [BNXT_RE_TX_BYTES] = "tx_bytes", + [BNXT_RE_RECOVERABLE_ERRORS] = "recoverable_errors", + [BNXT_RE_TO_RETRANSMITS] = "to_retransmits", + [BNXT_RE_SEQ_ERR_NAKS_RCVD] = "seq_err_naks_rcvd", + [BNXT_RE_MAX_RETRY_EXCEEDED] = "max_retry_exceeded", + [BNXT_RE_RNR_NAKS_RCVD] = "rnr_naks_rcvd", + [BNXT_RE_MISSING_RESP] = "missin_resp", + [BNXT_RE_UNRECOVERABLE_ERR] = "unrecoverable_err", + [BNXT_RE_BAD_RESP_ERR] = "bad_resp_err", + [BNXT_RE_LOCAL_QP_OP_ERR] = "local_qp_op_err", + [BNXT_RE_LOCAL_PROTECTION_ERR] = "local_protection_err", + [BNXT_RE_MEM_MGMT_OP_ERR] = "mem_mgmt_op_err", + [BNXT_RE_REMOTE_INVALID_REQ_ERR] = "remote_invalid_req_err", + [BNXT_RE_REMOTE_ACCESS_ERR] = "remote_access_err", + [BNXT_RE_REMOTE_OP_ERR] = "remote_op_err", + [BNXT_RE_DUP_REQ] = "dup_req", + [BNXT_RE_RES_EXCEED_MAX] = "res_exceed_max", + [BNXT_RE_RES_LENGTH_MISMATCH] = "res_length_mismatch", + [BNXT_RE_RES_EXCEEDS_WQE] = "res_exceeds_wqe", + [BNXT_RE_RES_OPCODE_ERR] = "res_opcode_err", + [BNXT_RE_RES_RX_INVALID_RKEY] = "res_rx_invalid_rkey", + [BNXT_RE_RES_RX_DOMAIN_ERR] = "res_rx_domain_err", + [BNXT_RE_RES_RX_NO_PERM] = "res_rx_no_perm", + [BNXT_RE_RES_RX_RANGE_ERR] = "res_rx_range_err", + [BNXT_RE_RES_TX_INVALID_RKEY] = "res_tx_invalid_rkey", + [BNXT_RE_RES_TX_DOMAIN_ERR] = "res_tx_domain_err", + [BNXT_RE_RES_TX_NO_PERM] = "res_tx_no_perm", + [BNXT_RE_RES_TX_RANGE_ERR] = "res_tx_range_err", + [BNXT_RE_RES_IRRQ_OFLOW] = "res_irrq_oflow", + [BNXT_RE_RES_UNSUP_OPCODE] = "res_unsup_opcode", + [BNXT_RE_RES_UNALIGNED_ATOMIC] = "res_unaligned_atomic", + [BNXT_RE_RES_REM_INV_ERR] = "res_rem_inv_err", + [BNXT_RE_RES_MEM_ERROR] = "res_mem_err", + [BNXT_RE_RES_SRQ_ERR] = "res_srq_err", + [BNXT_RE_RES_CMP_ERR] = "res_cmp_err", + [BNXT_RE_RES_INVALID_DUP_RKEY] = "res_invalid_dup_rkey", + [BNXT_RE_RES_WQE_FORMAT_ERR] = "res_wqe_format_err", + [BNXT_RE_RES_CQ_LOAD_ERR] = "res_cq_load_err", + [BNXT_RE_RES_SRQ_LOAD_ERR] = "res_srq_load_err", + [BNXT_RE_RES_TX_PCI_ERR] = "res_tx_pci_err", + [BNXT_RE_RES_RX_PCI_ERR] = "res_rx_pci_err" }; int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, @@ -76,6 +115,7 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, { struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); struct ctx_hw_stats *bnxt_re_stats = rdev->qplib_ctx.stats.dma; + int rc = 0; if (!port || !stats) return -EINVAL; @@ -97,6 +137,91 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, stats->value[BNXT_RE_TX_BYTES] = le64_to_cpu(bnxt_re_stats->tx_ucast_bytes); } + if (test_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags)) { + rc = bnxt_qplib_get_roce_stats(&rdev->rcfw, &rdev->stats); + if (rc) + clear_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, + &rdev->flags); + stats->value[BNXT_RE_TO_RETRANSMITS] = + rdev->stats.to_retransmits; + stats->value[BNXT_RE_SEQ_ERR_NAKS_RCVD] = + rdev->stats.seq_err_naks_rcvd; + stats->value[BNXT_RE_MAX_RETRY_EXCEEDED] = + rdev->stats.max_retry_exceeded; + stats->value[BNXT_RE_RNR_NAKS_RCVD] = + rdev->stats.rnr_naks_rcvd; + stats->value[BNXT_RE_MISSING_RESP] = + rdev->stats.missing_resp; + stats->value[BNXT_RE_UNRECOVERABLE_ERR] = + rdev->stats.unrecoverable_err; + stats->value[BNXT_RE_BAD_RESP_ERR] = + rdev->stats.bad_resp_err; + stats->value[BNXT_RE_LOCAL_QP_OP_ERR] = + rdev->stats.local_qp_op_err; + stats->value[BNXT_RE_LOCAL_PROTECTION_ERR] = + rdev->stats.local_protection_err; + stats->value[BNXT_RE_MEM_MGMT_OP_ERR] = + rdev->stats.mem_mgmt_op_err; + stats->value[BNXT_RE_REMOTE_INVALID_REQ_ERR] = + rdev->stats.remote_invalid_req_err; + stats->value[BNXT_RE_REMOTE_ACCESS_ERR] = + rdev->stats.remote_access_err; + stats->value[BNXT_RE_REMOTE_OP_ERR] = + rdev->stats.remote_op_err; + stats->value[BNXT_RE_DUP_REQ] = + rdev->stats.dup_req; + stats->value[BNXT_RE_RES_EXCEED_MAX] = + rdev->stats.res_exceed_max; + stats->value[BNXT_RE_RES_LENGTH_MISMATCH] = + rdev->stats.res_length_mismatch; + stats->value[BNXT_RE_RES_EXCEEDS_WQE] = + rdev->stats.res_exceeds_wqe; + stats->value[BNXT_RE_RES_OPCODE_ERR] = + rdev->stats.res_opcode_err; + stats->value[BNXT_RE_RES_RX_INVALID_RKEY] = + rdev->stats.res_rx_invalid_rkey; + stats->value[BNXT_RE_RES_RX_DOMAIN_ERR] = + rdev->stats.res_rx_domain_err; + stats->value[BNXT_RE_RES_RX_NO_PERM] = + rdev->stats.res_rx_no_perm; + stats->value[BNXT_RE_RES_RX_RANGE_ERR] = + rdev->stats.res_rx_range_err; + stats->value[BNXT_RE_RES_TX_INVALID_RKEY] = + rdev->stats.res_tx_invalid_rkey; + stats->value[BNXT_RE_RES_TX_DOMAIN_ERR] = + rdev->stats.res_tx_domain_err; + stats->value[BNXT_RE_RES_TX_NO_PERM] = + rdev->stats.res_tx_no_perm; + stats->value[BNXT_RE_RES_TX_RANGE_ERR] = + rdev->stats.res_tx_range_err; + stats->value[BNXT_RE_RES_IRRQ_OFLOW] = + rdev->stats.res_irrq_oflow; + stats->value[BNXT_RE_RES_UNSUP_OPCODE] = + rdev->stats.res_unsup_opcode; + stats->value[BNXT_RE_RES_UNALIGNED_ATOMIC] = + rdev->stats.res_unaligned_atomic; + stats->value[BNXT_RE_RES_REM_INV_ERR] = + rdev->stats.res_rem_inv_err; + stats->value[BNXT_RE_RES_MEM_ERROR] = + rdev->stats.res_mem_error; + stats->value[BNXT_RE_RES_SRQ_ERR] = + rdev->stats.res_srq_err; + stats->value[BNXT_RE_RES_CMP_ERR] = + rdev->stats.res_cmp_err; + stats->value[BNXT_RE_RES_INVALID_DUP_RKEY] = + rdev->stats.res_invalid_dup_rkey; + stats->value[BNXT_RE_RES_WQE_FORMAT_ERR] = + rdev->stats.res_wqe_format_err; + stats->value[BNXT_RE_RES_CQ_LOAD_ERR] = + rdev->stats.res_cq_load_err; + stats->value[BNXT_RE_RES_SRQ_LOAD_ERR] = + rdev->stats.res_srq_load_err; + stats->value[BNXT_RE_RES_TX_PCI_ERR] = + rdev->stats.res_tx_pci_err; + stats->value[BNXT_RE_RES_RX_PCI_ERR] = + rdev->stats.res_rx_pci_err; + } + return ARRAY_SIZE(bnxt_re_stat_name); } diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.h b/drivers/infiniband/hw/bnxt_re/hw_counters.h index be0dc0093b58..a01a922717d5 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.h +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.h @@ -51,6 +51,45 @@ enum bnxt_re_hw_stats { BNXT_RE_TX_PKTS, BNXT_RE_TX_BYTES, BNXT_RE_RECOVERABLE_ERRORS, + BNXT_RE_TO_RETRANSMITS, + BNXT_RE_SEQ_ERR_NAKS_RCVD, + BNXT_RE_MAX_RETRY_EXCEEDED, + BNXT_RE_RNR_NAKS_RCVD, + BNXT_RE_MISSING_RESP, + BNXT_RE_UNRECOVERABLE_ERR, + BNXT_RE_BAD_RESP_ERR, + BNXT_RE_LOCAL_QP_OP_ERR, + BNXT_RE_LOCAL_PROTECTION_ERR, + BNXT_RE_MEM_MGMT_OP_ERR, + BNXT_RE_REMOTE_INVALID_REQ_ERR, + BNXT_RE_REMOTE_ACCESS_ERR, + BNXT_RE_REMOTE_OP_ERR, + BNXT_RE_DUP_REQ, + BNXT_RE_RES_EXCEED_MAX, + BNXT_RE_RES_LENGTH_MISMATCH, + BNXT_RE_RES_EXCEEDS_WQE, + BNXT_RE_RES_OPCODE_ERR, + BNXT_RE_RES_RX_INVALID_RKEY, + BNXT_RE_RES_RX_DOMAIN_ERR, + BNXT_RE_RES_RX_NO_PERM, + BNXT_RE_RES_RX_RANGE_ERR, + BNXT_RE_RES_TX_INVALID_RKEY, + BNXT_RE_RES_TX_DOMAIN_ERR, + BNXT_RE_RES_TX_NO_PERM, + BNXT_RE_RES_TX_RANGE_ERR, + BNXT_RE_RES_IRRQ_OFLOW, + BNXT_RE_RES_UNSUP_OPCODE, + BNXT_RE_RES_UNALIGNED_ATOMIC, + BNXT_RE_RES_REM_INV_ERR, + BNXT_RE_RES_MEM_ERROR, + BNXT_RE_RES_SRQ_ERR, + BNXT_RE_RES_CMP_ERR, + BNXT_RE_RES_INVALID_DUP_RKEY, + BNXT_RE_RES_WQE_FORMAT_ERR, + BNXT_RE_RES_CQ_LOAD_ERR, + BNXT_RE_RES_SRQ_LOAD_ERR, + BNXT_RE_RES_TX_PCI_ERR, + BNXT_RE_RES_RX_PCI_ERR, BNXT_RE_NUM_COUNTERS }; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 2032db7db766..9b8fa77b8831 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -141,12 +141,13 @@ int bnxt_re_query_device(struct ib_device *ibdev, struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; memset(ib_attr, 0, sizeof(*ib_attr)); - - ib_attr->fw_ver = (u64)(unsigned long)(dev_attr->fw_ver); + memcpy(&ib_attr->fw_ver, dev_attr->fw_ver, + min(sizeof(dev_attr->fw_ver), + sizeof(ib_attr->fw_ver))); bnxt_qplib_get_guid(rdev->netdev->dev_addr, (u8 *)&ib_attr->sys_image_guid); ib_attr->max_mr_size = BNXT_RE_MAX_MR_SIZE; - ib_attr->page_size_cap = BNXT_RE_PAGE_SIZE_4K; + ib_attr->page_size_cap = BNXT_RE_PAGE_SIZE_4K | BNXT_RE_PAGE_SIZE_2M; ib_attr->vendor_id = rdev->en_dev->pdev->vendor; ib_attr->vendor_part_id = rdev->en_dev->pdev->device; @@ -247,8 +248,7 @@ int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num, IB_PORT_VENDOR_CLASS_SUP | IB_PORT_IP_BASED_GIDS; - /* Max MSG size set to 2G for now */ - port_attr->max_msg_sz = 0x80000000; + port_attr->max_msg_sz = (u32)BNXT_RE_MAX_MR_SIZE_LOW; port_attr->bad_pkey_cntr = 0; port_attr->qkey_viol_cntr = 0; port_attr->pkey_tbl_len = dev_attr->max_pkey; @@ -281,6 +281,15 @@ int bnxt_re_get_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } +void bnxt_re_query_fw_str(struct ib_device *ibdev, char *str) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d.%d", + rdev->dev_attr.fw_ver[0], rdev->dev_attr.fw_ver[1], + rdev->dev_attr.fw_ver[2], rdev->dev_attr.fw_ver[3]); +} + int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num, u16 index, u16 *pkey) { @@ -532,7 +541,7 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd) mr->qplib_mr.total_size = BNXT_RE_FENCE_BYTES; pbl_tbl = dma_addr; rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, &pbl_tbl, - BNXT_RE_FENCE_PBL_SIZE, false); + BNXT_RE_FENCE_PBL_SIZE, false, PAGE_SIZE); if (rc) { dev_err(rdev_to_dev(rdev), "Failed to register fence-MR\n"); goto fail; @@ -1018,6 +1027,7 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd, struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; struct bnxt_re_qp *qp; struct bnxt_re_cq *cq; + struct bnxt_re_srq *srq; int rc, entries; if ((qp_init_attr->cap.max_send_wr > dev_attr->max_qp_wqes) || @@ -1073,9 +1083,15 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd, } if (qp_init_attr->srq) { - dev_err(rdev_to_dev(rdev), "SRQ not supported"); - rc = -ENOTSUPP; - goto fail; + srq = container_of(qp_init_attr->srq, struct bnxt_re_srq, + ib_srq); + if (!srq) { + dev_err(rdev_to_dev(rdev), "SRQ not found"); + rc = -EINVAL; + goto fail; + } + qp->qplib_qp.srq = &srq->qplib_srq; + qp->qplib_qp.rq.max_wqe = 0; } else { /* Allocate 1 more than what's provided so posting max doesn't * mean empty @@ -1280,6 +1296,237 @@ static enum ib_mtu __to_ib_mtu(u32 mtu) } } +/* Shared Receive Queues */ +int bnxt_re_destroy_srq(struct ib_srq *ib_srq) +{ + struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq, + ib_srq); + struct bnxt_re_dev *rdev = srq->rdev; + struct bnxt_qplib_srq *qplib_srq = &srq->qplib_srq; + struct bnxt_qplib_nq *nq = NULL; + int rc; + + if (qplib_srq->cq) + nq = qplib_srq->cq->nq; + rc = bnxt_qplib_destroy_srq(&rdev->qplib_res, qplib_srq); + if (rc) { + dev_err(rdev_to_dev(rdev), "Destroy HW SRQ failed!"); + return rc; + } + + if (srq->umem && !IS_ERR(srq->umem)) + ib_umem_release(srq->umem); + kfree(srq); + atomic_dec(&rdev->srq_count); + if (nq) + nq->budget--; + return 0; +} + +static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev, + struct bnxt_re_pd *pd, + struct bnxt_re_srq *srq, + struct ib_udata *udata) +{ + struct bnxt_re_srq_req ureq; + struct bnxt_qplib_srq *qplib_srq = &srq->qplib_srq; + struct ib_umem *umem; + int bytes = 0; + struct ib_ucontext *context = pd->ib_pd.uobject->context; + struct bnxt_re_ucontext *cntx = container_of(context, + struct bnxt_re_ucontext, + ib_uctx); + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) + return -EFAULT; + + bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); + bytes = PAGE_ALIGN(bytes); + umem = ib_umem_get(context, ureq.srqva, bytes, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(umem)) + return PTR_ERR(umem); + + srq->umem = umem; + qplib_srq->nmap = umem->nmap; + qplib_srq->sglist = umem->sg_head.sgl; + qplib_srq->srq_handle = ureq.srq_handle; + qplib_srq->dpi = &cntx->dpi; + + return 0; +} + +struct ib_srq *bnxt_re_create_srq(struct ib_pd *ib_pd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata) +{ + struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_re_srq *srq; + struct bnxt_qplib_nq *nq = NULL; + int rc, entries; + + if (srq_init_attr->attr.max_wr >= dev_attr->max_srq_wqes) { + dev_err(rdev_to_dev(rdev), "Create CQ failed - max exceeded"); + rc = -EINVAL; + goto exit; + } + + if (srq_init_attr->srq_type != IB_SRQT_BASIC) { + rc = -ENOTSUPP; + goto exit; + } + + srq = kzalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) { + rc = -ENOMEM; + goto exit; + } + srq->rdev = rdev; + srq->qplib_srq.pd = &pd->qplib_pd; + srq->qplib_srq.dpi = &rdev->dpi_privileged; + /* Allocate 1 more than what's provided so posting max doesn't + * mean empty + */ + entries = roundup_pow_of_two(srq_init_attr->attr.max_wr + 1); + if (entries > dev_attr->max_srq_wqes + 1) + entries = dev_attr->max_srq_wqes + 1; + + srq->qplib_srq.max_wqe = entries; + srq->qplib_srq.max_sge = srq_init_attr->attr.max_sge; + srq->qplib_srq.threshold = srq_init_attr->attr.srq_limit; + srq->srq_limit = srq_init_attr->attr.srq_limit; + srq->qplib_srq.eventq_hw_ring_id = rdev->nq[0].ring_id; + nq = &rdev->nq[0]; + + if (udata) { + rc = bnxt_re_init_user_srq(rdev, pd, srq, udata); + if (rc) + goto fail; + } + + rc = bnxt_qplib_create_srq(&rdev->qplib_res, &srq->qplib_srq); + if (rc) { + dev_err(rdev_to_dev(rdev), "Create HW SRQ failed!"); + goto fail; + } + + if (udata) { + struct bnxt_re_srq_resp resp; + + resp.srqid = srq->qplib_srq.id; + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) { + dev_err(rdev_to_dev(rdev), "SRQ copy to udata failed!"); + bnxt_qplib_destroy_srq(&rdev->qplib_res, + &srq->qplib_srq); + goto exit; + } + } + if (nq) + nq->budget++; + atomic_inc(&rdev->srq_count); + + return &srq->ib_srq; + +fail: + if (udata && srq->umem && !IS_ERR(srq->umem)) { + ib_umem_release(srq->umem); + srq->umem = NULL; + } + + kfree(srq); +exit: + return ERR_PTR(rc); +} + +int bnxt_re_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask, + struct ib_udata *udata) +{ + struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq, + ib_srq); + struct bnxt_re_dev *rdev = srq->rdev; + int rc; + + switch (srq_attr_mask) { + case IB_SRQ_MAX_WR: + /* SRQ resize is not supported */ + break; + case IB_SRQ_LIMIT: + /* Change the SRQ threshold */ + if (srq_attr->srq_limit > srq->qplib_srq.max_wqe) + return -EINVAL; + + srq->qplib_srq.threshold = srq_attr->srq_limit; + rc = bnxt_qplib_modify_srq(&rdev->qplib_res, &srq->qplib_srq); + if (rc) { + dev_err(rdev_to_dev(rdev), "Modify HW SRQ failed!"); + return rc; + } + /* On success, update the shadow */ + srq->srq_limit = srq_attr->srq_limit; + /* No need to Build and send response back to udata */ + break; + default: + dev_err(rdev_to_dev(rdev), + "Unsupported srq_attr_mask 0x%x", srq_attr_mask); + return -EINVAL; + } + return 0; +} + +int bnxt_re_query_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr) +{ + struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq, + ib_srq); + struct bnxt_re_srq tsrq; + struct bnxt_re_dev *rdev = srq->rdev; + int rc; + + /* Get live SRQ attr */ + tsrq.qplib_srq.id = srq->qplib_srq.id; + rc = bnxt_qplib_query_srq(&rdev->qplib_res, &tsrq.qplib_srq); + if (rc) { + dev_err(rdev_to_dev(rdev), "Query HW SRQ failed!"); + return rc; + } + srq_attr->max_wr = srq->qplib_srq.max_wqe; + srq_attr->max_sge = srq->qplib_srq.max_sge; + srq_attr->srq_limit = tsrq.qplib_srq.threshold; + + return 0; +} + +int bnxt_re_post_srq_recv(struct ib_srq *ib_srq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq, + ib_srq); + struct bnxt_qplib_swqe wqe; + unsigned long flags; + int rc = 0, payload_sz = 0; + + spin_lock_irqsave(&srq->lock, flags); + while (wr) { + /* Transcribe each ib_recv_wr to qplib_swqe */ + wqe.num_sge = wr->num_sge; + payload_sz = bnxt_re_build_sgl(wr->sg_list, wqe.sg_list, + wr->num_sge); + wqe.wr_id = wr->wr_id; + wqe.type = BNXT_QPLIB_SWQE_TYPE_RECV; + + rc = bnxt_qplib_post_srq_recv(&srq->qplib_srq, &wqe); + if (rc) { + *bad_wr = wr; + break; + } + wr = wr->next; + } + spin_unlock_irqrestore(&srq->lock, flags); + + return rc; +} static int bnxt_re_modify_shadow_qp(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp1_qp, int qp_attr_mask) @@ -2295,10 +2542,14 @@ int bnxt_re_post_recv(struct ib_qp *ib_qp, struct ib_recv_wr *wr, /* Completion Queues */ int bnxt_re_destroy_cq(struct ib_cq *ib_cq) { - struct bnxt_re_cq *cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq); - struct bnxt_re_dev *rdev = cq->rdev; int rc; - struct bnxt_qplib_nq *nq = cq->qplib_cq.nq; + struct bnxt_re_cq *cq; + struct bnxt_qplib_nq *nq; + struct bnxt_re_dev *rdev; + + cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq); + rdev = cq->rdev; + nq = cq->qplib_cq.nq; rc = bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq); if (rc) { @@ -2308,12 +2559,11 @@ int bnxt_re_destroy_cq(struct ib_cq *ib_cq) if (!IS_ERR_OR_NULL(cq->umem)) ib_umem_release(cq->umem); - if (cq) { - kfree(cq->cql); - kfree(cq); - } atomic_dec(&rdev->cq_count); nq->budget--; + kfree(cq->cql); + kfree(cq); + return 0; } @@ -3078,7 +3328,8 @@ struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *ib_pd, int mr_access_flags) mr->qplib_mr.hwq.level = PBL_LVL_MAX; mr->qplib_mr.total_size = -1; /* Infinte length */ - rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, &pbl, 0, false); + rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, &pbl, 0, false, + PAGE_SIZE); if (rc) goto fail_mr; @@ -3104,10 +3355,8 @@ int bnxt_re_dereg_mr(struct ib_mr *ib_mr) int rc; rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); - if (rc) { + if (rc) dev_err(rdev_to_dev(rdev), "Dereg MR failed: %#x\n", rc); - return rc; - } if (mr->pages) { rc = bnxt_qplib_free_fast_reg_page_list(&rdev->qplib_res, @@ -3170,7 +3419,7 @@ struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type, rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); if (rc) - goto fail; + goto bail; mr->ib_mr.lkey = mr->qplib_mr.lkey; mr->ib_mr.rkey = mr->ib_mr.lkey; @@ -3192,9 +3441,10 @@ struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type, return &mr->ib_mr; fail_mr: - bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); -fail: kfree(mr->pages); +fail: + bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); +bail: kfree(mr); return ERR_PTR(rc); } @@ -3248,6 +3498,46 @@ int bnxt_re_dealloc_mw(struct ib_mw *ib_mw) return rc; } +static int bnxt_re_page_size_ok(int page_shift) +{ + switch (page_shift) { + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_4K: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_8K: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_64K: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_2M: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_256K: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1M: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_4M: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1G: + return 1; + default: + return 0; + } +} + +static int fill_umem_pbl_tbl(struct ib_umem *umem, u64 *pbl_tbl_orig, + int page_shift) +{ + u64 *pbl_tbl = pbl_tbl_orig; + u64 paddr; + u64 page_mask = (1ULL << page_shift) - 1; + int i, pages; + struct scatterlist *sg; + int entry; + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + pages = sg_dma_len(sg) >> PAGE_SHIFT; + for (i = 0; i < pages; i++) { + paddr = sg_dma_address(sg) + (i << PAGE_SHIFT); + if (pbl_tbl == pbl_tbl_orig) + *pbl_tbl++ = paddr & ~page_mask; + else if ((paddr & page_mask) == 0) + *pbl_tbl++ = paddr; + } + } + return pbl_tbl - pbl_tbl_orig; +} + /* uverbs */ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, @@ -3257,10 +3547,8 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, struct bnxt_re_dev *rdev = pd->rdev; struct bnxt_re_mr *mr; struct ib_umem *umem; - u64 *pbl_tbl, *pbl_tbl_orig; - int i, umem_pgs, pages, rc; - struct scatterlist *sg; - int entry; + u64 *pbl_tbl = NULL; + int umem_pgs, page_shift, rc; if (length > BNXT_RE_MAX_MR_SIZE) { dev_err(rdev_to_dev(rdev), "MR Size: %lld > Max supported:%ld\n", @@ -3277,64 +3565,68 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_MR; + rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to allocate MR"); + goto free_mr; + } + /* The fixed portion of the rkey is the same as the lkey */ + mr->ib_mr.rkey = mr->qplib_mr.rkey; + umem = ib_umem_get(ib_pd->uobject->context, start, length, mr_access_flags, 0); if (IS_ERR(umem)) { dev_err(rdev_to_dev(rdev), "Failed to get umem"); rc = -EFAULT; - goto free_mr; + goto free_mrw; } mr->ib_umem = umem; - rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); - if (rc) { - dev_err(rdev_to_dev(rdev), "Failed to allocate MR"); - goto release_umem; - } - /* The fixed portion of the rkey is the same as the lkey */ - mr->ib_mr.rkey = mr->qplib_mr.rkey; - mr->qplib_mr.va = virt_addr; umem_pgs = ib_umem_page_count(umem); if (!umem_pgs) { dev_err(rdev_to_dev(rdev), "umem is invalid!"); rc = -EINVAL; - goto free_mrw; + goto free_umem; } mr->qplib_mr.total_size = length; pbl_tbl = kcalloc(umem_pgs, sizeof(u64 *), GFP_KERNEL); if (!pbl_tbl) { - rc = -EINVAL; - goto free_mrw; + rc = -ENOMEM; + goto free_umem; } - pbl_tbl_orig = pbl_tbl; - if (umem->hugetlb) { - dev_err(rdev_to_dev(rdev), "umem hugetlb not supported!"); + page_shift = umem->page_shift; + + if (!bnxt_re_page_size_ok(page_shift)) { + dev_err(rdev_to_dev(rdev), "umem page size unsupported!"); rc = -EFAULT; goto fail; } - if (umem->page_shift != PAGE_SHIFT) { - dev_err(rdev_to_dev(rdev), "umem page shift unsupported!"); - rc = -EFAULT; + if (!umem->hugetlb && length > BNXT_RE_MAX_MR_SIZE_LOW) { + dev_err(rdev_to_dev(rdev), "Requested MR Sz:%llu Max sup:%llu", + length, (u64)BNXT_RE_MAX_MR_SIZE_LOW); + rc = -EINVAL; goto fail; } - /* Map umem buf ptrs to the PBL */ - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - pages = sg_dma_len(sg) >> umem->page_shift; - for (i = 0; i < pages; i++, pbl_tbl++) - *pbl_tbl = sg_dma_address(sg) + (i << umem->page_shift); + if (umem->hugetlb && length > BNXT_RE_PAGE_SIZE_2M) { + page_shift = BNXT_RE_PAGE_SHIFT_2M; + dev_warn(rdev_to_dev(rdev), "umem hugetlb set page_size %x", + 1 << page_shift); } - rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, pbl_tbl_orig, - umem_pgs, false); + + /* Map umem buf ptrs to the PBL */ + umem_pgs = fill_umem_pbl_tbl(umem, pbl_tbl, page_shift); + rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, pbl_tbl, + umem_pgs, false, 1 << page_shift); if (rc) { dev_err(rdev_to_dev(rdev), "Failed to register user MR"); goto fail; } - kfree(pbl_tbl_orig); + kfree(pbl_tbl); mr->ib_mr.lkey = mr->qplib_mr.lkey; mr->ib_mr.rkey = mr->qplib_mr.lkey; @@ -3342,11 +3634,11 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, return &mr->ib_mr; fail: - kfree(pbl_tbl_orig); + kfree(pbl_tbl); +free_umem: + ib_umem_release(umem); free_mrw: bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); -release_umem: - ib_umem_release(umem); free_mr: kfree(mr); return ERR_PTR(rc); diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index 1df11ed272ea..423ebe012f95 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -68,6 +68,15 @@ struct bnxt_re_ah { struct bnxt_qplib_ah qplib_ah; }; +struct bnxt_re_srq { + struct bnxt_re_dev *rdev; + u32 srq_limit; + struct ib_srq ib_srq; + struct bnxt_qplib_srq qplib_srq; + struct ib_umem *umem; + spinlock_t lock; /* protect srq */ +}; + struct bnxt_re_qp { struct list_head list; struct bnxt_re_dev *rdev; @@ -143,6 +152,7 @@ int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num, struct ib_port_attr *port_attr); int bnxt_re_get_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable); +void bnxt_re_query_fw_str(struct ib_device *ibdev, char *str); int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num, u16 index, u16 *pkey); int bnxt_re_del_gid(struct ib_device *ibdev, u8 port_num, @@ -164,6 +174,16 @@ struct ib_ah *bnxt_re_create_ah(struct ib_pd *pd, int bnxt_re_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); int bnxt_re_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); int bnxt_re_destroy_ah(struct ib_ah *ah); +struct ib_srq *bnxt_re_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata); +int bnxt_re_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask, + struct ib_udata *udata); +int bnxt_re_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); +int bnxt_re_destroy_srq(struct ib_srq *srq); +int bnxt_re_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); struct ib_qp *bnxt_re_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index aafc19aa5de1..508d00a5a106 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -80,6 +80,79 @@ static DEFINE_MUTEX(bnxt_re_dev_lock); static struct workqueue_struct *bnxt_re_wq; static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait); +/* SR-IOV helper functions */ + +static void bnxt_re_get_sriov_func_type(struct bnxt_re_dev *rdev) +{ + struct bnxt *bp; + + bp = netdev_priv(rdev->en_dev->net); + if (BNXT_VF(bp)) + rdev->is_virtfn = 1; +} + +/* Set the maximum number of each resource that the driver actually wants + * to allocate. This may be up to the maximum number the firmware has + * reserved for the function. The driver may choose to allocate fewer + * resources than the firmware maximum. + */ +static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev) +{ + u32 vf_qps = 0, vf_srqs = 0, vf_cqs = 0, vf_mrws = 0, vf_gids = 0; + u32 i; + u32 vf_pct; + u32 num_vfs; + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + + rdev->qplib_ctx.qpc_count = min_t(u32, BNXT_RE_MAX_QPC_COUNT, + dev_attr->max_qp); + + rdev->qplib_ctx.mrw_count = BNXT_RE_MAX_MRW_COUNT_256K; + /* Use max_mr from fw since max_mrw does not get set */ + rdev->qplib_ctx.mrw_count = min_t(u32, rdev->qplib_ctx.mrw_count, + dev_attr->max_mr); + rdev->qplib_ctx.srqc_count = min_t(u32, BNXT_RE_MAX_SRQC_COUNT, + dev_attr->max_srq); + rdev->qplib_ctx.cq_count = min_t(u32, BNXT_RE_MAX_CQ_COUNT, + dev_attr->max_cq); + + for (i = 0; i < MAX_TQM_ALLOC_REQ; i++) + rdev->qplib_ctx.tqm_count[i] = + rdev->dev_attr.tqm_alloc_reqs[i]; + + if (rdev->num_vfs) { + /* + * Reserve a set of resources for the PF. Divide the remaining + * resources among the VFs + */ + vf_pct = 100 - BNXT_RE_PCT_RSVD_FOR_PF; + num_vfs = 100 * rdev->num_vfs; + vf_qps = (rdev->qplib_ctx.qpc_count * vf_pct) / num_vfs; + vf_srqs = (rdev->qplib_ctx.srqc_count * vf_pct) / num_vfs; + vf_cqs = (rdev->qplib_ctx.cq_count * vf_pct) / num_vfs; + /* + * The driver allows many more MRs than other resources. If the + * firmware does also, then reserve a fixed amount for the PF + * and divide the rest among VFs. VFs may use many MRs for NFS + * mounts, ISER, NVME applications, etc. If the firmware + * severely restricts the number of MRs, then let PF have + * half and divide the rest among VFs, as for the other + * resource types. + */ + if (rdev->qplib_ctx.mrw_count < BNXT_RE_MAX_MRW_COUNT_64K) + vf_mrws = rdev->qplib_ctx.mrw_count * vf_pct / num_vfs; + else + vf_mrws = (rdev->qplib_ctx.mrw_count - + BNXT_RE_RESVD_MR_FOR_PF) / rdev->num_vfs; + vf_gids = BNXT_RE_MAX_GID_PER_VF; + } + rdev->qplib_ctx.vf_res.max_mrw_per_vf = vf_mrws; + rdev->qplib_ctx.vf_res.max_gid_per_vf = vf_gids; + rdev->qplib_ctx.vf_res.max_qp_per_vf = vf_qps; + rdev->qplib_ctx.vf_res.max_srq_per_vf = vf_srqs; + rdev->qplib_ctx.vf_res.max_cq_per_vf = vf_cqs; +} + /* for handling bnxt_en callbacks later */ static void bnxt_re_stop(void *p) { @@ -91,6 +164,15 @@ static void bnxt_re_start(void *p) static void bnxt_re_sriov_config(void *p, int num_vfs) { + struct bnxt_re_dev *rdev = p; + + if (!rdev) + return; + + rdev->num_vfs = num_vfs; + bnxt_re_set_resource_limits(rdev); + bnxt_qplib_set_func_resources(&rdev->qplib_res, &rdev->rcfw, + &rdev->qplib_ctx); } static void bnxt_re_shutdown(void *p) @@ -417,7 +499,7 @@ static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev) return ERR_PTR(-EINVAL); if (!(en_dev->flags & BNXT_EN_FLAG_ROCE_CAP)) { - dev_dbg(&pdev->dev, + dev_info(&pdev->dev, "%s: probe error: RoCE is not supported on this device", ROCE_DRV_MODULE_NAME); return ERR_PTR(-ENODEV); @@ -490,6 +572,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->query_port = bnxt_re_query_port; ibdev->get_port_immutable = bnxt_re_get_port_immutable; + ibdev->get_dev_fw_str = bnxt_re_query_fw_str; ibdev->query_pkey = bnxt_re_query_pkey; ibdev->query_gid = bnxt_re_query_gid; ibdev->get_netdev = bnxt_re_get_netdev; @@ -505,6 +588,12 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->query_ah = bnxt_re_query_ah; ibdev->destroy_ah = bnxt_re_destroy_ah; + ibdev->create_srq = bnxt_re_create_srq; + ibdev->modify_srq = bnxt_re_modify_srq; + ibdev->query_srq = bnxt_re_query_srq; + ibdev->destroy_srq = bnxt_re_destroy_srq; + ibdev->post_srq_recv = bnxt_re_post_srq_recv; + ibdev->create_qp = bnxt_re_create_qp; ibdev->modify_qp = bnxt_re_modify_qp; ibdev->query_qp = bnxt_re_query_qp; @@ -541,14 +630,6 @@ static ssize_t show_rev(struct device *device, struct device_attribute *attr, return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor); } -static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, - char *buf) -{ - struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); - - return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->dev_attr.fw_ver); -} - static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) { @@ -558,12 +639,10 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr, } static DEVICE_ATTR(hw_rev, 0444, show_rev, NULL); -static DEVICE_ATTR(fw_rev, 0444, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, 0444, show_hca, NULL); static struct device_attribute *bnxt_re_attributes[] = { &dev_attr_hw_rev, - &dev_attr_fw_rev, &dev_attr_hca_type }; @@ -616,10 +695,10 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct net_device *netdev, return rdev; } -static int bnxt_re_aeq_handler(struct bnxt_qplib_rcfw *rcfw, - struct creq_func_event *aeqe) +static int bnxt_re_handle_unaffi_async_event(struct creq_func_event + *unaffi_async) { - switch (aeqe->event) { + switch (unaffi_async->event) { case CREQ_FUNC_EVENT_EVENT_TX_WQE_ERROR: break; case CREQ_FUNC_EVENT_EVENT_TX_DATA_ERROR: @@ -648,6 +727,93 @@ static int bnxt_re_aeq_handler(struct bnxt_qplib_rcfw *rcfw, return 0; } +static int bnxt_re_handle_qp_async_event(struct creq_qp_event *qp_event, + struct bnxt_re_qp *qp) +{ + struct ib_event event; + + memset(&event, 0, sizeof(event)); + if (qp->qplib_qp.srq) { + event.device = &qp->rdev->ibdev; + event.element.qp = &qp->ib_qp; + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + } + + if (event.device && qp->ib_qp.event_handler) + qp->ib_qp.event_handler(&event, qp->ib_qp.qp_context); + + return 0; +} + +static int bnxt_re_handle_affi_async_event(struct creq_qp_event *affi_async, + void *obj) +{ + int rc = 0; + u8 event; + + if (!obj) + return rc; /* QP was already dead, still return success */ + + event = affi_async->event; + if (event == CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION) { + struct bnxt_qplib_qp *lib_qp = obj; + struct bnxt_re_qp *qp = container_of(lib_qp, struct bnxt_re_qp, + qplib_qp); + rc = bnxt_re_handle_qp_async_event(affi_async, qp); + } + return rc; +} + +static int bnxt_re_aeq_handler(struct bnxt_qplib_rcfw *rcfw, + void *aeqe, void *obj) +{ + struct creq_qp_event *affi_async; + struct creq_func_event *unaffi_async; + u8 type; + int rc; + + type = ((struct creq_base *)aeqe)->type; + if (type == CREQ_BASE_TYPE_FUNC_EVENT) { + unaffi_async = aeqe; + rc = bnxt_re_handle_unaffi_async_event(unaffi_async); + } else { + affi_async = aeqe; + rc = bnxt_re_handle_affi_async_event(affi_async, obj); + } + + return rc; +} + +static int bnxt_re_srqn_handler(struct bnxt_qplib_nq *nq, + struct bnxt_qplib_srq *handle, u8 event) +{ + struct bnxt_re_srq *srq = container_of(handle, struct bnxt_re_srq, + qplib_srq); + struct ib_event ib_event; + int rc = 0; + + if (!srq) { + dev_err(NULL, "%s: SRQ is NULL, SRQN not handled", + ROCE_DRV_MODULE_NAME); + rc = -EINVAL; + goto done; + } + ib_event.device = &srq->rdev->ibdev; + ib_event.element.srq = &srq->ib_srq; + if (event == NQ_SRQ_EVENT_EVENT_SRQ_THRESHOLD_EVENT) + ib_event.event = IB_EVENT_SRQ_LIMIT_REACHED; + else + ib_event.event = IB_EVENT_SRQ_ERR; + + if (srq->ib_srq.event_handler) { + /* Lock event_handler? */ + (*srq->ib_srq.event_handler)(&ib_event, + srq->ib_srq.srq_context); + } +done: + return rc; +} + static int bnxt_re_cqn_handler(struct bnxt_qplib_nq *nq, struct bnxt_qplib_cq *handle) { @@ -690,7 +856,8 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev) rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nq[i - 1], i - 1, rdev->msix_entries[i].vector, rdev->msix_entries[i].db_offset, - &bnxt_re_cqn_handler, NULL); + &bnxt_re_cqn_handler, + &bnxt_re_srqn_handler); if (rc) { dev_err(rdev_to_dev(rdev), @@ -734,7 +901,8 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) /* Configure and allocate resources for qplib */ rdev->qplib_res.rcfw = &rdev->rcfw; - rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr); + rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr, + rdev->is_virtfn); if (rc) goto fail; @@ -1035,19 +1203,6 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait) } } -static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev) -{ - u32 i; - - rdev->qplib_ctx.qpc_count = BNXT_RE_MAX_QPC_COUNT; - rdev->qplib_ctx.mrw_count = BNXT_RE_MAX_MRW_COUNT; - rdev->qplib_ctx.srqc_count = BNXT_RE_MAX_SRQC_COUNT; - rdev->qplib_ctx.cq_count = BNXT_RE_MAX_CQ_COUNT; - for (i = 0; i < MAX_TQM_ALLOC_REQ; i++) - rdev->qplib_ctx.tqm_count[i] = - rdev->dev_attr.tqm_alloc_reqs[i]; -} - /* worker thread for polling periodic events. Now used for QoS programming*/ static void bnxt_re_worker(struct work_struct *work) { @@ -1070,6 +1225,9 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) } set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + /* Check whether VF or PF */ + bnxt_re_get_sriov_func_type(rdev); + rc = bnxt_re_request_msix(rdev); if (rc) { pr_err("Failed to get MSI-X vectors: %#x\n", rc); @@ -1101,16 +1259,18 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) (rdev->en_dev->pdev, &rdev->rcfw, rdev->msix_entries[BNXT_RE_AEQ_IDX].vector, rdev->msix_entries[BNXT_RE_AEQ_IDX].db_offset, - 0, &bnxt_re_aeq_handler); + rdev->is_virtfn, &bnxt_re_aeq_handler); if (rc) { pr_err("Failed to enable RCFW channel: %#x\n", rc); goto free_ring; } - rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr); + rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr, + rdev->is_virtfn); if (rc) goto disable_rcfw; - bnxt_re_set_resource_limits(rdev); + if (!rdev->is_virtfn) + bnxt_re_set_resource_limits(rdev); rc = bnxt_qplib_alloc_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx, 0); if (rc) { @@ -1125,7 +1285,8 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) goto free_ctx; } - rc = bnxt_qplib_init_rcfw(&rdev->rcfw, &rdev->qplib_ctx, 0); + rc = bnxt_qplib_init_rcfw(&rdev->rcfw, &rdev->qplib_ctx, + rdev->is_virtfn); if (rc) { pr_err("Failed to initialize RCFW: %#x\n", rc); goto free_sctx; @@ -1144,13 +1305,15 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) goto fail; } - rc = bnxt_re_setup_qos(rdev); - if (rc) - pr_info("RoCE priority not yet configured\n"); + if (!rdev->is_virtfn) { + rc = bnxt_re_setup_qos(rdev); + if (rc) + pr_info("RoCE priority not yet configured\n"); - INIT_DELAYED_WORK(&rdev->worker, bnxt_re_worker); - set_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags); - schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); + INIT_DELAYED_WORK(&rdev->worker, bnxt_re_worker); + set_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags); + schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); + } /* Register ib dev */ rc = bnxt_re_register_ib(rdev); @@ -1176,6 +1339,7 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags); ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed, &rdev->active_width); + set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags); bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_PORT_ACTIVE); bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_GID_CHANGE); @@ -1400,7 +1564,7 @@ err_netdev: static void __exit bnxt_re_mod_exit(void) { - struct bnxt_re_dev *rdev; + struct bnxt_re_dev *rdev, *next; LIST_HEAD(to_be_deleted); mutex_lock(&bnxt_re_dev_lock); @@ -1408,8 +1572,11 @@ static void __exit bnxt_re_mod_exit(void) if (!list_empty(&bnxt_re_dev_list)) list_splice_init(&bnxt_re_dev_list, &to_be_deleted); mutex_unlock(&bnxt_re_dev_lock); - - list_for_each_entry(rdev, &to_be_deleted, list) { + /* + * Cleanup the devices in reverse order so that the VF device + * cleanup is done before PF cleanup + */ + list_for_each_entry_safe_reverse(rdev, next, &to_be_deleted, list) { dev_info(rdev_to_dev(rdev), "Unregistering Device"); bnxt_re_dev_stop(rdev); bnxt_re_ib_unreg(rdev, true); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 61764f7aa79b..8b5f11ac0e42 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -52,6 +52,7 @@ static void bnxt_qplib_arm_cq_enable(struct bnxt_qplib_cq *cq); static void __clean_cq(struct bnxt_qplib_cq *cq, u64 qp); +static void bnxt_qplib_arm_srq(struct bnxt_qplib_srq *srq, u32 arm_type); static void bnxt_qplib_cancel_phantom_processing(struct bnxt_qplib_qp *qp) { @@ -278,6 +279,7 @@ static void bnxt_qplib_service_nq(unsigned long data) struct nq_base *nqe, **nq_ptr; struct bnxt_qplib_cq *cq; int num_cqne_processed = 0; + int num_srqne_processed = 0; u32 sw_cons, raw_cons; u16 type; int budget = nq->budget; @@ -320,6 +322,26 @@ static void bnxt_qplib_service_nq(unsigned long data) spin_unlock_bh(&cq->compl_lock); break; } + case NQ_BASE_TYPE_SRQ_EVENT: + { + struct nq_srq_event *nqsrqe = + (struct nq_srq_event *)nqe; + + q_handle = le32_to_cpu(nqsrqe->srq_handle_low); + q_handle |= (u64)le32_to_cpu(nqsrqe->srq_handle_high) + << 32; + bnxt_qplib_arm_srq((struct bnxt_qplib_srq *)q_handle, + DBR_DBR_TYPE_SRQ_ARMENA); + if (!nq->srqn_handler(nq, + (struct bnxt_qplib_srq *)q_handle, + nqsrqe->event)) + num_srqne_processed++; + else + dev_warn(&nq->pdev->dev, + "QPLIB: SRQ event 0x%x not handled", + nqsrqe->event); + break; + } case NQ_BASE_TYPE_DBQ_EVENT: break; default: @@ -384,17 +406,19 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, int (*cqn_handler)(struct bnxt_qplib_nq *nq, struct bnxt_qplib_cq *), int (*srqn_handler)(struct bnxt_qplib_nq *nq, - void *, u8 event)) + struct bnxt_qplib_srq *, + u8 event)) { resource_size_t nq_base; int rc = -1; nq->pdev = pdev; nq->vector = msix_vector; + if (cqn_handler) + nq->cqn_handler = cqn_handler; - nq->cqn_handler = cqn_handler; - - nq->srqn_handler = srqn_handler; + if (srqn_handler) + nq->srqn_handler = srqn_handler; tasklet_init(&nq->worker, bnxt_qplib_service_nq, (unsigned long)nq); @@ -410,7 +434,6 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, if (rc) { dev_err(&nq->pdev->dev, "Failed to request IRQ for NQ: %#x", rc); - bnxt_qplib_disable_nq(nq); goto fail; } @@ -469,6 +492,238 @@ int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq) return 0; } +/* SRQ */ +static void bnxt_qplib_arm_srq(struct bnxt_qplib_srq *srq, u32 arm_type) +{ + struct bnxt_qplib_hwq *srq_hwq = &srq->hwq; + struct dbr_dbr db_msg = { 0 }; + void __iomem *db; + u32 sw_prod = 0; + + /* Ring DB */ + sw_prod = (arm_type == DBR_DBR_TYPE_SRQ_ARM) ? srq->threshold : + HWQ_CMP(srq_hwq->prod, srq_hwq); + db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) & + DBR_DBR_INDEX_MASK); + db_msg.type_xid = cpu_to_le32(((srq->id << DBR_DBR_XID_SFT) & + DBR_DBR_XID_MASK) | arm_type); + db = (arm_type == DBR_DBR_TYPE_SRQ_ARMENA) ? + srq->dbr_base : srq->dpi->dbr; + wmb(); /* barrier before db ring */ + __iowrite64_copy(db, &db_msg, sizeof(db_msg) / sizeof(u64)); +} + +int bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_destroy_srq req; + struct creq_destroy_srq_resp resp; + u16 cmd_flags = 0; + int rc; + + RCFW_CMD_PREP(req, DESTROY_SRQ, cmd_flags); + + /* Configure the request */ + req.srq_cid = cpu_to_le32(srq->id); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + if (rc) + return rc; + + bnxt_qplib_free_hwq(res->pdev, &srq->hwq); + kfree(srq->swq); + return 0; +} + +int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_create_srq req; + struct creq_create_srq_resp resp; + struct bnxt_qplib_pbl *pbl; + u16 cmd_flags = 0; + int rc, idx; + + srq->hwq.max_elements = srq->max_wqe; + rc = bnxt_qplib_alloc_init_hwq(res->pdev, &srq->hwq, srq->sglist, + srq->nmap, &srq->hwq.max_elements, + BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_QUEUE); + if (rc) + goto exit; + + srq->swq = kcalloc(srq->hwq.max_elements, sizeof(*srq->swq), + GFP_KERNEL); + if (!srq->swq) + goto fail; + + RCFW_CMD_PREP(req, CREATE_SRQ, cmd_flags); + + /* Configure the request */ + req.dpi = cpu_to_le32(srq->dpi->dpi); + req.srq_handle = cpu_to_le64(srq); + + req.srq_size = cpu_to_le16((u16)srq->hwq.max_elements); + pbl = &srq->hwq.pbl[PBL_LVL_0]; + req.pg_size_lvl = cpu_to_le16((((u16)srq->hwq.level & + CMDQ_CREATE_SRQ_LVL_MASK) << + CMDQ_CREATE_SRQ_LVL_SFT) | + (pbl->pg_size == ROCE_PG_SIZE_4K ? + CMDQ_CREATE_SRQ_PG_SIZE_PG_4K : + pbl->pg_size == ROCE_PG_SIZE_8K ? + CMDQ_CREATE_SRQ_PG_SIZE_PG_8K : + pbl->pg_size == ROCE_PG_SIZE_64K ? + CMDQ_CREATE_SRQ_PG_SIZE_PG_64K : + pbl->pg_size == ROCE_PG_SIZE_2M ? + CMDQ_CREATE_SRQ_PG_SIZE_PG_2M : + pbl->pg_size == ROCE_PG_SIZE_8M ? + CMDQ_CREATE_SRQ_PG_SIZE_PG_8M : + pbl->pg_size == ROCE_PG_SIZE_1G ? + CMDQ_CREATE_SRQ_PG_SIZE_PG_1G : + CMDQ_CREATE_SRQ_PG_SIZE_PG_4K)); + req.pbl = cpu_to_le64(pbl->pg_map_arr[0]); + req.pd_id = cpu_to_le32(srq->pd->id); + req.eventq_id = cpu_to_le16(srq->eventq_hw_ring_id); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + if (rc) + goto fail; + + spin_lock_init(&srq->lock); + srq->start_idx = 0; + srq->last_idx = srq->hwq.max_elements - 1; + for (idx = 0; idx < srq->hwq.max_elements; idx++) + srq->swq[idx].next_idx = idx + 1; + srq->swq[srq->last_idx].next_idx = -1; + + srq->id = le32_to_cpu(resp.xid); + srq->dbr_base = res->dpi_tbl.dbr_bar_reg_iomem; + if (srq->threshold) + bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARMENA); + srq->arm_req = false; + + return 0; +fail: + bnxt_qplib_free_hwq(res->pdev, &srq->hwq); + kfree(srq->swq); +exit: + return rc; +} + +int bnxt_qplib_modify_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq) +{ + struct bnxt_qplib_hwq *srq_hwq = &srq->hwq; + u32 sw_prod, sw_cons, count = 0; + + sw_prod = HWQ_CMP(srq_hwq->prod, srq_hwq); + sw_cons = HWQ_CMP(srq_hwq->cons, srq_hwq); + + count = sw_prod > sw_cons ? sw_prod - sw_cons : + srq_hwq->max_elements - sw_cons + sw_prod; + if (count > srq->threshold) { + srq->arm_req = false; + bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARM); + } else { + /* Deferred arming */ + srq->arm_req = true; + } + + return 0; +} + +int bnxt_qplib_query_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_query_srq req; + struct creq_query_srq_resp resp; + struct bnxt_qplib_rcfw_sbuf *sbuf; + struct creq_query_srq_resp_sb *sb; + u16 cmd_flags = 0; + int rc = 0; + + RCFW_CMD_PREP(req, QUERY_SRQ, cmd_flags); + req.srq_cid = cpu_to_le32(srq->id); + + /* Configure the request */ + sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb)); + if (!sbuf) + return -ENOMEM; + sb = sbuf->sb; + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, + (void *)sbuf, 0); + srq->threshold = le16_to_cpu(sb->srq_limit); + bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf); + + return rc; +} + +int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq, + struct bnxt_qplib_swqe *wqe) +{ + struct bnxt_qplib_hwq *srq_hwq = &srq->hwq; + struct rq_wqe *srqe, **srqe_ptr; + struct sq_sge *hw_sge; + u32 sw_prod, sw_cons, count = 0; + int i, rc = 0, next; + + spin_lock(&srq_hwq->lock); + if (srq->start_idx == srq->last_idx) { + dev_err(&srq_hwq->pdev->dev, "QPLIB: FP: SRQ (0x%x) is full!", + srq->id); + rc = -EINVAL; + spin_unlock(&srq_hwq->lock); + goto done; + } + next = srq->start_idx; + srq->start_idx = srq->swq[next].next_idx; + spin_unlock(&srq_hwq->lock); + + sw_prod = HWQ_CMP(srq_hwq->prod, srq_hwq); + srqe_ptr = (struct rq_wqe **)srq_hwq->pbl_ptr; + srqe = &srqe_ptr[RQE_PG(sw_prod)][RQE_IDX(sw_prod)]; + memset(srqe, 0, BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); + /* Calculate wqe_size16 and data_len */ + for (i = 0, hw_sge = (struct sq_sge *)srqe->data; + i < wqe->num_sge; i++, hw_sge++) { + hw_sge->va_or_pa = cpu_to_le64(wqe->sg_list[i].addr); + hw_sge->l_key = cpu_to_le32(wqe->sg_list[i].lkey); + hw_sge->size = cpu_to_le32(wqe->sg_list[i].size); + } + srqe->wqe_type = wqe->type; + srqe->flags = wqe->flags; + srqe->wqe_size = wqe->num_sge + + ((offsetof(typeof(*srqe), data) + 15) >> 4); + srqe->wr_id[0] = cpu_to_le32((u32)next); + srq->swq[next].wr_id = wqe->wr_id; + + srq_hwq->prod++; + + spin_lock(&srq_hwq->lock); + sw_prod = HWQ_CMP(srq_hwq->prod, srq_hwq); + /* retaining srq_hwq->cons for this logic + * actually the lock is only required to + * read srq_hwq->cons. + */ + sw_cons = HWQ_CMP(srq_hwq->cons, srq_hwq); + count = sw_prod > sw_cons ? sw_prod - sw_cons : + srq_hwq->max_elements - sw_cons + sw_prod; + spin_unlock(&srq_hwq->lock); + /* Ring DB */ + bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ); + if (srq->arm_req == true && count > srq->threshold) { + srq->arm_req = false; + bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARM); + } +done: + return rc; +} + /* QP */ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) { @@ -737,6 +992,12 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) pbl->pg_size == ROCE_PG_SIZE_1G ? CMDQ_CREATE_QP_RQ_PG_SIZE_PG_1G : CMDQ_CREATE_QP_RQ_PG_SIZE_PG_4K); + } else { + /* SRQ */ + if (qp->srq) { + qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_SRQ_USED; + req.srq_cid = cpu_to_le32(qp->srq->id); + } } if (qp->rcq) @@ -2068,6 +2329,16 @@ done: return rc; } +static void bnxt_qplib_release_srqe(struct bnxt_qplib_srq *srq, u32 tag) +{ + spin_lock(&srq->hwq.lock); + srq->swq[srq->last_idx].next_idx = (int)tag; + srq->last_idx = (int)tag; + srq->swq[srq->last_idx].next_idx = -1; + srq->hwq.cons++; /* Support for SRQE counter */ + spin_unlock(&srq->hwq.lock); +} + static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, struct cq_res_rc *hwcqe, struct bnxt_qplib_cqe **pcqe, @@ -2075,6 +2346,7 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, { struct bnxt_qplib_qp *qp; struct bnxt_qplib_q *rq; + struct bnxt_qplib_srq *srq; struct bnxt_qplib_cqe *cqe; u32 wr_id_idx; int rc = 0; @@ -2102,27 +2374,46 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, wr_id_idx = le32_to_cpu(hwcqe->srq_or_rq_wr_id) & CQ_RES_RC_SRQ_OR_RQ_WR_ID_MASK; - rq = &qp->rq; - if (wr_id_idx > rq->hwq.max_elements) { - dev_err(&cq->hwq.pdev->dev, "QPLIB: FP: CQ Process RC "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x", - wr_id_idx, rq->hwq.max_elements); - return -EINVAL; - } - - cqe->wr_id = rq->swq[wr_id_idx].wr_id; - cqe++; - (*budget)--; - rq->hwq.cons++; - *pcqe = cqe; + if (cqe->flags & CQ_RES_RC_FLAGS_SRQ_SRQ) { + srq = qp->srq; + if (!srq) + return -EINVAL; + if (wr_id_idx > srq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process RC "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x", + wr_id_idx, srq->hwq.max_elements); + return -EINVAL; + } + cqe->wr_id = srq->swq[wr_id_idx].wr_id; + bnxt_qplib_release_srqe(srq, wr_id_idx); + cqe++; + (*budget)--; + *pcqe = cqe; + } else { + rq = &qp->rq; + if (wr_id_idx > rq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process RC "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x", + wr_id_idx, rq->hwq.max_elements); + return -EINVAL; + } + cqe->wr_id = rq->swq[wr_id_idx].wr_id; + cqe++; + (*budget)--; + rq->hwq.cons++; + *pcqe = cqe; - if (hwcqe->status != CQ_RES_RC_STATUS_OK) { - qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; - /* Add qp to flush list of the CQ */ - bnxt_qplib_lock_buddy_cq(qp, cq); - __bnxt_qplib_add_flush_qp(qp); - bnxt_qplib_unlock_buddy_cq(qp, cq); + if (hwcqe->status != CQ_RES_RC_STATUS_OK) { + qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; + /* Add qp to flush list of the CQ */ + bnxt_qplib_lock_buddy_cq(qp, cq); + __bnxt_qplib_add_flush_qp(qp); + bnxt_qplib_unlock_buddy_cq(qp, cq); + } } done: @@ -2136,6 +2427,7 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, { struct bnxt_qplib_qp *qp; struct bnxt_qplib_q *rq; + struct bnxt_qplib_srq *srq; struct bnxt_qplib_cqe *cqe; u32 wr_id_idx; int rc = 0; @@ -2166,27 +2458,48 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, hwcqe->src_qp_high_srq_or_rq_wr_id) & CQ_RES_UD_SRC_QP_HIGH_MASK) >> 8); - rq = &qp->rq; - if (wr_id_idx > rq->hwq.max_elements) { - dev_err(&cq->hwq.pdev->dev, "QPLIB: FP: CQ Process UD "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: wr_id idx %#x exceeded RQ max %#x", - wr_id_idx, rq->hwq.max_elements); - return -EINVAL; - } + if (cqe->flags & CQ_RES_RC_FLAGS_SRQ_SRQ) { + srq = qp->srq; + if (!srq) + return -EINVAL; - cqe->wr_id = rq->swq[wr_id_idx].wr_id; - cqe++; - (*budget)--; - rq->hwq.cons++; - *pcqe = cqe; + if (wr_id_idx > srq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process UD "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x", + wr_id_idx, srq->hwq.max_elements); + return -EINVAL; + } + cqe->wr_id = srq->swq[wr_id_idx].wr_id; + bnxt_qplib_release_srqe(srq, wr_id_idx); + cqe++; + (*budget)--; + *pcqe = cqe; + } else { + rq = &qp->rq; + if (wr_id_idx > rq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process UD "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x", + wr_id_idx, rq->hwq.max_elements); + return -EINVAL; + } - if (hwcqe->status != CQ_RES_RC_STATUS_OK) { - qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; - /* Add qp to flush list of the CQ */ - bnxt_qplib_lock_buddy_cq(qp, cq); - __bnxt_qplib_add_flush_qp(qp); - bnxt_qplib_unlock_buddy_cq(qp, cq); + cqe->wr_id = rq->swq[wr_id_idx].wr_id; + cqe++; + (*budget)--; + rq->hwq.cons++; + *pcqe = cqe; + + if (hwcqe->status != CQ_RES_RC_STATUS_OK) { + qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; + /* Add qp to flush list of the CQ */ + bnxt_qplib_lock_buddy_cq(qp, cq); + __bnxt_qplib_add_flush_qp(qp); + bnxt_qplib_unlock_buddy_cq(qp, cq); + } } done: return rc; @@ -2218,6 +2531,7 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, { struct bnxt_qplib_qp *qp; struct bnxt_qplib_q *rq; + struct bnxt_qplib_srq *srq; struct bnxt_qplib_cqe *cqe; u32 wr_id_idx; int rc = 0; @@ -2256,26 +2570,49 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, cqe->raweth_qp1_flags2 = le32_to_cpu(hwcqe->raweth_qp1_flags2); cqe->raweth_qp1_metadata = le32_to_cpu(hwcqe->raweth_qp1_metadata); - rq = &qp->rq; - if (wr_id_idx > rq->hwq.max_elements) { - dev_err(&cq->hwq.pdev->dev, "QPLIB: FP: CQ Process Raw/QP1 RQ wr_id "); - dev_err(&cq->hwq.pdev->dev, "QPLIB: ix 0x%x exceeded RQ max 0x%x", - wr_id_idx, rq->hwq.max_elements); - return -EINVAL; - } - - cqe->wr_id = rq->swq[wr_id_idx].wr_id; - cqe++; - (*budget)--; - rq->hwq.cons++; - *pcqe = cqe; + if (cqe->flags & CQ_RES_RAWETH_QP1_FLAGS_SRQ_SRQ) { + srq = qp->srq; + if (!srq) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: SRQ used but not defined??"); + return -EINVAL; + } + if (wr_id_idx > srq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process Raw/QP1 "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x", + wr_id_idx, srq->hwq.max_elements); + return -EINVAL; + } + cqe->wr_id = srq->swq[wr_id_idx].wr_id; + bnxt_qplib_release_srqe(srq, wr_id_idx); + cqe++; + (*budget)--; + *pcqe = cqe; + } else { + rq = &qp->rq; + if (wr_id_idx > rq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process Raw/QP1 RQ wr_id "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: ix 0x%x exceeded RQ max 0x%x", + wr_id_idx, rq->hwq.max_elements); + return -EINVAL; + } + cqe->wr_id = rq->swq[wr_id_idx].wr_id; + cqe++; + (*budget)--; + rq->hwq.cons++; + *pcqe = cqe; - if (hwcqe->status != CQ_RES_RC_STATUS_OK) { - qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; - /* Add qp to flush list of the CQ */ - bnxt_qplib_lock_buddy_cq(qp, cq); - __bnxt_qplib_add_flush_qp(qp); - bnxt_qplib_unlock_buddy_cq(qp, cq); + if (hwcqe->status != CQ_RES_RC_STATUS_OK) { + qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; + /* Add qp to flush list of the CQ */ + bnxt_qplib_lock_buddy_cq(qp, cq); + __bnxt_qplib_add_flush_qp(qp); + bnxt_qplib_unlock_buddy_cq(qp, cq); + } } done: diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index c582d4ec8173..211b27a8f9e2 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -39,6 +39,27 @@ #ifndef __BNXT_QPLIB_FP_H__ #define __BNXT_QPLIB_FP_H__ +struct bnxt_qplib_srq { + struct bnxt_qplib_pd *pd; + struct bnxt_qplib_dpi *dpi; + void __iomem *dbr_base; + u64 srq_handle; + u32 id; + u32 max_wqe; + u32 max_sge; + u32 threshold; + bool arm_req; + struct bnxt_qplib_cq *cq; + struct bnxt_qplib_hwq hwq; + struct bnxt_qplib_swq *swq; + struct scatterlist *sglist; + int start_idx; + int last_idx; + u32 nmap; + u16 eventq_hw_ring_id; + spinlock_t lock; /* protect SRQE link list */ +}; + struct bnxt_qplib_sge { u64 addr; u32 lkey; @@ -79,6 +100,7 @@ static inline u32 get_psne_idx(u32 val) struct bnxt_qplib_swq { u64 wr_id; + int next_idx; u8 type; u8 flags; u32 start_psn; @@ -404,29 +426,27 @@ struct bnxt_qplib_cq { writel(NQ_DB_CP_FLAGS | ((raw_cons) & ((cp_bit) - 1)), db) struct bnxt_qplib_nq { - struct pci_dev *pdev; - - int vector; - cpumask_t mask; - int budget; - bool requested; - struct tasklet_struct worker; - struct bnxt_qplib_hwq hwq; - - u16 bar_reg; - u16 bar_reg_off; - u16 ring_id; - void __iomem *bar_reg_iomem; - - int (*cqn_handler) - (struct bnxt_qplib_nq *nq, - struct bnxt_qplib_cq *cq); - int (*srqn_handler) - (struct bnxt_qplib_nq *nq, - void *srq, - u8 event); - struct workqueue_struct *cqn_wq; - char name[32]; + struct pci_dev *pdev; + + int vector; + cpumask_t mask; + int budget; + bool requested; + struct tasklet_struct worker; + struct bnxt_qplib_hwq hwq; + + u16 bar_reg; + u16 bar_reg_off; + u16 ring_id; + void __iomem *bar_reg_iomem; + + int (*cqn_handler)(struct bnxt_qplib_nq *nq, + struct bnxt_qplib_cq *cq); + int (*srqn_handler)(struct bnxt_qplib_nq *nq, + struct bnxt_qplib_srq *srq, + u8 event); + struct workqueue_struct *cqn_wq; + char name[32]; }; struct bnxt_qplib_nq_work { @@ -441,8 +461,18 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, int (*cqn_handler)(struct bnxt_qplib_nq *nq, struct bnxt_qplib_cq *cq), int (*srqn_handler)(struct bnxt_qplib_nq *nq, - void *srq, + struct bnxt_qplib_srq *srq, u8 event)); +int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq); +int bnxt_qplib_modify_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq); +int bnxt_qplib_query_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq); +int bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq); +int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq, + struct bnxt_qplib_swqe *wqe); int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp); int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp); int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index bb5574adf195..8329ec6a7946 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -93,7 +93,8 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req, opcode = req->opcode; if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags) && (opcode != CMDQ_BASE_OPCODE_QUERY_FUNC && - opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW)) { + opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW && + opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) { dev_err(&rcfw->pdev->dev, "QPLIB: RCFW not initialized, reject opcode 0x%x", opcode); @@ -615,7 +616,7 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev, int msix_vector, int cp_bar_reg_off, int virt_fn, int (*aeq_handler)(struct bnxt_qplib_rcfw *, - struct creq_func_event *)) + void *, void *)) { resource_size_t res_base; struct cmdq_init init; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 2946a7cfae82..6bee6e3636ea 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -167,7 +167,7 @@ struct bnxt_qplib_rcfw { #define FIRMWARE_TIMED_OUT 3 wait_queue_head_t waitq; int (*aeq_handler)(struct bnxt_qplib_rcfw *, - struct creq_func_event *); + void *, void *); u32 seq_num; /* Bar region info */ @@ -199,9 +199,8 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev, struct bnxt_qplib_rcfw *rcfw, int msix_vector, int cp_bar_reg_off, int virt_fn, - int (*aeq_handler) - (struct bnxt_qplib_rcfw *, - struct creq_func_event *)); + int (*aeq_handler)(struct bnxt_qplib_rcfw *, + void *aeqe, void *obj)); struct bnxt_qplib_rcfw_sbuf *bnxt_qplib_rcfw_alloc_sbuf( struct bnxt_qplib_rcfw *rcfw, diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index 4e101704e801..ad37d54affcc 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -104,13 +104,12 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, if (!sghead) { for (i = 0; i < pages; i++) { - pbl->pg_arr[i] = dma_alloc_coherent(&pdev->dev, - pbl->pg_size, - &pbl->pg_map_arr[i], - GFP_KERNEL); + pbl->pg_arr[i] = dma_zalloc_coherent(&pdev->dev, + pbl->pg_size, + &pbl->pg_map_arr[i], + GFP_KERNEL); if (!pbl->pg_arr[i]) goto fail; - memset(pbl->pg_arr[i], 0, pbl->pg_size); pbl->pg_count++; } } else { diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 9543ce51a28a..c015c1861351 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -64,8 +64,28 @@ static bool bnxt_qplib_is_atomic_cap(struct bnxt_qplib_rcfw *rcfw) return !!(pcie_ctl2 & PCI_EXP_DEVCTL2_ATOMIC_REQ); } +static void bnxt_qplib_query_version(struct bnxt_qplib_rcfw *rcfw, + char *fw_ver) +{ + struct cmdq_query_version req; + struct creq_query_version_resp resp; + u16 cmd_flags = 0; + int rc = 0; + + RCFW_CMD_PREP(req, QUERY_VERSION, cmd_flags); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + if (rc) + return; + fw_ver[0] = resp.fw_maj; + fw_ver[1] = resp.fw_minor; + fw_ver[2] = resp.fw_bld; + fw_ver[3] = resp.fw_rsvd; +} + int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_dev_attr *attr) + struct bnxt_qplib_dev_attr *attr, bool vf) { struct cmdq_query_func req; struct creq_query_func_resp resp; @@ -95,7 +115,8 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, /* Extract the context from the side buffer */ attr->max_qp = le32_to_cpu(sb->max_qp); /* max_qp value reported by FW for PF doesn't include the QP1 for PF */ - attr->max_qp += 1; + if (!vf) + attr->max_qp += 1; attr->max_qp_rd_atom = sb->max_qp_rd_atom > BNXT_QPLIB_MAX_OUT_RD_ATOM ? BNXT_QPLIB_MAX_OUT_RD_ATOM : sb->max_qp_rd_atom; @@ -133,7 +154,7 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, attr->l2_db_size = (sb->l2_db_space_size + 1) * PAGE_SIZE; attr->max_sgid = le32_to_cpu(sb->max_gid); - strlcpy(attr->fw_ver, "20.6.28.0", sizeof(attr->fw_ver)); + bnxt_qplib_query_version(rcfw, attr->fw_ver); for (i = 0; i < MAX_TQM_ALLOC_REQ / 4; i++) { temp = le32_to_cpu(sb->tqm_alloc_reqs[i]); @@ -150,6 +171,38 @@ bail: return rc; } +int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res, + struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_ctx *ctx) +{ + struct cmdq_set_func_resources req; + struct creq_set_func_resources_resp resp; + u16 cmd_flags = 0; + int rc = 0; + + RCFW_CMD_PREP(req, SET_FUNC_RESOURCES, cmd_flags); + + req.number_of_qp = cpu_to_le32(ctx->qpc_count); + req.number_of_mrw = cpu_to_le32(ctx->mrw_count); + req.number_of_srq = cpu_to_le32(ctx->srqc_count); + req.number_of_cq = cpu_to_le32(ctx->cq_count); + + req.max_qp_per_vf = cpu_to_le32(ctx->vf_res.max_qp_per_vf); + req.max_mrw_per_vf = cpu_to_le32(ctx->vf_res.max_mrw_per_vf); + req.max_srq_per_vf = cpu_to_le32(ctx->vf_res.max_srq_per_vf); + req.max_cq_per_vf = cpu_to_le32(ctx->vf_res.max_cq_per_vf); + req.max_gid_per_vf = cpu_to_le32(ctx->vf_res.max_gid_per_vf); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, + NULL, 0); + if (rc) { + dev_err(&res->pdev->dev, + "QPLIB: Failed to set function resources"); + } + return rc; +} + /* SGID */ int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res, struct bnxt_qplib_sgid_tbl *sgid_tbl, int index, @@ -604,7 +657,7 @@ int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw, } int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, - u64 *pbl_tbl, int num_pbls, bool block) + u64 *pbl_tbl, int num_pbls, bool block, u32 buf_pg_size) { struct bnxt_qplib_rcfw *rcfw = res->rcfw; struct cmdq_register_mr req; @@ -615,6 +668,9 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, u32 pg_size; if (num_pbls) { + /* Allocate memory for the non-leaf pages to store buf ptrs. + * Non-leaf pages always uses system PAGE_SIZE + */ pg_ptrs = roundup_pow_of_two(num_pbls); pages = pg_ptrs >> MAX_PBL_LVL_1_PGS_SHIFT; if (!pages) @@ -632,6 +688,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, bnxt_qplib_free_hwq(res->pdev, &mr->hwq); mr->hwq.max_elements = pages; + /* Use system PAGE_SIZE */ rc = bnxt_qplib_alloc_init_hwq(res->pdev, &mr->hwq, NULL, 0, &mr->hwq.max_elements, PAGE_SIZE, 0, PAGE_SIZE, @@ -652,18 +709,22 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, /* Configure the request */ if (mr->hwq.level == PBL_LVL_MAX) { + /* No PBL provided, just use system PAGE_SIZE */ level = 0; req.pbl = 0; pg_size = PAGE_SIZE; } else { level = mr->hwq.level + 1; req.pbl = cpu_to_le64(mr->hwq.pbl[PBL_LVL_0].pg_map_arr[0]); - pg_size = mr->hwq.pbl[PBL_LVL_0].pg_size; } + pg_size = buf_pg_size ? buf_pg_size : PAGE_SIZE; req.log2_pg_size_lvl = (level << CMDQ_REGISTER_MR_LVL_SFT) | ((ilog2(pg_size) << CMDQ_REGISTER_MR_LOG2_PG_SIZE_SFT) & CMDQ_REGISTER_MR_LOG2_PG_SIZE_MASK); + req.log2_pbl_pg_size = cpu_to_le16(((ilog2(PAGE_SIZE) << + CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_SFT) & + CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_MASK)); req.access = (mr->flags & 0xFFFF); req.va = cpu_to_le64(mr->va); req.key = cpu_to_le32(mr->lkey); @@ -729,3 +790,73 @@ int bnxt_qplib_map_tc2cos(struct bnxt_qplib_res *res, u16 *cids) 0); return 0; } + +int bnxt_qplib_get_roce_stats(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_roce_stats *stats) +{ + struct cmdq_query_roce_stats req; + struct creq_query_roce_stats_resp resp; + struct bnxt_qplib_rcfw_sbuf *sbuf; + struct creq_query_roce_stats_resp_sb *sb; + u16 cmd_flags = 0; + int rc = 0; + + RCFW_CMD_PREP(req, QUERY_ROCE_STATS, cmd_flags); + + sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb)); + if (!sbuf) { + dev_err(&rcfw->pdev->dev, + "QPLIB: SP: QUERY_ROCE_STATS alloc side buffer failed"); + return -ENOMEM; + } + + sb = sbuf->sb; + req.resp_size = sizeof(*sb) / BNXT_QPLIB_CMDQE_UNITS; + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, + (void *)sbuf, 0); + if (rc) + goto bail; + /* Extract the context from the side buffer */ + stats->to_retransmits = le64_to_cpu(sb->to_retransmits); + stats->seq_err_naks_rcvd = le64_to_cpu(sb->seq_err_naks_rcvd); + stats->max_retry_exceeded = le64_to_cpu(sb->max_retry_exceeded); + stats->rnr_naks_rcvd = le64_to_cpu(sb->rnr_naks_rcvd); + stats->missing_resp = le64_to_cpu(sb->missing_resp); + stats->unrecoverable_err = le64_to_cpu(sb->unrecoverable_err); + stats->bad_resp_err = le64_to_cpu(sb->bad_resp_err); + stats->local_qp_op_err = le64_to_cpu(sb->local_qp_op_err); + stats->local_protection_err = le64_to_cpu(sb->local_protection_err); + stats->mem_mgmt_op_err = le64_to_cpu(sb->mem_mgmt_op_err); + stats->remote_invalid_req_err = le64_to_cpu(sb->remote_invalid_req_err); + stats->remote_access_err = le64_to_cpu(sb->remote_access_err); + stats->remote_op_err = le64_to_cpu(sb->remote_op_err); + stats->dup_req = le64_to_cpu(sb->dup_req); + stats->res_exceed_max = le64_to_cpu(sb->res_exceed_max); + stats->res_length_mismatch = le64_to_cpu(sb->res_length_mismatch); + stats->res_exceeds_wqe = le64_to_cpu(sb->res_exceeds_wqe); + stats->res_opcode_err = le64_to_cpu(sb->res_opcode_err); + stats->res_rx_invalid_rkey = le64_to_cpu(sb->res_rx_invalid_rkey); + stats->res_rx_domain_err = le64_to_cpu(sb->res_rx_domain_err); + stats->res_rx_no_perm = le64_to_cpu(sb->res_rx_no_perm); + stats->res_rx_range_err = le64_to_cpu(sb->res_rx_range_err); + stats->res_tx_invalid_rkey = le64_to_cpu(sb->res_tx_invalid_rkey); + stats->res_tx_domain_err = le64_to_cpu(sb->res_tx_domain_err); + stats->res_tx_no_perm = le64_to_cpu(sb->res_tx_no_perm); + stats->res_tx_range_err = le64_to_cpu(sb->res_tx_range_err); + stats->res_irrq_oflow = le64_to_cpu(sb->res_irrq_oflow); + stats->res_unsup_opcode = le64_to_cpu(sb->res_unsup_opcode); + stats->res_unaligned_atomic = le64_to_cpu(sb->res_unaligned_atomic); + stats->res_rem_inv_err = le64_to_cpu(sb->res_rem_inv_err); + stats->res_mem_error = le64_to_cpu(sb->res_mem_error); + stats->res_srq_err = le64_to_cpu(sb->res_srq_err); + stats->res_cmp_err = le64_to_cpu(sb->res_cmp_err); + stats->res_invalid_dup_rkey = le64_to_cpu(sb->res_invalid_dup_rkey); + stats->res_wqe_format_err = le64_to_cpu(sb->res_wqe_format_err); + stats->res_cq_load_err = le64_to_cpu(sb->res_cq_load_err); + stats->res_srq_load_err = le64_to_cpu(sb->res_srq_load_err); + stats->res_tx_pci_err = le64_to_cpu(sb->res_tx_pci_err); + stats->res_rx_pci_err = le64_to_cpu(sb->res_rx_pci_err); +bail: + bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf); + return rc; +} diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 11322582f5e4..9d3e8b994945 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -45,7 +45,8 @@ #define PCI_EXP_DEVCTL2_ATOMIC_REQ 0x0040 struct bnxt_qplib_dev_attr { - char fw_ver[32]; +#define FW_VER_ARR_LEN 4 + u8 fw_ver[FW_VER_ARR_LEN]; u16 max_sgid; u16 max_mrw; u32 max_qp; @@ -127,6 +128,85 @@ struct bnxt_qplib_frpl { #define BNXT_QPLIB_ACCESS_ZERO_BASED BIT(5) #define BNXT_QPLIB_ACCESS_ON_DEMAND BIT(6) +struct bnxt_qplib_roce_stats { + u64 to_retransmits; + u64 seq_err_naks_rcvd; + /* seq_err_naks_rcvd is 64 b */ + u64 max_retry_exceeded; + /* max_retry_exceeded is 64 b */ + u64 rnr_naks_rcvd; + /* rnr_naks_rcvd is 64 b */ + u64 missing_resp; + u64 unrecoverable_err; + /* unrecoverable_err is 64 b */ + u64 bad_resp_err; + /* bad_resp_err is 64 b */ + u64 local_qp_op_err; + /* local_qp_op_err is 64 b */ + u64 local_protection_err; + /* local_protection_err is 64 b */ + u64 mem_mgmt_op_err; + /* mem_mgmt_op_err is 64 b */ + u64 remote_invalid_req_err; + /* remote_invalid_req_err is 64 b */ + u64 remote_access_err; + /* remote_access_err is 64 b */ + u64 remote_op_err; + /* remote_op_err is 64 b */ + u64 dup_req; + /* dup_req is 64 b */ + u64 res_exceed_max; + /* res_exceed_max is 64 b */ + u64 res_length_mismatch; + /* res_length_mismatch is 64 b */ + u64 res_exceeds_wqe; + /* res_exceeds_wqe is 64 b */ + u64 res_opcode_err; + /* res_opcode_err is 64 b */ + u64 res_rx_invalid_rkey; + /* res_rx_invalid_rkey is 64 b */ + u64 res_rx_domain_err; + /* res_rx_domain_err is 64 b */ + u64 res_rx_no_perm; + /* res_rx_no_perm is 64 b */ + u64 res_rx_range_err; + /* res_rx_range_err is 64 b */ + u64 res_tx_invalid_rkey; + /* res_tx_invalid_rkey is 64 b */ + u64 res_tx_domain_err; + /* res_tx_domain_err is 64 b */ + u64 res_tx_no_perm; + /* res_tx_no_perm is 64 b */ + u64 res_tx_range_err; + /* res_tx_range_err is 64 b */ + u64 res_irrq_oflow; + /* res_irrq_oflow is 64 b */ + u64 res_unsup_opcode; + /* res_unsup_opcode is 64 b */ + u64 res_unaligned_atomic; + /* res_unaligned_atomic is 64 b */ + u64 res_rem_inv_err; + /* res_rem_inv_err is 64 b */ + u64 res_mem_error; + /* res_mem_error is 64 b */ + u64 res_srq_err; + /* res_srq_err is 64 b */ + u64 res_cmp_err; + /* res_cmp_err is 64 b */ + u64 res_invalid_dup_rkey; + /* res_invalid_dup_rkey is 64 b */ + u64 res_wqe_format_err; + /* res_wqe_format_err is 64 b */ + u64 res_cq_load_err; + /* res_cq_load_err is 64 b */ + u64 res_srq_load_err; + /* res_srq_load_err is 64 b */ + u64 res_tx_pci_err; + /* res_tx_pci_err is 64 b */ + u64 res_rx_pci_err; + /* res_rx_pci_err is 64 b */ +}; + int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res, struct bnxt_qplib_sgid_tbl *sgid_tbl, int index, struct bnxt_qplib_gid *gid); @@ -147,7 +227,10 @@ int bnxt_qplib_add_pkey(struct bnxt_qplib_res *res, struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 *pkey, bool update); int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_dev_attr *attr); + struct bnxt_qplib_dev_attr *attr, bool vf); +int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res, + struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_ctx *ctx); int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah); int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah); int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, @@ -155,7 +238,7 @@ int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw, bool block); int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, - u64 *pbl_tbl, int num_pbls, bool block); + u64 *pbl_tbl, int num_pbls, bool block, u32 buf_pg_size); int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr); int bnxt_qplib_alloc_fast_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, int max); @@ -164,4 +247,6 @@ int bnxt_qplib_alloc_fast_reg_page_list(struct bnxt_qplib_res *res, int bnxt_qplib_free_fast_reg_page_list(struct bnxt_qplib_res *res, struct bnxt_qplib_frpl *frpl); int bnxt_qplib_map_tc2cos(struct bnxt_qplib_res *res, u16 *cids); +int bnxt_qplib_get_roce_stats(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_roce_stats *stats); #endif /* __BNXT_QPLIB_SP_H__*/ diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index c3cba6063a03..2d7ea096a247 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -954,6 +954,7 @@ struct cmdq_base { #define CMDQ_BASE_OPCODE_QUERY_VERSION 0x8bUL #define CMDQ_BASE_OPCODE_MODIFY_CC 0x8cUL #define CMDQ_BASE_OPCODE_QUERY_CC 0x8dUL + #define CMDQ_BASE_OPCODE_QUERY_ROCE_STATS 0x8eUL u8 cmd_size; __le16 flags; __le16 cookie; @@ -1383,8 +1384,20 @@ struct cmdq_register_mr { #define CMDQ_REGISTER_MR_LVL_LVL_0 0x0UL #define CMDQ_REGISTER_MR_LVL_LVL_1 0x1UL #define CMDQ_REGISTER_MR_LVL_LVL_2 0x2UL + #define CMDQ_REGISTER_MR_LVL_LAST CMDQ_REGISTER_MR_LVL_LVL_2 #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_MASK 0x7cUL #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_SFT 2 + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_4K (0xcUL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_8K (0xdUL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_64K (0x10UL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_256K (0x12UL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_1M (0x14UL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_2M (0x15UL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_4M (0x16UL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_1G (0x1eUL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_LAST \ + CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_1G + #define CMDQ_REGISTER_MR_UNUSED1 0x80UL u8 access; #define CMDQ_REGISTER_MR_ACCESS_LOCAL_WRITE 0x1UL #define CMDQ_REGISTER_MR_ACCESS_REMOTE_READ 0x2UL @@ -1392,7 +1405,21 @@ struct cmdq_register_mr { #define CMDQ_REGISTER_MR_ACCESS_REMOTE_ATOMIC 0x8UL #define CMDQ_REGISTER_MR_ACCESS_MW_BIND 0x10UL #define CMDQ_REGISTER_MR_ACCESS_ZERO_BASED 0x20UL - __le16 unused_1; + __le16 log2_pbl_pg_size; + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_MASK 0x1fUL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_SFT 0 + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_4K 0xcUL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_8K 0xdUL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_64K 0x10UL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_256K 0x12UL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1M 0x14UL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_2M 0x15UL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_4M 0x16UL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1G 0x1eUL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_LAST \ + CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1G + #define CMDQ_REGISTER_MR_UNUSED11_MASK 0xffe0UL + #define CMDQ_REGISTER_MR_UNUSED11_SFT 5 __le32 key; __le64 pbl; __le64 va; @@ -1799,6 +1826,16 @@ struct cmdq_set_func_resources { u8 resp_size; u8 reserved8; __le64 resp_addr; + __le32 number_of_qp; + __le32 number_of_mrw; + __le32 number_of_srq; + __le32 number_of_cq; + __le32 max_qp_per_vf; + __le32 max_mrw_per_vf; + __le32 max_srq_per_vf; + __le32 max_cq_per_vf; + __le32 max_gid_per_vf; + __le32 stat_ctx_id; }; /* Read hardware resource context command (24 bytes) */ @@ -2013,6 +2050,20 @@ struct creq_modify_qp_resp { __le16 reserved48[3]; }; +/* cmdq_query_roce_stats (size:128b/16B) */ +struct cmdq_query_roce_stats { + u8 opcode; + #define CMDQ_QUERY_ROCE_STATS_OPCODE_QUERY_ROCE_STATS 0x8eUL + #define CMDQ_QUERY_ROCE_STATS_OPCODE_LAST \ + CMDQ_QUERY_ROCE_STATS_OPCODE_QUERY_ROCE_STATS + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; +}; + /* Query QP command response (16 bytes) */ struct creq_query_qp_resp { u8 type; @@ -2783,6 +2834,80 @@ struct creq_query_cc_resp_sb { __le64 reserved64_1; }; +/* creq_query_roce_stats_resp (size:128b/16B) */ +struct creq_query_roce_stats_resp { + u8 type; + #define CREQ_QUERY_ROCE_STATS_RESP_TYPE_MASK 0x3fUL + #define CREQ_QUERY_ROCE_STATS_RESP_TYPE_SFT 0 + #define CREQ_QUERY_ROCE_STATS_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_QUERY_ROCE_STATS_RESP_TYPE_LAST \ + CREQ_QUERY_ROCE_STATS_RESP_TYPE_QP_EVENT + u8 status; + __le16 cookie; + __le32 size; + u8 v; + #define CREQ_QUERY_ROCE_STATS_RESP_V 0x1UL + u8 event; + #define CREQ_QUERY_ROCE_STATS_RESP_EVENT_QUERY_ROCE_STATS 0x8eUL + #define CREQ_QUERY_ROCE_STATS_RESP_EVENT_LAST \ + CREQ_QUERY_ROCE_STATS_RESP_EVENT_QUERY_ROCE_STATS + u8 reserved48[6]; +}; + +/* creq_query_roce_stats_resp_sb (size:2624b/328B) */ +struct creq_query_roce_stats_resp_sb { + u8 opcode; + #define CREQ_QUERY_ROCE_STATS_RESP_SB_OPCODE_QUERY_ROCE_STATS 0x8eUL + #define CREQ_QUERY_ROCE_STATS_RESP_SB_OPCODE_LAST \ + CREQ_QUERY_ROCE_STATS_RESP_SB_OPCODE_QUERY_ROCE_STATS + u8 status; + __le16 cookie; + __le16 flags; + u8 resp_size; + u8 rsvd; + __le32 num_counters; + __le32 rsvd1; + __le64 to_retransmits; + __le64 seq_err_naks_rcvd; + __le64 max_retry_exceeded; + __le64 rnr_naks_rcvd; + __le64 missing_resp; + __le64 unrecoverable_err; + __le64 bad_resp_err; + __le64 local_qp_op_err; + __le64 local_protection_err; + __le64 mem_mgmt_op_err; + __le64 remote_invalid_req_err; + __le64 remote_access_err; + __le64 remote_op_err; + __le64 dup_req; + __le64 res_exceed_max; + __le64 res_length_mismatch; + __le64 res_exceeds_wqe; + __le64 res_opcode_err; + __le64 res_rx_invalid_rkey; + __le64 res_rx_domain_err; + __le64 res_rx_no_perm; + __le64 res_rx_range_err; + __le64 res_tx_invalid_rkey; + __le64 res_tx_domain_err; + __le64 res_tx_no_perm; + __le64 res_tx_range_err; + __le64 res_irrq_oflow; + __le64 res_unsup_opcode; + __le64 res_unaligned_atomic; + __le64 res_rem_inv_err; + __le64 res_mem_error; + __le64 res_srq_err; + __le64 res_cmp_err; + __le64 res_invalid_dup_rkey; + __le64 res_wqe_format_err; + __le64 res_cq_load_err; + __le64 res_srq_load_err; + __le64 res_tx_pci_err; + __le64 res_rx_pci_err; +}; + /* QP error notification event (16 bytes) */ struct creq_qp_error_notification { u8 type; diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 21db3b48a617..4cf17c650c36 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -257,8 +257,8 @@ static void set_emss(struct c4iw_ep *ep, u16 opt) if (ep->emss < 128) ep->emss = 128; if (ep->emss & 7) - pr_warn("Warning: misaligned mtu idx %u mss %u emss=%u\n", - TCPOPT_MSS_G(opt), ep->mss, ep->emss); + pr_debug("Warning: misaligned mtu idx %u mss %u emss=%u\n", + TCPOPT_MSS_G(opt), ep->mss, ep->emss); pr_debug("mss_idx %u mss %u emss=%u\n", TCPOPT_MSS_G(opt), ep->mss, ep->emss); } @@ -2733,9 +2733,8 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) return 0; if (cxgb_is_neg_adv(req->status)) { - pr_warn("%s Negative advice on abort- tid %u status %d (%s)\n", - __func__, ep->hwtid, req->status, - neg_adv_str(req->status)); + pr_debug("Negative advice on abort- tid %u status %d (%s)\n", + ep->hwtid, req->status, neg_adv_str(req->status)); ep->stats.abort_neg_adv++; mutex_lock(&dev->rdev.stats.lock); dev->rdev.stats.neg_adv++; @@ -3567,8 +3566,8 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) case MORIBUND: case ABORTING: case DEAD: - pr_info("%s ignoring disconnect ep %p state %u\n", - __func__, ep, ep->com.state); + pr_debug("ignoring disconnect ep %p state %u\n", + ep, ep->com.state); break; default: WARN_ONCE(1, "Bad endpoint state %u\n", ep->com.state); @@ -4097,9 +4096,15 @@ static void process_work(struct work_struct *work) dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *))); opcode = rpl->ot.opcode; - ret = work_handlers[opcode](dev, skb); - if (!ret) + if (opcode >= ARRAY_SIZE(work_handlers) || + !work_handlers[opcode]) { + pr_err("No handler for opcode 0x%x.\n", opcode); kfree_skb(skb); + } else { + ret = work_handlers[opcode](dev, skb); + if (!ret) + kfree_skb(skb); + } process_timedout_eps(); } } @@ -4201,8 +4206,8 @@ static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb) return 0; } if (cxgb_is_neg_adv(req->status)) { - pr_warn("%s Negative advice on abort- tid %u status %d (%s)\n", - __func__, ep->hwtid, req->status, + pr_debug("Negative advice on abort- tid %u status %d (%s)\n", + ep->hwtid, req->status, neg_adv_str(req->status)); goto out; } diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index af77d128d242..7a9d0de89d6a 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -66,7 +66,7 @@ MODULE_PARM_DESC(c4iw_wr_log_size_order, static LIST_HEAD(uld_ctx_list); static DEFINE_MUTEX(dev_mutex); -struct workqueue_struct *reg_workq; +static struct workqueue_struct *reg_workq; #define DB_FC_RESUME_SIZE 64 #define DB_FC_RESUME_DELAY 1 @@ -108,19 +108,19 @@ void c4iw_log_wr_stats(struct t4_wq *wq, struct t4_cqe *cqe) idx = (atomic_inc_return(&wq->rdev->wr_log_idx) - 1) & (wq->rdev->wr_log_size - 1); le.poll_sge_ts = cxgb4_read_sge_timestamp(wq->rdev->lldi.ports[0]); - getnstimeofday(&le.poll_host_ts); + le.poll_host_time = ktime_get(); le.valid = 1; le.cqe_sge_ts = CQE_TS(cqe); if (SQ_TYPE(cqe)) { le.qid = wq->sq.qid; le.opcode = CQE_OPCODE(cqe); - le.post_host_ts = wq->sq.sw_sq[wq->sq.cidx].host_ts; + le.post_host_time = wq->sq.sw_sq[wq->sq.cidx].host_time; le.post_sge_ts = wq->sq.sw_sq[wq->sq.cidx].sge_ts; le.wr_id = CQE_WRID_SQ_IDX(cqe); } else { le.qid = wq->rq.qid; le.opcode = FW_RI_RECEIVE; - le.post_host_ts = wq->rq.sw_rq[wq->rq.cidx].host_ts; + le.post_host_time = wq->rq.sw_rq[wq->rq.cidx].host_time; le.post_sge_ts = wq->rq.sw_rq[wq->rq.cidx].sge_ts; le.wr_id = CQE_WRID_MSN(cqe); } @@ -130,9 +130,9 @@ void c4iw_log_wr_stats(struct t4_wq *wq, struct t4_cqe *cqe) static int wr_log_show(struct seq_file *seq, void *v) { struct c4iw_dev *dev = seq->private; - struct timespec prev_ts = {0, 0}; + ktime_t prev_time; struct wr_log_entry *lep; - int prev_ts_set = 0; + int prev_time_set = 0; int idx, end; #define ts2ns(ts) div64_u64((ts) * dev->rdev.lldi.cclk_ps, 1000) @@ -145,33 +145,29 @@ static int wr_log_show(struct seq_file *seq, void *v) lep = &dev->rdev.wr_log[idx]; while (idx != end) { if (lep->valid) { - if (!prev_ts_set) { - prev_ts_set = 1; - prev_ts = lep->poll_host_ts; + if (!prev_time_set) { + prev_time_set = 1; + prev_time = lep->poll_host_time; } - seq_printf(seq, "%04u: sec %lu nsec %lu qid %u opcode " - "%u %s 0x%x host_wr_delta sec %lu nsec %lu " + seq_printf(seq, "%04u: nsec %llu qid %u opcode " + "%u %s 0x%x host_wr_delta nsec %llu " "post_sge_ts 0x%llx cqe_sge_ts 0x%llx " "poll_sge_ts 0x%llx post_poll_delta_ns %llu " "cqe_poll_delta_ns %llu\n", idx, - timespec_sub(lep->poll_host_ts, - prev_ts).tv_sec, - timespec_sub(lep->poll_host_ts, - prev_ts).tv_nsec, + ktime_to_ns(ktime_sub(lep->poll_host_time, + prev_time)), lep->qid, lep->opcode, lep->opcode == FW_RI_RECEIVE ? "msn" : "wrid", lep->wr_id, - timespec_sub(lep->poll_host_ts, - lep->post_host_ts).tv_sec, - timespec_sub(lep->poll_host_ts, - lep->post_host_ts).tv_nsec, + ktime_to_ns(ktime_sub(lep->poll_host_time, + lep->post_host_time)), lep->post_sge_ts, lep->cqe_sge_ts, lep->poll_sge_ts, ts2ns(lep->poll_sge_ts - lep->post_sge_ts), ts2ns(lep->poll_sge_ts - lep->cqe_sge_ts)); - prev_ts = lep->poll_host_ts; + prev_time = lep->poll_host_time; } idx++; if (idx > (dev->rdev.wr_log_size - 1)) diff --git a/drivers/infiniband/hw/cxgb4/ev.c b/drivers/infiniband/hw/cxgb4/ev.c index a252d5c40ae3..3e9d8b277ab9 100644 --- a/drivers/infiniband/hw/cxgb4/ev.c +++ b/drivers/infiniband/hw/cxgb4/ev.c @@ -236,7 +236,7 @@ int c4iw_ev_handler(struct c4iw_dev *dev, u32 qid) if (atomic_dec_and_test(&chp->refcnt)) wake_up(&chp->wait); } else { - pr_warn("%s unknown cqid 0x%x\n", __func__, qid); + pr_debug("unknown cqid 0x%x\n", qid); spin_unlock_irqrestore(&dev->lock, flag); } return 0; diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 65dd3726ca02..cc929002c05e 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -153,8 +153,8 @@ struct c4iw_hw_queue { }; struct wr_log_entry { - struct timespec post_host_ts; - struct timespec poll_host_ts; + ktime_t post_host_time; + ktime_t poll_host_time; u64 post_sge_ts; u64 cqe_sge_ts; u64 poll_sge_ts; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index d5c92fc520d6..de77b6027d69 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1042,7 +1042,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, if (c4iw_wr_log) { swsqe->sge_ts = cxgb4_read_sge_timestamp( qhp->rhp->rdev.lldi.ports[0]); - getnstimeofday(&swsqe->host_ts); + swsqe->host_time = ktime_get(); } init_wr_hdr(wqe, qhp->wq.sq.pidx, fw_opcode, fw_flags, len16); @@ -1117,8 +1117,8 @@ int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].sge_ts = cxgb4_read_sge_timestamp( qhp->rhp->rdev.lldi.ports[0]); - getnstimeofday( - &qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].host_ts); + qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].host_time = + ktime_get(); } wqe->recv.opcode = FW_RI_RECV_WR; diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h index 79e8ee12c391..8369c7c8de83 100644 --- a/drivers/infiniband/hw/cxgb4/t4.h +++ b/drivers/infiniband/hw/cxgb4/t4.h @@ -277,7 +277,7 @@ struct t4_swsqe { int signaled; u16 idx; int flushed; - struct timespec host_ts; + ktime_t host_time; u64 sge_ts; }; @@ -318,7 +318,7 @@ struct t4_sq { struct t4_swrqe { u64 wr_id; - struct timespec host_ts; + ktime_t host_time; u64 sge_ts; }; diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 4f057e8ffe50..6660f920f42e 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -6518,11 +6518,12 @@ static void _dc_start(struct hfi1_devdata *dd) if (!dd->dc_shutdown) return; - /* - * Take the 8051 out of reset, wait until 8051 is ready, and set host - * version bit. - */ - release_and_wait_ready_8051_firmware(dd); + /* Take the 8051 out of reset */ + write_csr(dd, DC_DC8051_CFG_RST, 0ull); + /* Wait until 8051 is ready */ + if (wait_fm_ready(dd, TIMEOUT_8051_START)) + dd_dev_err(dd, "%s: timeout starting 8051 firmware\n", + __func__); /* Take away reset for LCB and RX FPE (set in lcb_shutdown). */ write_csr(dd, DCC_CFG_RESET, 0x10); @@ -8564,23 +8565,27 @@ int write_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 data) } /* - * If the 8051 is in reset mode (dd->dc_shutdown == 1), this function - * will still continue executing. - * * Returns: * < 0 = Linux error, not able to get access * > 0 = 8051 command RETURN_CODE */ -static int _do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data, - u64 *out_data) +static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data, + u64 *out_data) { u64 reg, completed; int return_code; unsigned long timeout; - lockdep_assert_held(&dd->dc8051_lock); hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data); + mutex_lock(&dd->dc8051_lock); + + /* We can't send any commands to the 8051 if it's in reset */ + if (dd->dc_shutdown) { + return_code = -ENODEV; + goto fail; + } + /* * If an 8051 host command timed out previously, then the 8051 is * stuck. @@ -8681,29 +8686,6 @@ static int _do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data, write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0); fail: - return return_code; -} - -/* - * Returns: - * < 0 = Linux error, not able to get access - * > 0 = 8051 command RETURN_CODE - */ -static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data, - u64 *out_data) -{ - int return_code; - - mutex_lock(&dd->dc8051_lock); - /* We can't send any commands to the 8051 if it's in reset */ - if (dd->dc_shutdown) { - return_code = -ENODEV; - goto fail; - } - - return_code = _do_8051_command(dd, type, in_data, out_data); - -fail: mutex_unlock(&dd->dc8051_lock); return return_code; } @@ -8713,17 +8695,16 @@ static int set_physical_link_state(struct hfi1_devdata *dd, u64 state) return do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL); } -static int _load_8051_config(struct hfi1_devdata *dd, u8 field_id, - u8 lane_id, u32 config_data) +int load_8051_config(struct hfi1_devdata *dd, u8 field_id, + u8 lane_id, u32 config_data) { u64 data; int ret; - lockdep_assert_held(&dd->dc8051_lock); data = (u64)field_id << LOAD_DATA_FIELD_ID_SHIFT | (u64)lane_id << LOAD_DATA_LANE_ID_SHIFT | (u64)config_data << LOAD_DATA_DATA_SHIFT; - ret = _do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL); + ret = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL); if (ret != HCMD_SUCCESS) { dd_dev_err(dd, "load 8051 config: field id %d, lane %d, err %d\n", @@ -8732,18 +8713,6 @@ static int _load_8051_config(struct hfi1_devdata *dd, u8 field_id, return ret; } -int load_8051_config(struct hfi1_devdata *dd, u8 field_id, - u8 lane_id, u32 config_data) -{ - int return_code; - - mutex_lock(&dd->dc8051_lock); - return_code = _load_8051_config(dd, field_id, lane_id, config_data); - mutex_unlock(&dd->dc8051_lock); - - return return_code; -} - /* * Read the 8051 firmware "registers". Use the RAM directly. Always * set the result, even on error. @@ -8859,14 +8828,13 @@ int write_host_interface_version(struct hfi1_devdata *dd, u8 version) u32 frame; u32 mask; - lockdep_assert_held(&dd->dc8051_lock); mask = (HOST_INTERFACE_VERSION_MASK << HOST_INTERFACE_VERSION_SHIFT); read_8051_config(dd, RESERVED_REGISTERS, GENERAL_CONFIG, &frame); /* Clear, then set field */ frame &= ~mask; frame |= ((u32)version << HOST_INTERFACE_VERSION_SHIFT); - return _load_8051_config(dd, RESERVED_REGISTERS, GENERAL_CONFIG, - frame); + return load_8051_config(dd, RESERVED_REGISTERS, GENERAL_CONFIG, + frame); } void read_misc_status(struct hfi1_devdata *dd, u8 *ver_major, u8 *ver_minor, @@ -9270,6 +9238,14 @@ static int set_local_link_attributes(struct hfi1_pportdata *ppd) if (ret != HCMD_SUCCESS) goto set_local_link_attributes_fail; + ret = write_host_interface_version(dd, HOST_INTERFACE_VERSION); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to set host interface version, return 0x%x\n", + ret); + goto set_local_link_attributes_fail; + } + /* * DC supports continuous updates. */ @@ -14944,9 +14920,8 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, if (num_vls < HFI1_MIN_VLS_SUPPORTED || num_vls > HFI1_MAX_VLS_SUPPORTED) { - hfi1_early_err(&pdev->dev, - "Invalid num_vls %u, using %u VLs\n", - num_vls, HFI1_MAX_VLS_SUPPORTED); + dd_dev_err(dd, "Invalid num_vls %u, using %u VLs\n", + num_vls, HFI1_MAX_VLS_SUPPORTED); num_vls = HFI1_MAX_VLS_SUPPORTED; } ppd->vls_supported = num_vls; diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 133e313feca4..21fca8ec5076 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -508,6 +508,7 @@ #define DOWN_REMOTE_REASON_SHIFT 16 #define DOWN_REMOTE_REASON_MASK 0xff +#define HOST_INTERFACE_VERSION 1 #define HOST_INTERFACE_VERSION_SHIFT 16 #define HOST_INTERFACE_VERSION_MASK 0xff @@ -713,7 +714,6 @@ void read_misc_status(struct hfi1_devdata *dd, u8 *ver_major, u8 *ver_minor, u8 *ver_patch); int write_host_interface_version(struct hfi1_devdata *dd, u8 version); void read_guid(struct hfi1_devdata *dd); -int release_and_wait_ready_8051_firmware(struct hfi1_devdata *dd); int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout); void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason, u8 neigh_reason, u8 rem_reason); diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 4f65ac671044..067b29f35f21 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -159,22 +159,6 @@ static int hfi1_caps_get(char *buffer, const struct kernel_param *kp) return scnprintf(buffer, PAGE_SIZE, "0x%lx", cap_mask); } -const char *get_unit_name(int unit) -{ - static char iname[16]; - - snprintf(iname, sizeof(iname), DRIVER_NAME "_%u", unit); - return iname; -} - -const char *get_card_name(struct rvt_dev_info *rdi) -{ - struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi); - struct hfi1_devdata *dd = container_of(ibdev, - struct hfi1_devdata, verbs_dev); - return get_unit_name(dd->unit); -} - struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi) { struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi); diff --git a/drivers/infiniband/hw/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c index 98868df78a7e..2b57ba70ddd6 100644 --- a/drivers/infiniband/hw/hfi1/firmware.c +++ b/drivers/infiniband/hw/hfi1/firmware.c @@ -68,7 +68,6 @@ #define ALT_FW_FABRIC_NAME "hfi1_fabric_d.fw" #define ALT_FW_SBUS_NAME "hfi1_sbus_d.fw" #define ALT_FW_PCIE_NAME "hfi1_pcie_d.fw" -#define HOST_INTERFACE_VERSION 1 MODULE_FIRMWARE(DEFAULT_FW_8051_NAME_ASIC); MODULE_FIRMWARE(DEFAULT_FW_FABRIC_NAME); @@ -976,46 +975,6 @@ int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout) } /* - * Clear all reset bits, releasing the 8051. - * Wait for firmware to be ready to accept host requests. - * Then, set host version bit. - * - * This function executes even if the 8051 is in reset mode when - * dd->dc_shutdown == 1. - * - * Expects dd->dc8051_lock to be held. - */ -int release_and_wait_ready_8051_firmware(struct hfi1_devdata *dd) -{ - int ret; - - lockdep_assert_held(&dd->dc8051_lock); - /* clear all reset bits, releasing the 8051 */ - write_csr(dd, DC_DC8051_CFG_RST, 0ull); - - /* - * Wait for firmware to be ready to accept host - * requests. - */ - ret = wait_fm_ready(dd, TIMEOUT_8051_START); - if (ret) { - dd_dev_err(dd, "8051 start timeout, current FW state 0x%x\n", - get_firmware_state(dd)); - return ret; - } - - ret = write_host_interface_version(dd, HOST_INTERFACE_VERSION); - if (ret != HCMD_SUCCESS) { - dd_dev_err(dd, - "Failed to set host interface version, return 0x%x\n", - ret); - return -EIO; - } - - return 0; -} - -/* * Load the 8051 firmware. */ static int load_8051_firmware(struct hfi1_devdata *dd, @@ -1080,22 +1039,31 @@ static int load_8051_firmware(struct hfi1_devdata *dd, if (ret) return ret; + /* clear all reset bits, releasing the 8051 */ + write_csr(dd, DC_DC8051_CFG_RST, 0ull); + /* - * Clear all reset bits, releasing the 8051. * DC reset step 5. Wait for firmware to be ready to accept host * requests. - * Then, set host version bit. */ - mutex_lock(&dd->dc8051_lock); - ret = release_and_wait_ready_8051_firmware(dd); - mutex_unlock(&dd->dc8051_lock); - if (ret) - return ret; + ret = wait_fm_ready(dd, TIMEOUT_8051_START); + if (ret) { /* timed out */ + dd_dev_err(dd, "8051 start timeout, current state 0x%x\n", + get_firmware_state(dd)); + return -ETIMEDOUT; + } read_misc_status(dd, &ver_major, &ver_minor, &ver_patch); dd_dev_info(dd, "8051 firmware version %d.%d.%d\n", (int)ver_major, (int)ver_minor, (int)ver_patch); dd->dc8051_ver = dc8051_ver(ver_major, ver_minor, ver_patch); + ret = write_host_interface_version(dd, HOST_INTERFACE_VERSION); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to set host interface version, return 0x%x\n", + ret); + return -EIO; + } return 0; } diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 8ce9118d4a7f..b42c22292597 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1623,7 +1623,7 @@ static int ingress_pkey_table_search(struct hfi1_pportdata *ppd, u16 pkey) * the 'error info' for this failure. */ static void ingress_pkey_table_fail(struct hfi1_pportdata *ppd, u16 pkey, - u16 slid) + u32 slid) { struct hfi1_devdata *dd = ppd->dd; @@ -1971,8 +1971,6 @@ int get_platform_config_field(struct hfi1_devdata *dd, table_type, int table_index, int field_index, u32 *data, u32 len); -const char *get_unit_name(int unit); -const char *get_card_name(struct rvt_dev_info *rdi); struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi); /* @@ -2122,39 +2120,42 @@ static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd) #define dd_dev_emerg(dd, fmt, ...) \ dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \ - get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) #define dd_dev_err(dd, fmt, ...) \ dev_err(&(dd)->pcidev->dev, "%s: " fmt, \ - get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) #define dd_dev_err_ratelimited(dd, fmt, ...) \ dev_err_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \ - get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), \ + ##__VA_ARGS__) #define dd_dev_warn(dd, fmt, ...) \ dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \ - get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) #define dd_dev_warn_ratelimited(dd, fmt, ...) \ dev_warn_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \ - get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), \ + ##__VA_ARGS__) #define dd_dev_info(dd, fmt, ...) \ dev_info(&(dd)->pcidev->dev, "%s: " fmt, \ - get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) #define dd_dev_info_ratelimited(dd, fmt, ...) \ dev_info_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \ - get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), \ + ##__VA_ARGS__) #define dd_dev_dbg(dd, fmt, ...) \ dev_dbg(&(dd)->pcidev->dev, "%s: " fmt, \ - get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) #define hfi1_dev_porterr(dd, port, fmt, ...) \ dev_err(&(dd)->pcidev->dev, "%s: port %u: " fmt, \ - get_unit_name((dd)->unit), (port), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), (port), ##__VA_ARGS__) /* * this is used for formatting hw error messages... diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 8e3b3e7d829a..9b128268fb28 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1272,6 +1272,8 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) "Could not allocate unit ID: error %d\n", -ret); goto bail; } + rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit); + /* * Initialize all locks for the device. This needs to be as early as * possible so locks are usable. diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index cf8dba34fe30..34547a48a445 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -4348,11 +4348,7 @@ static int opa_local_smp_check(struct hfi1_ibport *ibp, */ if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) return 0; - /* - * On OPA devices it is okay to lose the upper 16 bits of LID as this - * information is obtained elsewhere. Mask off the upper 16 bits. - */ - ingress_pkey_table_fail(ppd, pkey, ib_lid_cpu16(0xFFFF & in_wc->slid)); + ingress_pkey_table_fail(ppd, pkey, in_wc->slid); return 1; } diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 4b01ccd895b4..5507910e8b8a 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -556,6 +556,8 @@ void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter) struct sdma_engine *sde; struct send_context *send_context; struct rvt_ack_entry *e = NULL; + struct rvt_srq *srq = qp->ibqp.srq ? + ibsrq_to_rvtsrq(qp->ibqp.srq) : NULL; sde = qp_to_sdma_engine(qp, priv->s_sc); wqe = rvt_get_swqe_ptr(qp, qp->s_last); @@ -563,7 +565,7 @@ void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter) if (qp->s_ack_queue) e = &qp->s_ack_queue[qp->s_tail_ack_queue]; seq_printf(s, - "N %d %s QP %x R %u %s %u %u %u f=%x %u %u %u %u %u %u SPSN %x %x %x %x %x RPSN %x S(%u %u %u %u %u %u %u) R(%u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d OS %x %x E %x %x %x\n", + "N %d %s QP %x R %u %s %u %u %u f=%x %u %u %u %u %u %u SPSN %x %x %x %x %x RPSN %x S(%u %u %u %u %u %u %u) R(%u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d OS %x %x E %x %x %x RNR %d %s %d\n", iter->n, qp_idle(qp) ? "I" : "B", qp->ibqp.qp_num, @@ -610,7 +612,11 @@ void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter) /* ack queue information */ e ? e->opcode : 0, e ? e->psn : 0, - e ? e->lpsn : 0); + e ? e->lpsn : 0, + qp->r_min_rnr_timer, + srq ? "SRQ" : "RQ", + srq ? srq->rq.size : qp->r_rq.size + ); } void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp) diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 7eb5d50578ba..14cc212a21c7 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -841,11 +841,11 @@ static inline void hfi1_make_rc_ack_16B(struct rvt_qp *qp, /* Convert dwords to flits */ len = (*hwords + *nwords) >> 1; - hfi1_make_16b_hdr(hdr, - ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr), + hfi1_make_16b_hdr(hdr, ppd->lid | + (rdma_ah_get_path_bits(&qp->remote_ah_attr) & + ((1 << ppd->lmc) - 1)), opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), - 16B), - len, pkey, becn, 0, l4, sc5); + 16B), len, pkey, becn, 0, l4, sc5); bth0 = pkey | (OP(ACKNOWLEDGE) << 24); bth0 |= extra_bytes << 20; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index a38785e224cc..b8776a362a91 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1486,7 +1486,7 @@ static int query_port(struct rvt_dev_info *rdi, u8 port_num, props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ? 4096 : hfi1_max_mtu), IB_MTU_4096); props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu : - mtu_to_enum(ppd->ibmtu, IB_MTU_2048); + mtu_to_enum(ppd->ibmtu, IB_MTU_4096); /* * sm_lid of 0xFFFF needs special handling so that it can @@ -1844,7 +1844,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) struct hfi1_ibport *ibp = &ppd->ibport_data; unsigned i; int ret; - size_t lcpysz = IB_DEVICE_NAME_MAX; for (i = 0; i < dd->num_pports; i++) init_ibport(ppd + i); @@ -1872,8 +1871,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) */ if (!ib_hfi1_sys_image_guid) ib_hfi1_sys_image_guid = ibdev->node_guid; - lcpysz = strlcpy(ibdev->name, class_name(), lcpysz); - strlcpy(ibdev->name + lcpysz, "_%d", IB_DEVICE_NAME_MAX - lcpysz); ibdev->owner = THIS_MODULE; ibdev->phys_port_cnt = dd->num_pports; ibdev->dev.parent = &dd->pcidev->dev; @@ -1893,7 +1890,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) * Fill in rvt info object. */ dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files; - dd->verbs_dev.rdi.driver_f.get_card_name = get_card_name; dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev; dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah; dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah; diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile index ff426a625e13..97bf2cd1cacb 100644 --- a/drivers/infiniband/hw/hns/Makefile +++ b/drivers/infiniband/hw/hns/Makefile @@ -5,7 +5,7 @@ ccflags-y := -Idrivers/net/ethernet/hisilicon/hns3 obj-$(CONFIG_INFINIBAND_HNS) += hns-roce.o -hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_eq.o hns_roce_pd.o \ +hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \ hns_roce_cq.o hns_roce_alloc.o obj-$(CONFIG_INFINIBAND_HNS_HIP06) += hns-roce-hw-v1.o diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.c b/drivers/infiniband/hw/hns/hns_roce_cmd.c index 1085cb249bc1..9ebe839d8b24 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.c +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.c @@ -103,6 +103,7 @@ void hns_roce_cmd_event(struct hns_roce_dev *hr_dev, u16 token, u8 status, context->out_param = out_param; complete(&context->done); } +EXPORT_SYMBOL_GPL(hns_roce_cmd_event); /* this should be called with "use_events" */ static int __hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.h b/drivers/infiniband/hw/hns/hns_roce_cmd.h index b1c94223c28b..9549ae51a0dd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.h +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.h @@ -88,6 +88,16 @@ enum { HNS_ROCE_CMD_DESTROY_SRQC_BT0 = 0x38, HNS_ROCE_CMD_DESTROY_SRQC_BT1 = 0x39, HNS_ROCE_CMD_DESTROY_SRQC_BT2 = 0x3a, + + /* EQC commands */ + HNS_ROCE_CMD_CREATE_AEQC = 0x80, + HNS_ROCE_CMD_MODIFY_AEQC = 0x81, + HNS_ROCE_CMD_QUERY_AEQC = 0x82, + HNS_ROCE_CMD_DESTROY_AEQC = 0x83, + HNS_ROCE_CMD_CREATE_CEQC = 0x90, + HNS_ROCE_CMD_MODIFY_CEQC = 0x91, + HNS_ROCE_CMD_QUERY_CEQC = 0x92, + HNS_ROCE_CMD_DESTROY_CEQC = 0x93, }; enum { diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h index 7ecb7a4147a8..dd67fafd0c40 100644 --- a/drivers/infiniband/hw/hns/hns_roce_common.h +++ b/drivers/infiniband/hw/hns/hns_roce_common.h @@ -376,6 +376,12 @@ #define ROCEE_RX_CMQ_TAIL_REG 0x07024 #define ROCEE_RX_CMQ_HEAD_REG 0x07028 +#define ROCEE_VF_MB_CFG0_REG 0x40 +#define ROCEE_VF_MB_STATUS_REG 0x58 + +#define ROCEE_VF_EQ_DB_CFG0_REG 0x238 +#define ROCEE_VF_EQ_DB_CFG1_REG 0x23C + #define ROCEE_VF_SMAC_CFG0_REG 0x12000 #define ROCEE_VF_SMAC_CFG1_REG 0x12004 @@ -385,4 +391,9 @@ #define ROCEE_VF_SGID_CFG3_REG 0x1000c #define ROCEE_VF_SGID_CFG4_REG 0x10010 +#define ROCEE_VF_ABN_INT_CFG_REG 0x13000 +#define ROCEE_VF_ABN_INT_ST_REG 0x13004 +#define ROCEE_VF_ABN_INT_EN_REG 0x13008 +#define ROCEE_VF_EVENT_INT_EN_REG 0x1300c + #endif /* _HNS_ROCE_COMMON_H */ diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 2111b57a3489..bccc9b54c9ce 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -196,15 +196,14 @@ void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) if (ret) dev_err(dev, "HW2SW_CQ failed (%d) for CQN %06lx\n", ret, hr_cq->cqn); - if (hr_dev->eq_table.eq) { - /* Waiting interrupt process procedure carried out */ - synchronize_irq(hr_dev->eq_table.eq[hr_cq->vector].irq); - - /* wait for all interrupt processed */ - if (atomic_dec_and_test(&hr_cq->refcount)) - complete(&hr_cq->free); - wait_for_completion(&hr_cq->free); - } + + /* Waiting interrupt process procedure carried out */ + synchronize_irq(hr_dev->eq_table.eq[hr_cq->vector].irq); + + /* wait for all interrupt processed */ + if (atomic_dec_and_test(&hr_cq->refcount)) + complete(&hr_cq->free); + wait_for_completion(&hr_cq->free); spin_lock_irq(&cq_table->lock); radix_tree_delete(&cq_table->tree, hr_cq->cqn); @@ -460,6 +459,7 @@ void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn) ++cq->arm_sn; cq->comp(cq); } +EXPORT_SYMBOL_GPL(hns_roce_cq_completion); void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type) { @@ -482,6 +482,7 @@ void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type) if (atomic_dec_and_test(&cq->refcount)) complete(&cq->free); } +EXPORT_SYMBOL_GPL(hns_roce_cq_event); int hns_roce_init_cq_table(struct hns_roce_dev *hr_dev) { diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index b154ce40cded..42c3b5a2d441 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -62,12 +62,16 @@ #define HNS_ROCE_CQE_WCMD_EMPTY_BIT 0x2 #define HNS_ROCE_MIN_CQE_CNT 16 -#define HNS_ROCE_MAX_IRQ_NUM 34 +#define HNS_ROCE_MAX_IRQ_NUM 128 -#define HNS_ROCE_COMP_VEC_NUM 32 +#define EQ_ENABLE 1 +#define EQ_DISABLE 0 -#define HNS_ROCE_AEQE_VEC_NUM 1 -#define HNS_ROCE_AEQE_OF_VEC_NUM 1 +#define HNS_ROCE_CEQ 0 +#define HNS_ROCE_AEQ 1 + +#define HNS_ROCE_CEQ_ENTRY_SIZE 0x4 +#define HNS_ROCE_AEQ_ENTRY_SIZE 0x10 /* 4G/4K = 1M */ #define HNS_ROCE_SL_SHIFT 28 @@ -130,6 +134,7 @@ enum hns_roce_event { HNS_ROCE_EVENT_TYPE_DB_OVERFLOW = 0x12, HNS_ROCE_EVENT_TYPE_MB = 0x13, HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW = 0x14, + HNS_ROCE_EVENT_TYPE_FLR = 0x15, }; /* Local Work Queue Catastrophic Error,SUBTYPE 0x5 */ @@ -173,6 +178,7 @@ enum { enum { HNS_ROCE_CAP_FLAG_REREG_MR = BIT(0), HNS_ROCE_CAP_FLAG_ROCE_V1_V2 = BIT(1), + HNS_ROCE_CAP_FLAG_RQ_INLINE = BIT(2) }; enum hns_roce_mtt_type { @@ -441,6 +447,21 @@ struct hns_roce_cmd_mailbox { struct hns_roce_dev; +struct hns_roce_rinl_sge { + void *addr; + u32 len; +}; + +struct hns_roce_rinl_wqe { + struct hns_roce_rinl_sge *sg_list; + u32 sge_cnt; +}; + +struct hns_roce_rinl_buf { + struct hns_roce_rinl_wqe *wqe_list; + u32 wqe_cnt; +}; + struct hns_roce_qp { struct ib_qp ibqp; struct hns_roce_buf hr_buf; @@ -462,7 +483,9 @@ struct hns_roce_qp { u8 resp_depth; u8 state; u32 access_flags; + u32 atomic_rd_en; u32 pkey_index; + u32 qkey; void (*event)(struct hns_roce_qp *, enum hns_roce_event); unsigned long qpn; @@ -472,6 +495,8 @@ struct hns_roce_qp { struct hns_roce_sge sge; u32 next_sge; + + struct hns_roce_rinl_buf rq_inl_buf; }; struct hns_roce_sqp { @@ -485,6 +510,45 @@ struct hns_roce_ib_iboe { u8 phy_port[HNS_ROCE_MAX_PORTS]; }; +enum { + HNS_ROCE_EQ_STAT_INVALID = 0, + HNS_ROCE_EQ_STAT_VALID = 2, +}; + +struct hns_roce_ceqe { + u32 comp; +}; + +struct hns_roce_aeqe { + u32 asyn; + union { + struct { + u32 qp; + u32 rsv0; + u32 rsv1; + } qp_event; + + struct { + u32 cq; + u32 rsv0; + u32 rsv1; + } cq_event; + + struct { + u32 ceqe; + u32 rsv0; + u32 rsv1; + } ce_event; + + struct { + __le64 out_param; + __le16 token; + u8 status; + u8 rsv0; + } __packed cmd; + } event; +}; + struct hns_roce_eq { struct hns_roce_dev *hr_dev; void __iomem *doorbell; @@ -498,11 +562,31 @@ struct hns_roce_eq { int log_page_size; int cons_index; struct hns_roce_buf_list *buf_list; + int over_ignore; + int coalesce; + int arm_st; + u64 eqe_ba; + int eqe_ba_pg_sz; + int eqe_buf_pg_sz; + int hop_num; + u64 *bt_l0; /* Base address table for L0 */ + u64 **bt_l1; /* Base address table for L1 */ + u64 **buf; + dma_addr_t l0_dma; + dma_addr_t *l1_dma; + dma_addr_t *buf_dma; + u32 l0_last_num; /* L0 last chunk num */ + u32 l1_last_num; /* L1 last chunk num */ + int eq_max_cnt; + int eq_period; + int shift; + dma_addr_t cur_eqe_ba; + dma_addr_t nxt_eqe_ba; }; struct hns_roce_eq_table { struct hns_roce_eq *eq; - void __iomem **eqc_base; + void __iomem **eqc_base; /* only for hw v1 */ }; struct hns_roce_caps { @@ -528,7 +612,7 @@ struct hns_roce_caps { u32 min_wqes; int reserved_cqs; int num_aeq_vectors; /* 1 */ - int num_comp_vectors; /* 32 ceq */ + int num_comp_vectors; int num_other_vectors; int num_mtpts; u32 num_mtt_segs; @@ -550,7 +634,7 @@ struct hns_roce_caps { u32 pbl_buf_pg_sz; u32 pbl_hop_num; int aeqe_depth; - int ceqe_depth[HNS_ROCE_COMP_VEC_NUM]; + int ceqe_depth; enum ib_mtu max_mtu; u32 qpc_bt_num; u32 srqc_bt_num; @@ -574,6 +658,9 @@ struct hns_roce_caps { u32 cqe_ba_pg_sz; u32 cqe_buf_pg_sz; u32 cqe_hop_num; + u32 eqe_ba_pg_sz; + u32 eqe_buf_pg_sz; + u32 eqe_hop_num; u32 chunk_sz; /* chunk size in non multihop mode*/ u64 flags; }; @@ -623,6 +710,8 @@ struct hns_roce_hw { int (*dereg_mr)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr); int (*destroy_cq)(struct ib_cq *ibcq); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); + int (*init_eq)(struct hns_roce_dev *hr_dev); + void (*cleanup_eq)(struct hns_roce_dev *hr_dev); }; struct hns_roce_dev { diff --git a/drivers/infiniband/hw/hns/hns_roce_eq.c b/drivers/infiniband/hw/hns/hns_roce_eq.c deleted file mode 100644 index d184431e2bf5..000000000000 --- a/drivers/infiniband/hw/hns/hns_roce_eq.c +++ /dev/null @@ -1,759 +0,0 @@ -/* - * Copyright (c) 2016 Hisilicon Limited. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include <linux/platform_device.h> -#include <linux/interrupt.h> -#include "hns_roce_common.h" -#include "hns_roce_device.h" -#include "hns_roce_eq.h" - -static void eq_set_cons_index(struct hns_roce_eq *eq, int req_not) -{ - roce_raw_write((eq->cons_index & CONS_INDEX_MASK) | - (req_not << eq->log_entries), eq->doorbell); - /* Memory barrier */ - mb(); -} - -static struct hns_roce_aeqe *get_aeqe(struct hns_roce_eq *eq, u32 entry) -{ - unsigned long off = (entry & (eq->entries - 1)) * - HNS_ROCE_AEQ_ENTRY_SIZE; - - return (struct hns_roce_aeqe *)((u8 *) - (eq->buf_list[off / HNS_ROCE_BA_SIZE].buf) + - off % HNS_ROCE_BA_SIZE); -} - -static struct hns_roce_aeqe *next_aeqe_sw(struct hns_roce_eq *eq) -{ - struct hns_roce_aeqe *aeqe = get_aeqe(eq, eq->cons_index); - - return (roce_get_bit(aeqe->asyn, HNS_ROCE_AEQE_U32_4_OWNER_S) ^ - !!(eq->cons_index & eq->entries)) ? aeqe : NULL; -} - -static void hns_roce_wq_catas_err_handle(struct hns_roce_dev *hr_dev, - struct hns_roce_aeqe *aeqe, int qpn) -{ - struct device *dev = &hr_dev->pdev->dev; - - dev_warn(dev, "Local Work Queue Catastrophic Error.\n"); - switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M, - HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) { - case HNS_ROCE_LWQCE_QPC_ERROR: - dev_warn(dev, "QP %d, QPC error.\n", qpn); - break; - case HNS_ROCE_LWQCE_MTU_ERROR: - dev_warn(dev, "QP %d, MTU error.\n", qpn); - break; - case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR: - dev_warn(dev, "QP %d, WQE BA addr error.\n", qpn); - break; - case HNS_ROCE_LWQCE_WQE_ADDR_ERROR: - dev_warn(dev, "QP %d, WQE addr error.\n", qpn); - break; - case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR: - dev_warn(dev, "QP %d, WQE shift error\n", qpn); - break; - case HNS_ROCE_LWQCE_SL_ERROR: - dev_warn(dev, "QP %d, SL error.\n", qpn); - break; - case HNS_ROCE_LWQCE_PORT_ERROR: - dev_warn(dev, "QP %d, port error.\n", qpn); - break; - default: - break; - } -} - -static void hns_roce_local_wq_access_err_handle(struct hns_roce_dev *hr_dev, - struct hns_roce_aeqe *aeqe, - int qpn) -{ - struct device *dev = &hr_dev->pdev->dev; - - dev_warn(dev, "Local Access Violation Work Queue Error.\n"); - switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M, - HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) { - case HNS_ROCE_LAVWQE_R_KEY_VIOLATION: - dev_warn(dev, "QP %d, R_key violation.\n", qpn); - break; - case HNS_ROCE_LAVWQE_LENGTH_ERROR: - dev_warn(dev, "QP %d, length error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_VA_ERROR: - dev_warn(dev, "QP %d, VA error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_PD_ERROR: - dev_err(dev, "QP %d, PD error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_RW_ACC_ERROR: - dev_warn(dev, "QP %d, rw acc error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_KEY_STATE_ERROR: - dev_warn(dev, "QP %d, key state error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR: - dev_warn(dev, "QP %d, MR operation error.\n", qpn); - break; - default: - break; - } -} - -static void hns_roce_qp_err_handle(struct hns_roce_dev *hr_dev, - struct hns_roce_aeqe *aeqe, - int event_type) -{ - struct device *dev = &hr_dev->pdev->dev; - int phy_port; - int qpn; - - qpn = roce_get_field(aeqe->event.qp_event.qp, - HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_M, - HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_S); - phy_port = roce_get_field(aeqe->event.qp_event.qp, - HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_M, - HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_S); - if (qpn <= 1) - qpn = HNS_ROCE_MAX_PORTS * qpn + phy_port; - - switch (event_type) { - case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: - dev_warn(dev, "Invalid Req Local Work Queue Error.\n" - "QP %d, phy_port %d.\n", qpn, phy_port); - break; - case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: - hns_roce_wq_catas_err_handle(hr_dev, aeqe, qpn); - break; - case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: - hns_roce_local_wq_access_err_handle(hr_dev, aeqe, qpn); - break; - default: - break; - } - - hns_roce_qp_event(hr_dev, qpn, event_type); -} - -static void hns_roce_cq_err_handle(struct hns_roce_dev *hr_dev, - struct hns_roce_aeqe *aeqe, - int event_type) -{ - struct device *dev = &hr_dev->pdev->dev; - u32 cqn; - - cqn = le32_to_cpu(roce_get_field(aeqe->event.cq_event.cq, - HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_M, - HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S)); - - switch (event_type) { - case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: - dev_warn(dev, "CQ 0x%x access err.\n", cqn); - break; - case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: - dev_warn(dev, "CQ 0x%x overflow\n", cqn); - break; - case HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID: - dev_warn(dev, "CQ 0x%x ID invalid.\n", cqn); - break; - default: - break; - } - - hns_roce_cq_event(hr_dev, cqn, event_type); -} - -static void hns_roce_db_overflow_handle(struct hns_roce_dev *hr_dev, - struct hns_roce_aeqe *aeqe) -{ - struct device *dev = &hr_dev->pdev->dev; - - switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M, - HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) { - case HNS_ROCE_DB_SUBTYPE_SDB_OVF: - dev_warn(dev, "SDB overflow.\n"); - break; - case HNS_ROCE_DB_SUBTYPE_SDB_ALM_OVF: - dev_warn(dev, "SDB almost overflow.\n"); - break; - case HNS_ROCE_DB_SUBTYPE_SDB_ALM_EMP: - dev_warn(dev, "SDB almost empty.\n"); - break; - case HNS_ROCE_DB_SUBTYPE_ODB_OVF: - dev_warn(dev, "ODB overflow.\n"); - break; - case HNS_ROCE_DB_SUBTYPE_ODB_ALM_OVF: - dev_warn(dev, "ODB almost overflow.\n"); - break; - case HNS_ROCE_DB_SUBTYPE_ODB_ALM_EMP: - dev_warn(dev, "SDB almost empty.\n"); - break; - default: - break; - } -} - -static int hns_roce_aeq_int(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) -{ - struct device *dev = &hr_dev->pdev->dev; - struct hns_roce_aeqe *aeqe; - int aeqes_found = 0; - int event_type; - - while ((aeqe = next_aeqe_sw(eq))) { - dev_dbg(dev, "aeqe = %p, aeqe->asyn.event_type = 0x%lx\n", aeqe, - roce_get_field(aeqe->asyn, - HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M, - HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S)); - /* Memory barrier */ - rmb(); - - event_type = roce_get_field(aeqe->asyn, - HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M, - HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S); - switch (event_type) { - case HNS_ROCE_EVENT_TYPE_PATH_MIG: - dev_warn(dev, "PATH MIG not supported\n"); - break; - case HNS_ROCE_EVENT_TYPE_COMM_EST: - dev_warn(dev, "COMMUNICATION established\n"); - break; - case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: - dev_warn(dev, "SQ DRAINED not supported\n"); - break; - case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: - dev_warn(dev, "PATH MIG failed\n"); - break; - case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: - case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: - case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: - hns_roce_qp_err_handle(hr_dev, aeqe, event_type); - break; - case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: - case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: - case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: - dev_warn(dev, "SRQ not support!\n"); - break; - case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: - case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: - case HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID: - hns_roce_cq_err_handle(hr_dev, aeqe, event_type); - break; - case HNS_ROCE_EVENT_TYPE_PORT_CHANGE: - dev_warn(dev, "port change.\n"); - break; - case HNS_ROCE_EVENT_TYPE_MB: - hns_roce_cmd_event(hr_dev, - le16_to_cpu(aeqe->event.cmd.token), - aeqe->event.cmd.status, - le64_to_cpu(aeqe->event.cmd.out_param - )); - break; - case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: - hns_roce_db_overflow_handle(hr_dev, aeqe); - break; - case HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW: - dev_warn(dev, "CEQ 0x%lx overflow.\n", - roce_get_field(aeqe->event.ce_event.ceqe, - HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_M, - HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_S)); - break; - default: - dev_warn(dev, "Unhandled event %d on EQ %d at index %u\n", - event_type, eq->eqn, eq->cons_index); - break; - } - - eq->cons_index++; - aeqes_found = 1; - - if (eq->cons_index > 2 * hr_dev->caps.aeqe_depth - 1) { - dev_warn(dev, "cons_index overflow, set back to zero\n" - ); - eq->cons_index = 0; - } - } - - eq_set_cons_index(eq, 0); - - return aeqes_found; -} - -static struct hns_roce_ceqe *get_ceqe(struct hns_roce_eq *eq, u32 entry) -{ - unsigned long off = (entry & (eq->entries - 1)) * - HNS_ROCE_CEQ_ENTRY_SIZE; - - return (struct hns_roce_ceqe *)((u8 *) - (eq->buf_list[off / HNS_ROCE_BA_SIZE].buf) + - off % HNS_ROCE_BA_SIZE); -} - -static struct hns_roce_ceqe *next_ceqe_sw(struct hns_roce_eq *eq) -{ - struct hns_roce_ceqe *ceqe = get_ceqe(eq, eq->cons_index); - - return (!!(roce_get_bit(ceqe->ceqe.comp, - HNS_ROCE_CEQE_CEQE_COMP_OWNER_S))) ^ - (!!(eq->cons_index & eq->entries)) ? ceqe : NULL; -} - -static int hns_roce_ceq_int(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) -{ - struct hns_roce_ceqe *ceqe; - int ceqes_found = 0; - u32 cqn; - - while ((ceqe = next_ceqe_sw(eq))) { - /* Memory barrier */ - rmb(); - cqn = roce_get_field(ceqe->ceqe.comp, - HNS_ROCE_CEQE_CEQE_COMP_CQN_M, - HNS_ROCE_CEQE_CEQE_COMP_CQN_S); - hns_roce_cq_completion(hr_dev, cqn); - - ++eq->cons_index; - ceqes_found = 1; - - if (eq->cons_index > 2 * hr_dev->caps.ceqe_depth[eq->eqn] - 1) { - dev_warn(&eq->hr_dev->pdev->dev, - "cons_index overflow, set back to zero\n"); - eq->cons_index = 0; - } - } - - eq_set_cons_index(eq, 0); - - return ceqes_found; -} - -static int hns_roce_aeq_ovf_int(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) -{ - struct device *dev = &eq->hr_dev->pdev->dev; - int eqovf_found = 0; - u32 caepaemask_val; - u32 cealmovf_val; - u32 caepaest_val; - u32 aeshift_val; - u32 ceshift_val; - u32 cemask_val; - int i = 0; - - /** - * AEQ overflow ECC mult bit err CEQ overflow alarm - * must clear interrupt, mask irq, clear irq, cancel mask operation - */ - aeshift_val = roce_read(hr_dev, ROCEE_CAEP_AEQC_AEQE_SHIFT_REG); - - if (roce_get_bit(aeshift_val, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQ_ALM_OVF_INT_ST_S) == 1) { - dev_warn(dev, "AEQ overflow!\n"); - - /* Set mask */ - caepaemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG); - roce_set_bit(caepaemask_val, - ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, - HNS_ROCE_INT_MASK_ENABLE); - roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, caepaemask_val); - - /* Clear int state(INT_WC : write 1 clear) */ - caepaest_val = roce_read(hr_dev, ROCEE_CAEP_AE_ST_REG); - roce_set_bit(caepaest_val, - ROCEE_CAEP_AE_ST_CAEP_AEQ_ALM_OVF_S, 1); - roce_write(hr_dev, ROCEE_CAEP_AE_ST_REG, caepaest_val); - - /* Clear mask */ - caepaemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG); - roce_set_bit(caepaemask_val, - ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, - HNS_ROCE_INT_MASK_DISABLE); - roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, caepaemask_val); - } - - /* CEQ almost overflow */ - for (i = 0; i < hr_dev->caps.num_comp_vectors; i++) { - ceshift_val = roce_read(hr_dev, ROCEE_CAEP_CEQC_SHIFT_0_REG + - i * CEQ_REG_OFFSET); - - if (roce_get_bit(ceshift_val, - ROCEE_CAEP_CEQC_SHIFT_CAEP_CEQ_ALM_OVF_INT_ST_S) == 1) { - dev_warn(dev, "CEQ[%d] almost overflow!\n", i); - eqovf_found++; - - /* Set mask */ - cemask_val = roce_read(hr_dev, - ROCEE_CAEP_CE_IRQ_MASK_0_REG + - i * CEQ_REG_OFFSET); - roce_set_bit(cemask_val, - ROCEE_CAEP_CE_IRQ_MASK_CAEP_CEQ_ALM_OVF_MASK_S, - HNS_ROCE_INT_MASK_ENABLE); - roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + - i * CEQ_REG_OFFSET, cemask_val); - - /* Clear int state(INT_WC : write 1 clear) */ - cealmovf_val = roce_read(hr_dev, - ROCEE_CAEP_CEQ_ALM_OVF_0_REG + - i * CEQ_REG_OFFSET); - roce_set_bit(cealmovf_val, - ROCEE_CAEP_CEQ_ALM_OVF_CAEP_CEQ_ALM_OVF_S, - 1); - roce_write(hr_dev, ROCEE_CAEP_CEQ_ALM_OVF_0_REG + - i * CEQ_REG_OFFSET, cealmovf_val); - - /* Clear mask */ - cemask_val = roce_read(hr_dev, - ROCEE_CAEP_CE_IRQ_MASK_0_REG + - i * CEQ_REG_OFFSET); - roce_set_bit(cemask_val, - ROCEE_CAEP_CE_IRQ_MASK_CAEP_CEQ_ALM_OVF_MASK_S, - HNS_ROCE_INT_MASK_DISABLE); - roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + - i * CEQ_REG_OFFSET, cemask_val); - } - } - - /* ECC multi-bit error alarm */ - dev_warn(dev, "ECC UCERR ALARM: 0x%x, 0x%x, 0x%x\n", - roce_read(hr_dev, ROCEE_ECC_UCERR_ALM0_REG), - roce_read(hr_dev, ROCEE_ECC_UCERR_ALM1_REG), - roce_read(hr_dev, ROCEE_ECC_UCERR_ALM2_REG)); - - dev_warn(dev, "ECC CERR ALARM: 0x%x, 0x%x, 0x%x\n", - roce_read(hr_dev, ROCEE_ECC_CERR_ALM0_REG), - roce_read(hr_dev, ROCEE_ECC_CERR_ALM1_REG), - roce_read(hr_dev, ROCEE_ECC_CERR_ALM2_REG)); - - return eqovf_found; -} - -static int hns_roce_eq_int(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) -{ - int eqes_found = 0; - - if (likely(eq->type_flag == HNS_ROCE_CEQ)) - /* CEQ irq routine, CEQ is pulse irq, not clear */ - eqes_found = hns_roce_ceq_int(hr_dev, eq); - else if (likely(eq->type_flag == HNS_ROCE_AEQ)) - /* AEQ irq routine, AEQ is pulse irq, not clear */ - eqes_found = hns_roce_aeq_int(hr_dev, eq); - else - /* AEQ queue overflow irq */ - eqes_found = hns_roce_aeq_ovf_int(hr_dev, eq); - - return eqes_found; -} - -static irqreturn_t hns_roce_msi_x_interrupt(int irq, void *eq_ptr) -{ - int int_work = 0; - struct hns_roce_eq *eq = eq_ptr; - struct hns_roce_dev *hr_dev = eq->hr_dev; - - int_work = hns_roce_eq_int(hr_dev, eq); - - return IRQ_RETVAL(int_work); -} - -static void hns_roce_enable_eq(struct hns_roce_dev *hr_dev, int eq_num, - int enable_flag) -{ - void __iomem *eqc = hr_dev->eq_table.eqc_base[eq_num]; - u32 val; - - val = readl(eqc); - - if (enable_flag) - roce_set_field(val, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S, - HNS_ROCE_EQ_STAT_VALID); - else - roce_set_field(val, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S, - HNS_ROCE_EQ_STAT_INVALID); - writel(val, eqc); -} - -static int hns_roce_create_eq(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) -{ - void __iomem *eqc = hr_dev->eq_table.eqc_base[eq->eqn]; - struct device *dev = &hr_dev->pdev->dev; - dma_addr_t tmp_dma_addr; - u32 eqconsindx_val = 0; - u32 eqcuridx_val = 0; - u32 eqshift_val = 0; - int num_bas = 0; - int ret; - int i; - - num_bas = (PAGE_ALIGN(eq->entries * eq->eqe_size) + - HNS_ROCE_BA_SIZE - 1) / HNS_ROCE_BA_SIZE; - - if ((eq->entries * eq->eqe_size) > HNS_ROCE_BA_SIZE) { - dev_err(dev, "[error]eq buf %d gt ba size(%d) need bas=%d\n", - (eq->entries * eq->eqe_size), HNS_ROCE_BA_SIZE, - num_bas); - return -EINVAL; - } - - eq->buf_list = kcalloc(num_bas, sizeof(*eq->buf_list), GFP_KERNEL); - if (!eq->buf_list) - return -ENOMEM; - - for (i = 0; i < num_bas; ++i) { - eq->buf_list[i].buf = dma_alloc_coherent(dev, HNS_ROCE_BA_SIZE, - &tmp_dma_addr, - GFP_KERNEL); - if (!eq->buf_list[i].buf) { - ret = -ENOMEM; - goto err_out_free_pages; - } - - eq->buf_list[i].map = tmp_dma_addr; - memset(eq->buf_list[i].buf, 0, HNS_ROCE_BA_SIZE); - } - eq->cons_index = 0; - roce_set_field(eqshift_val, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S, - HNS_ROCE_EQ_STAT_INVALID); - roce_set_field(eqshift_val, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_M, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_S, - eq->log_entries); - writel(eqshift_val, eqc); - - /* Configure eq extended address 12~44bit */ - writel((u32)(eq->buf_list[0].map >> 12), eqc + 4); - - /* - * Configure eq extended address 45~49 bit. - * 44 = 32 + 12, When evaluating addr to hardware, shift 12 because of - * using 4K page, and shift more 32 because of - * caculating the high 32 bit value evaluated to hardware. - */ - roce_set_field(eqcuridx_val, ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_M, - ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_S, - eq->buf_list[0].map >> 44); - roce_set_field(eqcuridx_val, - ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_M, - ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_S, 0); - writel(eqcuridx_val, eqc + 8); - - /* Configure eq consumer index */ - roce_set_field(eqconsindx_val, - ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_M, - ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_S, 0); - writel(eqconsindx_val, eqc + 0xc); - - return 0; - -err_out_free_pages: - for (i = i - 1; i >= 0; i--) - dma_free_coherent(dev, HNS_ROCE_BA_SIZE, eq->buf_list[i].buf, - eq->buf_list[i].map); - - kfree(eq->buf_list); - return ret; -} - -static void hns_roce_free_eq(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) -{ - int i = 0; - int npages = (PAGE_ALIGN(eq->eqe_size * eq->entries) + - HNS_ROCE_BA_SIZE - 1) / HNS_ROCE_BA_SIZE; - - if (!eq->buf_list) - return; - - for (i = 0; i < npages; ++i) - dma_free_coherent(&hr_dev->pdev->dev, HNS_ROCE_BA_SIZE, - eq->buf_list[i].buf, eq->buf_list[i].map); - - kfree(eq->buf_list); -} - -static void hns_roce_int_mask_en(struct hns_roce_dev *hr_dev) -{ - int i = 0; - u32 aemask_val; - int masken = 0; - - /* AEQ INT */ - aemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG); - roce_set_bit(aemask_val, ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, - masken); - roce_set_bit(aemask_val, ROCEE_CAEP_AE_MASK_CAEP_AE_IRQ_MASK_S, masken); - roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, aemask_val); - - /* CEQ INT */ - for (i = 0; i < hr_dev->caps.num_comp_vectors; i++) { - /* IRQ mask */ - roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + - i * CEQ_REG_OFFSET, masken); - } -} - -static void hns_roce_ce_int_default_cfg(struct hns_roce_dev *hr_dev) -{ - /* Configure ce int interval */ - roce_write(hr_dev, ROCEE_CAEP_CE_INTERVAL_CFG_REG, - HNS_ROCE_CEQ_DEFAULT_INTERVAL); - - /* Configure ce int burst num */ - roce_write(hr_dev, ROCEE_CAEP_CE_BURST_NUM_CFG_REG, - HNS_ROCE_CEQ_DEFAULT_BURST_NUM); -} - -int hns_roce_init_eq_table(struct hns_roce_dev *hr_dev) -{ - struct hns_roce_eq_table *eq_table = &hr_dev->eq_table; - struct device *dev = &hr_dev->pdev->dev; - struct hns_roce_eq *eq = NULL; - int eq_num = 0; - int ret = 0; - int i = 0; - int j = 0; - - eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors; - eq_table->eq = kcalloc(eq_num, sizeof(*eq_table->eq), GFP_KERNEL); - if (!eq_table->eq) - return -ENOMEM; - - eq_table->eqc_base = kcalloc(eq_num, sizeof(*eq_table->eqc_base), - GFP_KERNEL); - if (!eq_table->eqc_base) { - ret = -ENOMEM; - goto err_eqc_base_alloc_fail; - } - - for (i = 0; i < eq_num; i++) { - eq = &eq_table->eq[i]; - eq->hr_dev = hr_dev; - eq->eqn = i; - eq->irq = hr_dev->irq[i]; - eq->log_page_size = PAGE_SHIFT; - - if (i < hr_dev->caps.num_comp_vectors) { - /* CEQ */ - eq_table->eqc_base[i] = hr_dev->reg_base + - ROCEE_CAEP_CEQC_SHIFT_0_REG + - HNS_ROCE_CEQC_REG_OFFSET * i; - eq->type_flag = HNS_ROCE_CEQ; - eq->doorbell = hr_dev->reg_base + - ROCEE_CAEP_CEQC_CONS_IDX_0_REG + - HNS_ROCE_CEQC_REG_OFFSET * i; - eq->entries = hr_dev->caps.ceqe_depth[i]; - eq->log_entries = ilog2(eq->entries); - eq->eqe_size = sizeof(struct hns_roce_ceqe); - } else { - /* AEQ */ - eq_table->eqc_base[i] = hr_dev->reg_base + - ROCEE_CAEP_AEQC_AEQE_SHIFT_REG; - eq->type_flag = HNS_ROCE_AEQ; - eq->doorbell = hr_dev->reg_base + - ROCEE_CAEP_AEQE_CONS_IDX_REG; - eq->entries = hr_dev->caps.aeqe_depth; - eq->log_entries = ilog2(eq->entries); - eq->eqe_size = sizeof(struct hns_roce_aeqe); - } - } - - /* Disable irq */ - hns_roce_int_mask_en(hr_dev); - - /* Configure CE irq interval and burst num */ - hns_roce_ce_int_default_cfg(hr_dev); - - for (i = 0; i < eq_num; i++) { - ret = hns_roce_create_eq(hr_dev, &eq_table->eq[i]); - if (ret) { - dev_err(dev, "eq create failed\n"); - goto err_create_eq_fail; - } - } - - for (j = 0; j < eq_num; j++) { - ret = request_irq(eq_table->eq[j].irq, hns_roce_msi_x_interrupt, - 0, hr_dev->irq_names[j], eq_table->eq + j); - if (ret) { - dev_err(dev, "request irq error!\n"); - goto err_request_irq_fail; - } - } - - for (i = 0; i < eq_num; i++) - hns_roce_enable_eq(hr_dev, i, EQ_ENABLE); - - return 0; - -err_request_irq_fail: - for (j = j - 1; j >= 0; j--) - free_irq(eq_table->eq[j].irq, eq_table->eq + j); - -err_create_eq_fail: - for (i = i - 1; i >= 0; i--) - hns_roce_free_eq(hr_dev, &eq_table->eq[i]); - - kfree(eq_table->eqc_base); - -err_eqc_base_alloc_fail: - kfree(eq_table->eq); - - return ret; -} - -void hns_roce_cleanup_eq_table(struct hns_roce_dev *hr_dev) -{ - int i; - int eq_num; - struct hns_roce_eq_table *eq_table = &hr_dev->eq_table; - - eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors; - for (i = 0; i < eq_num; i++) { - /* Disable EQ */ - hns_roce_enable_eq(hr_dev, i, EQ_DISABLE); - - free_irq(eq_table->eq[i].irq, eq_table->eq + i); - - hns_roce_free_eq(hr_dev, &eq_table->eq[i]); - } - - kfree(eq_table->eqc_base); - kfree(eq_table->eq); -} diff --git a/drivers/infiniband/hw/hns/hns_roce_eq.h b/drivers/infiniband/hw/hns/hns_roce_eq.h deleted file mode 100644 index c6d212d12e03..000000000000 --- a/drivers/infiniband/hw/hns/hns_roce_eq.h +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2016 Hisilicon Limited. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef _HNS_ROCE_EQ_H -#define _HNS_ROCE_EQ_H - -#define HNS_ROCE_CEQ 1 -#define HNS_ROCE_AEQ 2 - -#define HNS_ROCE_CEQ_ENTRY_SIZE 0x4 -#define HNS_ROCE_AEQ_ENTRY_SIZE 0x10 -#define HNS_ROCE_CEQC_REG_OFFSET 0x18 - -#define HNS_ROCE_CEQ_DEFAULT_INTERVAL 0x10 -#define HNS_ROCE_CEQ_DEFAULT_BURST_NUM 0x10 - -#define HNS_ROCE_INT_MASK_DISABLE 0 -#define HNS_ROCE_INT_MASK_ENABLE 1 - -#define EQ_ENABLE 1 -#define EQ_DISABLE 0 -#define CONS_INDEX_MASK 0xffff - -#define CEQ_REG_OFFSET 0x18 - -enum { - HNS_ROCE_EQ_STAT_INVALID = 0, - HNS_ROCE_EQ_STAT_VALID = 2, -}; - -struct hns_roce_aeqe { - u32 asyn; - union { - struct { - u32 qp; - u32 rsv0; - u32 rsv1; - } qp_event; - - struct { - u32 cq; - u32 rsv0; - u32 rsv1; - } cq_event; - - struct { - u32 port; - u32 rsv0; - u32 rsv1; - } port_event; - - struct { - u32 ceqe; - u32 rsv0; - u32 rsv1; - } ce_event; - - struct { - __le64 out_param; - __le16 token; - u8 status; - u8 rsv0; - } __packed cmd; - } event; -}; - -#define HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S 16 -#define HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M \ - (((1UL << 8) - 1) << HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S) - -#define HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S 24 -#define HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M \ - (((1UL << 7) - 1) << HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S) - -#define HNS_ROCE_AEQE_U32_4_OWNER_S 31 - -#define HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_S 0 -#define HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_M \ - (((1UL << 24) - 1) << HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_S) - -#define HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_S 25 -#define HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_M \ - (((1UL << 3) - 1) << HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_S) - -#define HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S 0 -#define HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_M \ - (((1UL << 16) - 1) << HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S) - -#define HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_S 0 -#define HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_M \ - (((1UL << 5) - 1) << HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_S) - -struct hns_roce_ceqe { - union { - int comp; - } ceqe; -}; - -#define HNS_ROCE_CEQE_CEQE_COMP_OWNER_S 0 - -#define HNS_ROCE_CEQE_CEQE_COMP_CQN_S 16 -#define HNS_ROCE_CEQE_CEQE_COMP_CQN_M \ - (((1UL << 16) - 1) << HNS_ROCE_CEQE_CEQE_COMP_CQN_S) - -#endif /* _HNS_ROCE_EQ_H */ diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index af27168faf0f..21ca9fa7c9d1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -33,6 +33,7 @@ #include <linux/platform_device.h> #include <linux/acpi.h> #include <linux/etherdevice.h> +#include <linux/interrupt.h> #include <linux/of.h> #include <linux/of_platform.h> #include <rdma/ib_umem.h> @@ -774,7 +775,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) goto create_lp_qp_failed; } - ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, &attr, attr_mask, + ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, &attr, IB_QP_DEST_QPN, IB_QPS_INIT, IB_QPS_RTR); if (ret) { dev_err(dev, "modify qp failed(%d)!\n", ret); @@ -1492,9 +1493,9 @@ static int hns_roce_v1_profile(struct hns_roce_dev *hr_dev) caps->max_sq_inline = HNS_ROCE_V1_INLINE_SIZE; caps->num_uars = HNS_ROCE_V1_UAR_NUM; caps->phy_num_uars = HNS_ROCE_V1_PHY_UAR_NUM; - caps->num_aeq_vectors = HNS_ROCE_AEQE_VEC_NUM; - caps->num_comp_vectors = HNS_ROCE_COMP_VEC_NUM; - caps->num_other_vectors = HNS_ROCE_AEQE_OF_VEC_NUM; + caps->num_aeq_vectors = HNS_ROCE_V1_AEQE_VEC_NUM; + caps->num_comp_vectors = HNS_ROCE_V1_COMP_VEC_NUM; + caps->num_other_vectors = HNS_ROCE_V1_ABNORMAL_VEC_NUM; caps->num_mtpts = HNS_ROCE_V1_MAX_MTPT_NUM; caps->num_mtt_segs = HNS_ROCE_V1_MAX_MTT_SEGS; caps->num_pds = HNS_ROCE_V1_MAX_PD_NUM; @@ -1529,10 +1530,8 @@ static int hns_roce_v1_profile(struct hns_roce_dev *hr_dev) caps->num_ports + 1; } - for (i = 0; i < caps->num_comp_vectors; i++) - caps->ceqe_depth[i] = HNS_ROCE_V1_NUM_COMP_EQE; - - caps->aeqe_depth = HNS_ROCE_V1_NUM_ASYNC_EQE; + caps->ceqe_depth = HNS_ROCE_V1_COMP_EQE_NUM; + caps->aeqe_depth = HNS_ROCE_V1_ASYNC_EQE_NUM; caps->local_ca_ack_delay = le32_to_cpu(roce_read(hr_dev, ROCEE_ACK_DELAY_REG)); caps->max_mtu = IB_MTU_2048; @@ -2312,15 +2311,16 @@ static int hns_roce_v1_poll_one(struct hns_roce_cq *hr_cq, case HNS_ROCE_OPCODE_RDMA_WITH_IMM_RECEIVE: wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; wc->wc_flags = IB_WC_WITH_IMM; - wc->ex.imm_data = le32_to_cpu(cqe->immediate_data); + wc->ex.imm_data = + cpu_to_be32(le32_to_cpu(cqe->immediate_data)); break; case HNS_ROCE_OPCODE_SEND_DATA_RECEIVE: if (roce_get_bit(cqe->cqe_byte_4, CQE_BYTE_4_IMM_INDICATOR_S)) { wc->opcode = IB_WC_RECV; wc->wc_flags = IB_WC_WITH_IMM; - wc->ex.imm_data = le32_to_cpu( - cqe->immediate_data); + wc->ex.imm_data = cpu_to_be32( + le32_to_cpu(cqe->immediate_data)); } else { wc->opcode = IB_WC_RECV; wc->wc_flags = 0; @@ -3960,6 +3960,732 @@ static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq) return ret; } +static void set_eq_cons_index_v1(struct hns_roce_eq *eq, int req_not) +{ + roce_raw_write((eq->cons_index & HNS_ROCE_V1_CONS_IDX_M) | + (req_not << eq->log_entries), eq->doorbell); +} + +static void hns_roce_v1_wq_catas_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, int qpn) +{ + struct device *dev = &hr_dev->pdev->dev; + + dev_warn(dev, "Local Work Queue Catastrophic Error.\n"); + switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M, + HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) { + case HNS_ROCE_LWQCE_QPC_ERROR: + dev_warn(dev, "QP %d, QPC error.\n", qpn); + break; + case HNS_ROCE_LWQCE_MTU_ERROR: + dev_warn(dev, "QP %d, MTU error.\n", qpn); + break; + case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR: + dev_warn(dev, "QP %d, WQE BA addr error.\n", qpn); + break; + case HNS_ROCE_LWQCE_WQE_ADDR_ERROR: + dev_warn(dev, "QP %d, WQE addr error.\n", qpn); + break; + case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR: + dev_warn(dev, "QP %d, WQE shift error\n", qpn); + break; + case HNS_ROCE_LWQCE_SL_ERROR: + dev_warn(dev, "QP %d, SL error.\n", qpn); + break; + case HNS_ROCE_LWQCE_PORT_ERROR: + dev_warn(dev, "QP %d, port error.\n", qpn); + break; + default: + break; + } +} + +static void hns_roce_v1_local_wq_access_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, + int qpn) +{ + struct device *dev = &hr_dev->pdev->dev; + + dev_warn(dev, "Local Access Violation Work Queue Error.\n"); + switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M, + HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) { + case HNS_ROCE_LAVWQE_R_KEY_VIOLATION: + dev_warn(dev, "QP %d, R_key violation.\n", qpn); + break; + case HNS_ROCE_LAVWQE_LENGTH_ERROR: + dev_warn(dev, "QP %d, length error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_VA_ERROR: + dev_warn(dev, "QP %d, VA error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_PD_ERROR: + dev_err(dev, "QP %d, PD error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_RW_ACC_ERROR: + dev_warn(dev, "QP %d, rw acc error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_KEY_STATE_ERROR: + dev_warn(dev, "QP %d, key state error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR: + dev_warn(dev, "QP %d, MR operation error.\n", qpn); + break; + default: + break; + } +} + +static void hns_roce_v1_qp_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, + int event_type) +{ + struct device *dev = &hr_dev->pdev->dev; + int phy_port; + int qpn; + + qpn = roce_get_field(aeqe->event.qp_event.qp, + HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_M, + HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_S); + phy_port = roce_get_field(aeqe->event.qp_event.qp, + HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_M, + HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_S); + if (qpn <= 1) + qpn = HNS_ROCE_MAX_PORTS * qpn + phy_port; + + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + dev_warn(dev, "Invalid Req Local Work Queue Error.\n" + "QP %d, phy_port %d.\n", qpn, phy_port); + break; + case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + hns_roce_v1_wq_catas_err_handle(hr_dev, aeqe, qpn); + break; + case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + hns_roce_v1_local_wq_access_err_handle(hr_dev, aeqe, qpn); + break; + default: + break; + } + + hns_roce_qp_event(hr_dev, qpn, event_type); +} + +static void hns_roce_v1_cq_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, + int event_type) +{ + struct device *dev = &hr_dev->pdev->dev; + u32 cqn; + + cqn = le32_to_cpu(roce_get_field(aeqe->event.cq_event.cq, + HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_M, + HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S)); + + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: + dev_warn(dev, "CQ 0x%x access err.\n", cqn); + break; + case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: + dev_warn(dev, "CQ 0x%x overflow\n", cqn); + break; + case HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID: + dev_warn(dev, "CQ 0x%x ID invalid.\n", cqn); + break; + default: + break; + } + + hns_roce_cq_event(hr_dev, cqn, event_type); +} + +static void hns_roce_v1_db_overflow_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe) +{ + struct device *dev = &hr_dev->pdev->dev; + + switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M, + HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) { + case HNS_ROCE_DB_SUBTYPE_SDB_OVF: + dev_warn(dev, "SDB overflow.\n"); + break; + case HNS_ROCE_DB_SUBTYPE_SDB_ALM_OVF: + dev_warn(dev, "SDB almost overflow.\n"); + break; + case HNS_ROCE_DB_SUBTYPE_SDB_ALM_EMP: + dev_warn(dev, "SDB almost empty.\n"); + break; + case HNS_ROCE_DB_SUBTYPE_ODB_OVF: + dev_warn(dev, "ODB overflow.\n"); + break; + case HNS_ROCE_DB_SUBTYPE_ODB_ALM_OVF: + dev_warn(dev, "ODB almost overflow.\n"); + break; + case HNS_ROCE_DB_SUBTYPE_ODB_ALM_EMP: + dev_warn(dev, "SDB almost empty.\n"); + break; + default: + break; + } +} + +static struct hns_roce_aeqe *get_aeqe_v1(struct hns_roce_eq *eq, u32 entry) +{ + unsigned long off = (entry & (eq->entries - 1)) * + HNS_ROCE_AEQ_ENTRY_SIZE; + + return (struct hns_roce_aeqe *)((u8 *) + (eq->buf_list[off / HNS_ROCE_BA_SIZE].buf) + + off % HNS_ROCE_BA_SIZE); +} + +static struct hns_roce_aeqe *next_aeqe_sw_v1(struct hns_roce_eq *eq) +{ + struct hns_roce_aeqe *aeqe = get_aeqe_v1(eq, eq->cons_index); + + return (roce_get_bit(aeqe->asyn, HNS_ROCE_AEQE_U32_4_OWNER_S) ^ + !!(eq->cons_index & eq->entries)) ? aeqe : NULL; +} + +static int hns_roce_v1_aeq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_aeqe *aeqe; + int aeqes_found = 0; + int event_type; + + while ((aeqe = next_aeqe_sw_v1(eq))) { + + /* Make sure we read the AEQ entry after we have checked the + * ownership bit + */ + dma_rmb(); + + dev_dbg(dev, "aeqe = %p, aeqe->asyn.event_type = 0x%lx\n", aeqe, + roce_get_field(aeqe->asyn, + HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M, + HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S)); + event_type = roce_get_field(aeqe->asyn, + HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M, + HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S); + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_PATH_MIG: + dev_warn(dev, "PATH MIG not supported\n"); + break; + case HNS_ROCE_EVENT_TYPE_COMM_EST: + dev_warn(dev, "COMMUNICATION established\n"); + break; + case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: + dev_warn(dev, "SQ DRAINED not supported\n"); + break; + case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: + dev_warn(dev, "PATH MIG failed\n"); + break; + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + hns_roce_v1_qp_err_handle(hr_dev, aeqe, event_type); + break; + case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: + case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: + case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: + dev_warn(dev, "SRQ not support!\n"); + break; + case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: + case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: + case HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID: + hns_roce_v1_cq_err_handle(hr_dev, aeqe, event_type); + break; + case HNS_ROCE_EVENT_TYPE_PORT_CHANGE: + dev_warn(dev, "port change.\n"); + break; + case HNS_ROCE_EVENT_TYPE_MB: + hns_roce_cmd_event(hr_dev, + le16_to_cpu(aeqe->event.cmd.token), + aeqe->event.cmd.status, + le64_to_cpu(aeqe->event.cmd.out_param + )); + break; + case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: + hns_roce_v1_db_overflow_handle(hr_dev, aeqe); + break; + case HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW: + dev_warn(dev, "CEQ 0x%lx overflow.\n", + roce_get_field(aeqe->event.ce_event.ceqe, + HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_M, + HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_S)); + break; + default: + dev_warn(dev, "Unhandled event %d on EQ %d at idx %u.\n", + event_type, eq->eqn, eq->cons_index); + break; + } + + eq->cons_index++; + aeqes_found = 1; + + if (eq->cons_index > 2 * hr_dev->caps.aeqe_depth - 1) { + dev_warn(dev, "cons_index overflow, set back to 0.\n"); + eq->cons_index = 0; + } + } + + set_eq_cons_index_v1(eq, 0); + + return aeqes_found; +} + +static struct hns_roce_ceqe *get_ceqe_v1(struct hns_roce_eq *eq, u32 entry) +{ + unsigned long off = (entry & (eq->entries - 1)) * + HNS_ROCE_CEQ_ENTRY_SIZE; + + return (struct hns_roce_ceqe *)((u8 *) + (eq->buf_list[off / HNS_ROCE_BA_SIZE].buf) + + off % HNS_ROCE_BA_SIZE); +} + +static struct hns_roce_ceqe *next_ceqe_sw_v1(struct hns_roce_eq *eq) +{ + struct hns_roce_ceqe *ceqe = get_ceqe_v1(eq, eq->cons_index); + + return (!!(roce_get_bit(ceqe->comp, + HNS_ROCE_CEQE_CEQE_COMP_OWNER_S))) ^ + (!!(eq->cons_index & eq->entries)) ? ceqe : NULL; +} + +static int hns_roce_v1_ceq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + struct hns_roce_ceqe *ceqe; + int ceqes_found = 0; + u32 cqn; + + while ((ceqe = next_ceqe_sw_v1(eq))) { + + /* Make sure we read CEQ entry after we have checked the + * ownership bit + */ + dma_rmb(); + + cqn = roce_get_field(ceqe->comp, + HNS_ROCE_CEQE_CEQE_COMP_CQN_M, + HNS_ROCE_CEQE_CEQE_COMP_CQN_S); + hns_roce_cq_completion(hr_dev, cqn); + + ++eq->cons_index; + ceqes_found = 1; + + if (eq->cons_index > 2 * hr_dev->caps.ceqe_depth - 1) { + dev_warn(&eq->hr_dev->pdev->dev, + "cons_index overflow, set back to 0.\n"); + eq->cons_index = 0; + } + } + + set_eq_cons_index_v1(eq, 0); + + return ceqes_found; +} + +static irqreturn_t hns_roce_v1_msix_interrupt_eq(int irq, void *eq_ptr) +{ + struct hns_roce_eq *eq = eq_ptr; + struct hns_roce_dev *hr_dev = eq->hr_dev; + int int_work = 0; + + if (eq->type_flag == HNS_ROCE_CEQ) + /* CEQ irq routine, CEQ is pulse irq, not clear */ + int_work = hns_roce_v1_ceq_int(hr_dev, eq); + else + /* AEQ irq routine, AEQ is pulse irq, not clear */ + int_work = hns_roce_v1_aeq_int(hr_dev, eq); + + return IRQ_RETVAL(int_work); +} + +static irqreturn_t hns_roce_v1_msix_interrupt_abn(int irq, void *dev_id) +{ + struct hns_roce_dev *hr_dev = dev_id; + struct device *dev = &hr_dev->pdev->dev; + int int_work = 0; + u32 caepaemask_val; + u32 cealmovf_val; + u32 caepaest_val; + u32 aeshift_val; + u32 ceshift_val; + u32 cemask_val; + int i; + + /* + * Abnormal interrupt: + * AEQ overflow, ECC multi-bit err, CEQ overflow must clear + * interrupt, mask irq, clear irq, cancel mask operation + */ + aeshift_val = roce_read(hr_dev, ROCEE_CAEP_AEQC_AEQE_SHIFT_REG); + + /* AEQE overflow */ + if (roce_get_bit(aeshift_val, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQ_ALM_OVF_INT_ST_S) == 1) { + dev_warn(dev, "AEQ overflow!\n"); + + /* Set mask */ + caepaemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG); + roce_set_bit(caepaemask_val, + ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, + HNS_ROCE_INT_MASK_ENABLE); + roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, caepaemask_val); + + /* Clear int state(INT_WC : write 1 clear) */ + caepaest_val = roce_read(hr_dev, ROCEE_CAEP_AE_ST_REG); + roce_set_bit(caepaest_val, + ROCEE_CAEP_AE_ST_CAEP_AEQ_ALM_OVF_S, 1); + roce_write(hr_dev, ROCEE_CAEP_AE_ST_REG, caepaest_val); + + /* Clear mask */ + caepaemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG); + roce_set_bit(caepaemask_val, + ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, + HNS_ROCE_INT_MASK_DISABLE); + roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, caepaemask_val); + } + + /* CEQ almost overflow */ + for (i = 0; i < hr_dev->caps.num_comp_vectors; i++) { + ceshift_val = roce_read(hr_dev, ROCEE_CAEP_CEQC_SHIFT_0_REG + + i * CEQ_REG_OFFSET); + + if (roce_get_bit(ceshift_val, + ROCEE_CAEP_CEQC_SHIFT_CAEP_CEQ_ALM_OVF_INT_ST_S) == 1) { + dev_warn(dev, "CEQ[%d] almost overflow!\n", i); + int_work++; + + /* Set mask */ + cemask_val = roce_read(hr_dev, + ROCEE_CAEP_CE_IRQ_MASK_0_REG + + i * CEQ_REG_OFFSET); + roce_set_bit(cemask_val, + ROCEE_CAEP_CE_IRQ_MASK_CAEP_CEQ_ALM_OVF_MASK_S, + HNS_ROCE_INT_MASK_ENABLE); + roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + + i * CEQ_REG_OFFSET, cemask_val); + + /* Clear int state(INT_WC : write 1 clear) */ + cealmovf_val = roce_read(hr_dev, + ROCEE_CAEP_CEQ_ALM_OVF_0_REG + + i * CEQ_REG_OFFSET); + roce_set_bit(cealmovf_val, + ROCEE_CAEP_CEQ_ALM_OVF_CAEP_CEQ_ALM_OVF_S, + 1); + roce_write(hr_dev, ROCEE_CAEP_CEQ_ALM_OVF_0_REG + + i * CEQ_REG_OFFSET, cealmovf_val); + + /* Clear mask */ + cemask_val = roce_read(hr_dev, + ROCEE_CAEP_CE_IRQ_MASK_0_REG + + i * CEQ_REG_OFFSET); + roce_set_bit(cemask_val, + ROCEE_CAEP_CE_IRQ_MASK_CAEP_CEQ_ALM_OVF_MASK_S, + HNS_ROCE_INT_MASK_DISABLE); + roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + + i * CEQ_REG_OFFSET, cemask_val); + } + } + + /* ECC multi-bit error alarm */ + dev_warn(dev, "ECC UCERR ALARM: 0x%x, 0x%x, 0x%x\n", + roce_read(hr_dev, ROCEE_ECC_UCERR_ALM0_REG), + roce_read(hr_dev, ROCEE_ECC_UCERR_ALM1_REG), + roce_read(hr_dev, ROCEE_ECC_UCERR_ALM2_REG)); + + dev_warn(dev, "ECC CERR ALARM: 0x%x, 0x%x, 0x%x\n", + roce_read(hr_dev, ROCEE_ECC_CERR_ALM0_REG), + roce_read(hr_dev, ROCEE_ECC_CERR_ALM1_REG), + roce_read(hr_dev, ROCEE_ECC_CERR_ALM2_REG)); + + return IRQ_RETVAL(int_work); +} + +static void hns_roce_v1_int_mask_enable(struct hns_roce_dev *hr_dev) +{ + u32 aemask_val; + int masken = 0; + int i; + + /* AEQ INT */ + aemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG); + roce_set_bit(aemask_val, ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, + masken); + roce_set_bit(aemask_val, ROCEE_CAEP_AE_MASK_CAEP_AE_IRQ_MASK_S, masken); + roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, aemask_val); + + /* CEQ INT */ + for (i = 0; i < hr_dev->caps.num_comp_vectors; i++) { + /* IRQ mask */ + roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + + i * CEQ_REG_OFFSET, masken); + } +} + +static void hns_roce_v1_free_eq(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + int npages = (PAGE_ALIGN(eq->eqe_size * eq->entries) + + HNS_ROCE_BA_SIZE - 1) / HNS_ROCE_BA_SIZE; + int i; + + if (!eq->buf_list) + return; + + for (i = 0; i < npages; ++i) + dma_free_coherent(&hr_dev->pdev->dev, HNS_ROCE_BA_SIZE, + eq->buf_list[i].buf, eq->buf_list[i].map); + + kfree(eq->buf_list); +} + +static void hns_roce_v1_enable_eq(struct hns_roce_dev *hr_dev, int eq_num, + int enable_flag) +{ + void __iomem *eqc = hr_dev->eq_table.eqc_base[eq_num]; + u32 val; + + val = readl(eqc); + + if (enable_flag) + roce_set_field(val, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S, + HNS_ROCE_EQ_STAT_VALID); + else + roce_set_field(val, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S, + HNS_ROCE_EQ_STAT_INVALID); + writel(val, eqc); +} + +static int hns_roce_v1_create_eq(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + void __iomem *eqc = hr_dev->eq_table.eqc_base[eq->eqn]; + struct device *dev = &hr_dev->pdev->dev; + dma_addr_t tmp_dma_addr; + u32 eqconsindx_val = 0; + u32 eqcuridx_val = 0; + u32 eqshift_val = 0; + int num_bas; + int ret; + int i; + + num_bas = (PAGE_ALIGN(eq->entries * eq->eqe_size) + + HNS_ROCE_BA_SIZE - 1) / HNS_ROCE_BA_SIZE; + + if ((eq->entries * eq->eqe_size) > HNS_ROCE_BA_SIZE) { + dev_err(dev, "[error]eq buf %d gt ba size(%d) need bas=%d\n", + (eq->entries * eq->eqe_size), HNS_ROCE_BA_SIZE, + num_bas); + return -EINVAL; + } + + eq->buf_list = kcalloc(num_bas, sizeof(*eq->buf_list), GFP_KERNEL); + if (!eq->buf_list) + return -ENOMEM; + + for (i = 0; i < num_bas; ++i) { + eq->buf_list[i].buf = dma_alloc_coherent(dev, HNS_ROCE_BA_SIZE, + &tmp_dma_addr, + GFP_KERNEL); + if (!eq->buf_list[i].buf) { + ret = -ENOMEM; + goto err_out_free_pages; + } + + eq->buf_list[i].map = tmp_dma_addr; + memset(eq->buf_list[i].buf, 0, HNS_ROCE_BA_SIZE); + } + eq->cons_index = 0; + roce_set_field(eqshift_val, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S, + HNS_ROCE_EQ_STAT_INVALID); + roce_set_field(eqshift_val, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_M, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_S, + eq->log_entries); + writel(eqshift_val, eqc); + + /* Configure eq extended address 12~44bit */ + writel((u32)(eq->buf_list[0].map >> 12), eqc + 4); + + /* + * Configure eq extended address 45~49 bit. + * 44 = 32 + 12, When evaluating addr to hardware, shift 12 because of + * using 4K page, and shift more 32 because of + * caculating the high 32 bit value evaluated to hardware. + */ + roce_set_field(eqcuridx_val, ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_M, + ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_S, + eq->buf_list[0].map >> 44); + roce_set_field(eqcuridx_val, + ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_M, + ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_S, 0); + writel(eqcuridx_val, eqc + 8); + + /* Configure eq consumer index */ + roce_set_field(eqconsindx_val, + ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_M, + ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_S, 0); + writel(eqconsindx_val, eqc + 0xc); + + return 0; + +err_out_free_pages: + for (i -= 1; i >= 0; i--) + dma_free_coherent(dev, HNS_ROCE_BA_SIZE, eq->buf_list[i].buf, + eq->buf_list[i].map); + + kfree(eq->buf_list); + return ret; +} + +static int hns_roce_v1_init_eq_table(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_eq_table *eq_table = &hr_dev->eq_table; + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_eq *eq; + int irq_num; + int eq_num; + int ret; + int i, j; + + eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors; + irq_num = eq_num + hr_dev->caps.num_other_vectors; + + eq_table->eq = kcalloc(eq_num, sizeof(*eq_table->eq), GFP_KERNEL); + if (!eq_table->eq) + return -ENOMEM; + + eq_table->eqc_base = kcalloc(eq_num, sizeof(*eq_table->eqc_base), + GFP_KERNEL); + if (!eq_table->eqc_base) { + ret = -ENOMEM; + goto err_eqc_base_alloc_fail; + } + + for (i = 0; i < eq_num; i++) { + eq = &eq_table->eq[i]; + eq->hr_dev = hr_dev; + eq->eqn = i; + eq->irq = hr_dev->irq[i]; + eq->log_page_size = PAGE_SHIFT; + + if (i < hr_dev->caps.num_comp_vectors) { + /* CEQ */ + eq_table->eqc_base[i] = hr_dev->reg_base + + ROCEE_CAEP_CEQC_SHIFT_0_REG + + CEQ_REG_OFFSET * i; + eq->type_flag = HNS_ROCE_CEQ; + eq->doorbell = hr_dev->reg_base + + ROCEE_CAEP_CEQC_CONS_IDX_0_REG + + CEQ_REG_OFFSET * i; + eq->entries = hr_dev->caps.ceqe_depth; + eq->log_entries = ilog2(eq->entries); + eq->eqe_size = HNS_ROCE_CEQ_ENTRY_SIZE; + } else { + /* AEQ */ + eq_table->eqc_base[i] = hr_dev->reg_base + + ROCEE_CAEP_AEQC_AEQE_SHIFT_REG; + eq->type_flag = HNS_ROCE_AEQ; + eq->doorbell = hr_dev->reg_base + + ROCEE_CAEP_AEQE_CONS_IDX_REG; + eq->entries = hr_dev->caps.aeqe_depth; + eq->log_entries = ilog2(eq->entries); + eq->eqe_size = HNS_ROCE_AEQ_ENTRY_SIZE; + } + } + + /* Disable irq */ + hns_roce_v1_int_mask_enable(hr_dev); + + /* Configure ce int interval */ + roce_write(hr_dev, ROCEE_CAEP_CE_INTERVAL_CFG_REG, + HNS_ROCE_CEQ_DEFAULT_INTERVAL); + + /* Configure ce int burst num */ + roce_write(hr_dev, ROCEE_CAEP_CE_BURST_NUM_CFG_REG, + HNS_ROCE_CEQ_DEFAULT_BURST_NUM); + + for (i = 0; i < eq_num; i++) { + ret = hns_roce_v1_create_eq(hr_dev, &eq_table->eq[i]); + if (ret) { + dev_err(dev, "eq create failed\n"); + goto err_create_eq_fail; + } + } + + for (j = 0; j < irq_num; j++) { + if (j < eq_num) + ret = request_irq(hr_dev->irq[j], + hns_roce_v1_msix_interrupt_eq, 0, + hr_dev->irq_names[j], + &eq_table->eq[j]); + else + ret = request_irq(hr_dev->irq[j], + hns_roce_v1_msix_interrupt_abn, 0, + hr_dev->irq_names[j], hr_dev); + + if (ret) { + dev_err(dev, "request irq error!\n"); + goto err_request_irq_fail; + } + } + + for (i = 0; i < eq_num; i++) + hns_roce_v1_enable_eq(hr_dev, i, EQ_ENABLE); + + return 0; + +err_request_irq_fail: + for (j -= 1; j >= 0; j--) + free_irq(hr_dev->irq[j], &eq_table->eq[j]); + +err_create_eq_fail: + for (i -= 1; i >= 0; i--) + hns_roce_v1_free_eq(hr_dev, &eq_table->eq[i]); + + kfree(eq_table->eqc_base); + +err_eqc_base_alloc_fail: + kfree(eq_table->eq); + + return ret; +} + +static void hns_roce_v1_cleanup_eq_table(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_eq_table *eq_table = &hr_dev->eq_table; + int irq_num; + int eq_num; + int i; + + eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors; + irq_num = eq_num + hr_dev->caps.num_other_vectors; + for (i = 0; i < eq_num; i++) { + /* Disable EQ */ + hns_roce_v1_enable_eq(hr_dev, i, EQ_DISABLE); + + free_irq(hr_dev->irq[i], &eq_table->eq[i]); + + hns_roce_v1_free_eq(hr_dev, &eq_table->eq[i]); + } + for (i = eq_num; i < irq_num; i++) + free_irq(hr_dev->irq[i], hr_dev); + + kfree(eq_table->eqc_base); + kfree(eq_table->eq); +} + static const struct hns_roce_hw hns_roce_hw_v1 = { .reset = hns_roce_v1_reset, .hw_profile = hns_roce_v1_profile, @@ -3983,6 +4709,8 @@ static const struct hns_roce_hw hns_roce_hw_v1 = { .poll_cq = hns_roce_v1_poll_cq, .dereg_mr = hns_roce_v1_dereg_mr, .destroy_cq = hns_roce_v1_destroy_cq, + .init_eq = hns_roce_v1_init_eq_table, + .cleanup_eq = hns_roce_v1_cleanup_eq_table, }; static const struct of_device_id hns_roce_of_match[] = { @@ -4060,10 +4788,6 @@ static int hns_roce_get_cfg(struct hns_roce_dev *hr_dev) /* get the mapped register base address */ res = platform_get_resource(hr_dev->pdev, IORESOURCE_MEM, 0); - if (!res) { - dev_err(dev, "memory resource not found!\n"); - return -EINVAL; - } hr_dev->reg_base = devm_ioremap_resource(dev, res); if (IS_ERR(hr_dev->reg_base)) return PTR_ERR(hr_dev->reg_base); @@ -4132,14 +4856,14 @@ static int hns_roce_get_cfg(struct hns_roce_dev *hr_dev) /* read the interrupt names from the DT or ACPI */ ret = device_property_read_string_array(dev, "interrupt-names", hr_dev->irq_names, - HNS_ROCE_MAX_IRQ_NUM); + HNS_ROCE_V1_MAX_IRQ_NUM); if (ret < 0) { dev_err(dev, "couldn't get interrupt names from DT or ACPI!\n"); return ret; } /* fetch the interrupt numbers */ - for (i = 0; i < HNS_ROCE_MAX_IRQ_NUM; i++) { + for (i = 0; i < HNS_ROCE_V1_MAX_IRQ_NUM; i++) { hr_dev->irq[i] = platform_get_irq(hr_dev->pdev, i); if (hr_dev->irq[i] <= 0) { dev_err(dev, "platform get of irq[=%d] failed!\n", i); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h index 21a07ef0afc9..b44ddd239060 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h @@ -60,8 +60,13 @@ #define HNS_ROCE_V1_GID_NUM 16 #define HNS_ROCE_V1_RESV_QP 8 -#define HNS_ROCE_V1_NUM_COMP_EQE 0x8000 -#define HNS_ROCE_V1_NUM_ASYNC_EQE 0x400 +#define HNS_ROCE_V1_MAX_IRQ_NUM 34 +#define HNS_ROCE_V1_COMP_VEC_NUM 32 +#define HNS_ROCE_V1_AEQE_VEC_NUM 1 +#define HNS_ROCE_V1_ABNORMAL_VEC_NUM 1 + +#define HNS_ROCE_V1_COMP_EQE_NUM 0x8000 +#define HNS_ROCE_V1_ASYNC_EQE_NUM 0x400 #define HNS_ROCE_V1_QPC_ENTRY_SIZE 256 #define HNS_ROCE_V1_IRRL_ENTRY_SIZE 8 @@ -159,6 +164,41 @@ #define SDB_INV_CNT_OFFSET 8 #define SDB_ST_CMP_VAL 8 +#define HNS_ROCE_CEQ_DEFAULT_INTERVAL 0x10 +#define HNS_ROCE_CEQ_DEFAULT_BURST_NUM 0x10 + +#define HNS_ROCE_INT_MASK_DISABLE 0 +#define HNS_ROCE_INT_MASK_ENABLE 1 + +#define CEQ_REG_OFFSET 0x18 + +#define HNS_ROCE_CEQE_CEQE_COMP_OWNER_S 0 + +#define HNS_ROCE_V1_CONS_IDX_M GENMASK(15, 0) + +#define HNS_ROCE_CEQE_CEQE_COMP_CQN_S 16 +#define HNS_ROCE_CEQE_CEQE_COMP_CQN_M GENMASK(31, 16) + +#define HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S 16 +#define HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M GENMASK(23, 16) + +#define HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S 24 +#define HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M GENMASK(30, 24) + +#define HNS_ROCE_AEQE_U32_4_OWNER_S 31 + +#define HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_S 0 +#define HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_M GENMASK(23, 0) + +#define HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_S 25 +#define HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_M GENMASK(27, 25) + +#define HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S 0 +#define HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_M GENMASK(15, 0) + +#define HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_S 0 +#define HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_M GENMASK(4, 0) + struct hns_roce_cq_context { u32 cqc_byte_4; u32 cq_bt_l; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 8e18445714a9..256fe110107a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -34,6 +34,7 @@ #include <linux/etherdevice.h> #include <linux/interrupt.h> #include <linux/kernel.h> +#include <net/addrconf.h> #include <rdma/ib_umem.h> #include "hnae3.h" @@ -51,32 +52,106 @@ static void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg, dseg->len = cpu_to_le32(sg->length); } +static int set_rwqe_data_seg(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, + void *wqe, unsigned int *sge_ind, + struct ib_send_wr **bad_wr) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_v2_wqe_data_seg *dseg = wqe; + struct hns_roce_qp *qp = to_hr_qp(ibqp); + int i; + + if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) { + if (rc_sq_wqe->msg_len > hr_dev->caps.max_sq_inline) { + *bad_wr = wr; + dev_err(hr_dev->dev, "inline len(1-%d)=%d, illegal", + rc_sq_wqe->msg_len, hr_dev->caps.max_sq_inline); + return -EINVAL; + } + + for (i = 0; i < wr->num_sge; i++) { + memcpy(wqe, ((void *)wr->sg_list[i].addr), + wr->sg_list[i].length); + wqe += wr->sg_list[i].length; + } + + roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_INLINE_S, + 1); + } else { + if (wr->num_sge <= 2) { + for (i = 0; i < wr->num_sge; i++) { + if (likely(wr->sg_list[i].length)) { + set_data_seg_v2(dseg, wr->sg_list + i); + dseg++; + } + } + } else { + roce_set_field(rc_sq_wqe->byte_20, + V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M, + V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S, + (*sge_ind) & (qp->sge.sge_cnt - 1)); + + for (i = 0; i < 2; i++) { + if (likely(wr->sg_list[i].length)) { + set_data_seg_v2(dseg, wr->sg_list + i); + dseg++; + } + } + + dseg = get_send_extend_sge(qp, + (*sge_ind) & (qp->sge.sge_cnt - 1)); + + for (i = 0; i < wr->num_sge - 2; i++) { + if (likely(wr->sg_list[i + 2].length)) { + set_data_seg_v2(dseg, + wr->sg_list + 2 + i); + dseg++; + (*sge_ind)++; + } + } + } + + roce_set_field(rc_sq_wqe->byte_16, + V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M, + V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S, wr->num_sge); + } + + return 0; +} + static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_ah *ah = to_hr_ah(ud_wr(wr)->ah); + struct hns_roce_v2_ud_send_wqe *ud_sq_wqe; struct hns_roce_v2_rc_send_wqe *rc_sq_wqe; struct hns_roce_qp *qp = to_hr_qp(ibqp); struct hns_roce_v2_wqe_data_seg *dseg; struct device *dev = hr_dev->dev; struct hns_roce_v2_db sq_db; unsigned int sge_ind = 0; - unsigned int wqe_sz = 0; unsigned int owner_bit; unsigned long flags; unsigned int ind; void *wqe = NULL; + bool loopback; int ret = 0; + u8 *smac; int nreq; int i; - if (unlikely(ibqp->qp_type != IB_QPT_RC)) { + if (unlikely(ibqp->qp_type != IB_QPT_RC && + ibqp->qp_type != IB_QPT_GSI && + ibqp->qp_type != IB_QPT_UD)) { dev_err(dev, "Not supported QP(0x%x)type!\n", ibqp->qp_type); *bad_wr = NULL; return -EOPNOTSUPP; } - if (unlikely(qp->state != IB_QPS_RTS && qp->state != IB_QPS_SQD)) { + if (unlikely(qp->state == IB_QPS_RESET || qp->state == IB_QPS_INIT || + qp->state == IB_QPS_RTR)) { dev_err(dev, "Post WQE fail, QP state %d err!\n", qp->state); *bad_wr = wr; return -EINVAL; @@ -106,161 +181,255 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, wr->wr_id; owner_bit = ~(qp->sq.head >> ilog2(qp->sq.wqe_cnt)) & 0x1; - rc_sq_wqe = wqe; - memset(rc_sq_wqe, 0, sizeof(*rc_sq_wqe)); - for (i = 0; i < wr->num_sge; i++) - rc_sq_wqe->msg_len += wr->sg_list[i].length; - rc_sq_wqe->inv_key_immtdata = send_ieth(wr); + /* Corresponding to the QP type, wqe process separately */ + if (ibqp->qp_type == IB_QPT_GSI) { + ud_sq_wqe = wqe; + memset(ud_sq_wqe, 0, sizeof(*ud_sq_wqe)); + + roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_0_M, + V2_UD_SEND_WQE_DMAC_0_S, ah->av.mac[0]); + roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_1_M, + V2_UD_SEND_WQE_DMAC_1_S, ah->av.mac[1]); + roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_2_M, + V2_UD_SEND_WQE_DMAC_2_S, ah->av.mac[2]); + roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_3_M, + V2_UD_SEND_WQE_DMAC_3_S, ah->av.mac[3]); + roce_set_field(ud_sq_wqe->byte_48, + V2_UD_SEND_WQE_BYTE_48_DMAC_4_M, + V2_UD_SEND_WQE_BYTE_48_DMAC_4_S, + ah->av.mac[4]); + roce_set_field(ud_sq_wqe->byte_48, + V2_UD_SEND_WQE_BYTE_48_DMAC_5_M, + V2_UD_SEND_WQE_BYTE_48_DMAC_5_S, + ah->av.mac[5]); + + /* MAC loopback */ + smac = (u8 *)hr_dev->dev_addr[qp->port]; + loopback = ether_addr_equal_unaligned(ah->av.mac, + smac) ? 1 : 0; + + roce_set_bit(ud_sq_wqe->byte_40, + V2_UD_SEND_WQE_BYTE_40_LBI_S, loopback); + + roce_set_field(ud_sq_wqe->byte_4, + V2_UD_SEND_WQE_BYTE_4_OPCODE_M, + V2_UD_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_SEND); - roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_FENCE_S, - (wr->send_flags & IB_SEND_FENCE) ? 1 : 0); + for (i = 0; i < wr->num_sge; i++) + ud_sq_wqe->msg_len += wr->sg_list[i].length; - roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_SE_S, - (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0); + ud_sq_wqe->immtdata = send_ieth(wr); - roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_CQE_S, - (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0); + /* Set sig attr */ + roce_set_bit(ud_sq_wqe->byte_4, + V2_UD_SEND_WQE_BYTE_4_CQE_S, + (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0); - roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OWNER_S, - owner_bit); + /* Set se attr */ + roce_set_bit(ud_sq_wqe->byte_4, + V2_UD_SEND_WQE_BYTE_4_SE_S, + (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0); - switch (wr->opcode) { - case IB_WR_RDMA_READ: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_RDMA_READ); - rc_sq_wqe->rkey = cpu_to_le32(rdma_wr(wr)->rkey); - rc_sq_wqe->va = cpu_to_le64(rdma_wr(wr)->remote_addr); - break; - case IB_WR_RDMA_WRITE: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_RDMA_WRITE); - rc_sq_wqe->rkey = cpu_to_le32(rdma_wr(wr)->rkey); - rc_sq_wqe->va = cpu_to_le64(rdma_wr(wr)->remote_addr); - break; - case IB_WR_RDMA_WRITE_WITH_IMM: - roce_set_field(rc_sq_wqe->byte_4, + roce_set_bit(ud_sq_wqe->byte_4, + V2_UD_SEND_WQE_BYTE_4_OWNER_S, owner_bit); + + roce_set_field(ud_sq_wqe->byte_16, + V2_UD_SEND_WQE_BYTE_16_PD_M, + V2_UD_SEND_WQE_BYTE_16_PD_S, + to_hr_pd(ibqp->pd)->pdn); + + roce_set_field(ud_sq_wqe->byte_16, + V2_UD_SEND_WQE_BYTE_16_SGE_NUM_M, + V2_UD_SEND_WQE_BYTE_16_SGE_NUM_S, + wr->num_sge); + + roce_set_field(ud_sq_wqe->byte_20, + V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M, + V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S, + sge_ind & (qp->sge.sge_cnt - 1)); + + roce_set_field(ud_sq_wqe->byte_24, + V2_UD_SEND_WQE_BYTE_24_UDPSPN_M, + V2_UD_SEND_WQE_BYTE_24_UDPSPN_S, 0); + ud_sq_wqe->qkey = + cpu_to_be32(ud_wr(wr)->remote_qkey & 0x80000000) ? + qp->qkey : ud_wr(wr)->remote_qkey; + roce_set_field(ud_sq_wqe->byte_32, + V2_UD_SEND_WQE_BYTE_32_DQPN_M, + V2_UD_SEND_WQE_BYTE_32_DQPN_S, + ud_wr(wr)->remote_qpn); + + roce_set_field(ud_sq_wqe->byte_36, + V2_UD_SEND_WQE_BYTE_36_VLAN_M, + V2_UD_SEND_WQE_BYTE_36_VLAN_S, + ah->av.vlan); + roce_set_field(ud_sq_wqe->byte_36, + V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_M, + V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_S, + ah->av.hop_limit); + roce_set_field(ud_sq_wqe->byte_36, + V2_UD_SEND_WQE_BYTE_36_TCLASS_M, + V2_UD_SEND_WQE_BYTE_36_TCLASS_S, + 0); + roce_set_field(ud_sq_wqe->byte_36, + V2_UD_SEND_WQE_BYTE_36_TCLASS_M, + V2_UD_SEND_WQE_BYTE_36_TCLASS_S, + 0); + roce_set_field(ud_sq_wqe->byte_40, + V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_M, + V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_S, 0); + roce_set_field(ud_sq_wqe->byte_40, + V2_UD_SEND_WQE_BYTE_40_SL_M, + V2_UD_SEND_WQE_BYTE_40_SL_S, + ah->av.sl_tclass_flowlabel >> + HNS_ROCE_SL_SHIFT); + roce_set_field(ud_sq_wqe->byte_40, + V2_UD_SEND_WQE_BYTE_40_PORTN_M, + V2_UD_SEND_WQE_BYTE_40_PORTN_S, + qp->port); + + roce_set_field(ud_sq_wqe->byte_48, + V2_UD_SEND_WQE_BYTE_48_SGID_INDX_M, + V2_UD_SEND_WQE_BYTE_48_SGID_INDX_S, + hns_get_gid_index(hr_dev, qp->phy_port, + ah->av.gid_index)); + + memcpy(&ud_sq_wqe->dgid[0], &ah->av.dgid[0], + GID_LEN_V2); + + dseg = get_send_extend_sge(qp, + sge_ind & (qp->sge.sge_cnt - 1)); + for (i = 0; i < wr->num_sge; i++) { + set_data_seg_v2(dseg + i, wr->sg_list + i); + sge_ind++; + } + + ind++; + } else if (ibqp->qp_type == IB_QPT_RC) { + rc_sq_wqe = wqe; + memset(rc_sq_wqe, 0, sizeof(*rc_sq_wqe)); + for (i = 0; i < wr->num_sge; i++) + rc_sq_wqe->msg_len += wr->sg_list[i].length; + + rc_sq_wqe->inv_key_immtdata = send_ieth(wr); + + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_FENCE_S, + (wr->send_flags & IB_SEND_FENCE) ? 1 : 0); + + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_SE_S, + (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0); + + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_CQE_S, + (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0); + + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OWNER_S, owner_bit); + + switch (wr->opcode) { + case IB_WR_RDMA_READ: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_RDMA_READ); + rc_sq_wqe->rkey = + cpu_to_le32(rdma_wr(wr)->rkey); + rc_sq_wqe->va = + cpu_to_le64(rdma_wr(wr)->remote_addr); + break; + case IB_WR_RDMA_WRITE: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_RDMA_WRITE); + rc_sq_wqe->rkey = + cpu_to_le32(rdma_wr(wr)->rkey); + rc_sq_wqe->va = + cpu_to_le64(rdma_wr(wr)->remote_addr); + break; + case IB_WR_RDMA_WRITE_WITH_IMM: + roce_set_field(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OPCODE_M, V2_RC_SEND_WQE_BYTE_4_OPCODE_S, HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM); - rc_sq_wqe->rkey = cpu_to_le32(rdma_wr(wr)->rkey); - rc_sq_wqe->va = cpu_to_le64(rdma_wr(wr)->remote_addr); - break; - case IB_WR_SEND: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_SEND); - break; - case IB_WR_SEND_WITH_INV: - roce_set_field(rc_sq_wqe->byte_4, + rc_sq_wqe->rkey = + cpu_to_le32(rdma_wr(wr)->rkey); + rc_sq_wqe->va = + cpu_to_le64(rdma_wr(wr)->remote_addr); + break; + case IB_WR_SEND: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_SEND); + break; + case IB_WR_SEND_WITH_INV: + roce_set_field(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OPCODE_M, V2_RC_SEND_WQE_BYTE_4_OPCODE_S, HNS_ROCE_V2_WQE_OP_SEND_WITH_INV); - break; - case IB_WR_SEND_WITH_IMM: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM); - break; - case IB_WR_LOCAL_INV: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_LOCAL_INV); - break; - case IB_WR_ATOMIC_CMP_AND_SWP: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP); - break; - case IB_WR_ATOMIC_FETCH_AND_ADD: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD); - break; - case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: - roce_set_field(rc_sq_wqe->byte_4, + break; + case IB_WR_SEND_WITH_IMM: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM); + break; + case IB_WR_LOCAL_INV: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_LOCAL_INV); + break; + case IB_WR_ATOMIC_CMP_AND_SWP: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP); + break; + case IB_WR_ATOMIC_FETCH_AND_ADD: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD); + break; + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + roce_set_field(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OPCODE_M, V2_RC_SEND_WQE_BYTE_4_OPCODE_S, HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP); - break; - case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: - roce_set_field(rc_sq_wqe->byte_4, + break; + case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: + roce_set_field(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OPCODE_M, V2_RC_SEND_WQE_BYTE_4_OPCODE_S, HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD); - break; - default: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_MASK); - break; - } - - wqe += sizeof(struct hns_roce_v2_rc_send_wqe); - dseg = wqe; - if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) { - if (rc_sq_wqe->msg_len > - hr_dev->caps.max_sq_inline) { - ret = -EINVAL; - *bad_wr = wr; - dev_err(dev, "inline len(1-%d)=%d, illegal", - rc_sq_wqe->msg_len, - hr_dev->caps.max_sq_inline); - goto out; + break; + default: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_MASK); + break; } - for (i = 0; i < wr->num_sge; i++) { - memcpy(wqe, ((void *)wr->sg_list[i].addr), - wr->sg_list[i].length); - wqe += wr->sg_list[i].length; - wqe_sz += wr->sg_list[i].length; - } + wqe += sizeof(struct hns_roce_v2_rc_send_wqe); + dseg = wqe; - roce_set_bit(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_INLINE_S, 1); + ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe, wqe, + &sge_ind, bad_wr); + if (ret) + goto out; + ind++; } else { - if (wr->num_sge <= 2) { - for (i = 0; i < wr->num_sge; i++) - set_data_seg_v2(dseg + i, - wr->sg_list + i); - } else { - roce_set_field(rc_sq_wqe->byte_20, - V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M, - V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S, - sge_ind & (qp->sge.sge_cnt - 1)); - - for (i = 0; i < 2; i++) - set_data_seg_v2(dseg + i, - wr->sg_list + i); - - dseg = get_send_extend_sge(qp, - sge_ind & (qp->sge.sge_cnt - 1)); - - for (i = 0; i < wr->num_sge - 2; i++) { - set_data_seg_v2(dseg + i, - wr->sg_list + 2 + i); - sge_ind++; - } - } - - roce_set_field(rc_sq_wqe->byte_16, - V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M, - V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S, - wr->num_sge); - wqe_sz += wr->num_sge * - sizeof(struct hns_roce_v2_wqe_data_seg); + dev_err(dev, "Illegal qp_type(0x%x)\n", ibqp->qp_type); + spin_unlock_irqrestore(&qp->sq.lock, flags); + return -EOPNOTSUPP; } - ind++; } out: @@ -299,6 +468,7 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); struct hns_roce_v2_wqe_data_seg *dseg; + struct hns_roce_rinl_sge *sge_list; struct device *dev = hr_dev->dev; struct hns_roce_v2_db rq_db; unsigned long flags; @@ -347,6 +517,14 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, dseg[i].addr = 0; } + /* rq support inline data */ + sge_list = hr_qp->rq_inl_buf.wqe_list[ind].sg_list; + hr_qp->rq_inl_buf.wqe_list[ind].sge_cnt = (u32)wr->num_sge; + for (i = 0; i < wr->num_sge; i++) { + sge_list[i].addr = (void *)(u64)wr->sg_list[i].addr; + sge_list[i].len = wr->sg_list[i].length; + } + hr_qp->rq.wrid[ind] = wr->wr_id; ind = (ind + 1) & (hr_qp->rq.wqe_cnt - 1); @@ -908,9 +1086,9 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) caps->max_sq_inline = HNS_ROCE_V2_MAX_SQ_INLINE; caps->num_uars = HNS_ROCE_V2_UAR_NUM; caps->phy_num_uars = HNS_ROCE_V2_PHY_UAR_NUM; - caps->num_aeq_vectors = 1; - caps->num_comp_vectors = 63; - caps->num_other_vectors = 0; + caps->num_aeq_vectors = HNS_ROCE_V2_AEQE_VEC_NUM; + caps->num_comp_vectors = HNS_ROCE_V2_COMP_VEC_NUM; + caps->num_other_vectors = HNS_ROCE_V2_ABNORMAL_VEC_NUM; caps->num_mtpts = HNS_ROCE_V2_MAX_MTPT_NUM; caps->num_mtt_segs = HNS_ROCE_V2_MAX_MTT_SEGS; caps->num_cqe_segs = HNS_ROCE_V2_MAX_CQE_SEGS; @@ -955,12 +1133,18 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) caps->cqe_ba_pg_sz = 0; caps->cqe_buf_pg_sz = 0; caps->cqe_hop_num = HNS_ROCE_CQE_HOP_NUM; + caps->eqe_ba_pg_sz = 0; + caps->eqe_buf_pg_sz = 0; + caps->eqe_hop_num = HNS_ROCE_EQE_HOP_NUM; caps->chunk_sz = HNS_ROCE_V2_TABLE_CHUNK_SIZE; caps->flags = HNS_ROCE_CAP_FLAG_REREG_MR | - HNS_ROCE_CAP_FLAG_ROCE_V1_V2; + HNS_ROCE_CAP_FLAG_ROCE_V1_V2 | + HNS_ROCE_CAP_FLAG_RQ_INLINE; caps->pkey_table_len[0] = 1; caps->gid_table_len[0] = HNS_ROCE_V2_GID_INDEX_NUM; + caps->ceqe_depth = HNS_ROCE_V2_COMP_EQE_NUM; + caps->aeqe_depth = HNS_ROCE_V2_ASYNC_EQE_NUM; caps->local_ca_ack_delay = 0; caps->max_mtu = IB_MTU_4096; @@ -1382,6 +1566,8 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev, roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_CQ_ST_M, V2_CQC_BYTE_4_CQ_ST_S, V2_CQ_STATE_VALID); + roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_ARM_ST_M, + V2_CQC_BYTE_4_ARM_ST_S, REG_NXT_CEQE); roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_SHIFT_M, V2_CQC_BYTE_4_SHIFT_S, ilog2((unsigned int)nent)); roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_CEQN_M, @@ -1422,6 +1608,15 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev, roce_set_field(cq_context->byte_40_cqe_ba, V2_CQC_BYTE_40_CQE_BA_M, V2_CQC_BYTE_40_CQE_BA_S, (dma_handle >> (32 + 3))); + + roce_set_field(cq_context->byte_56_cqe_period_maxcnt, + V2_CQC_BYTE_56_CQ_MAX_CNT_M, + V2_CQC_BYTE_56_CQ_MAX_CNT_S, + HNS_ROCE_V2_CQ_DEFAULT_BURST_NUM); + roce_set_field(cq_context->byte_56_cqe_period_maxcnt, + V2_CQC_BYTE_56_CQ_PERIOD_M, + V2_CQC_BYTE_56_CQ_PERIOD_S, + HNS_ROCE_V2_CQ_DEFAULT_INTERVAL); } static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq, @@ -1457,6 +1652,40 @@ static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq, return 0; } +static int hns_roce_handle_recv_inl_wqe(struct hns_roce_v2_cqe *cqe, + struct hns_roce_qp **cur_qp, + struct ib_wc *wc) +{ + struct hns_roce_rinl_sge *sge_list; + u32 wr_num, wr_cnt, sge_num; + u32 sge_cnt, data_len, size; + void *wqe_buf; + + wr_num = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_WQE_INDX_M, + V2_CQE_BYTE_4_WQE_INDX_S) & 0xffff; + wr_cnt = wr_num & ((*cur_qp)->rq.wqe_cnt - 1); + + sge_list = (*cur_qp)->rq_inl_buf.wqe_list[wr_cnt].sg_list; + sge_num = (*cur_qp)->rq_inl_buf.wqe_list[wr_cnt].sge_cnt; + wqe_buf = get_recv_wqe(*cur_qp, wr_cnt); + data_len = wc->byte_len; + + for (sge_cnt = 0; (sge_cnt < sge_num) && (data_len); sge_cnt++) { + size = min(sge_list[sge_cnt].len, data_len); + memcpy((void *)sge_list[sge_cnt].addr, wqe_buf, size); + + data_len -= size; + wqe_buf += size; + } + + if (data_len) { + wc->status = IB_WC_LOC_LEN_ERR; + return -EAGAIN; + } + + return 0; +} + static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, struct hns_roce_qp **cur_qp, struct ib_wc *wc) { @@ -1469,6 +1698,7 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, u32 opcode; u32 status; int qpn; + int ret; /* Find cqe according to consumer index */ cqe = next_cqe_sw_v2(hr_cq); @@ -1636,7 +1866,7 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, case HNS_ROCE_V2_OPCODE_RDMA_WRITE_IMM: wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; wc->wc_flags = IB_WC_WITH_IMM; - wc->ex.imm_data = le32_to_cpu(cqe->rkey_immtdata); + wc->ex.imm_data = cqe->immtdata; break; case HNS_ROCE_V2_OPCODE_SEND: wc->opcode = IB_WC_RECV; @@ -1645,18 +1875,29 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, case HNS_ROCE_V2_OPCODE_SEND_WITH_IMM: wc->opcode = IB_WC_RECV; wc->wc_flags = IB_WC_WITH_IMM; - wc->ex.imm_data = le32_to_cpu(cqe->rkey_immtdata); + wc->ex.imm_data = cqe->immtdata; break; case HNS_ROCE_V2_OPCODE_SEND_WITH_INV: wc->opcode = IB_WC_RECV; wc->wc_flags = IB_WC_WITH_INVALIDATE; - wc->ex.invalidate_rkey = cqe->rkey_immtdata; + wc->ex.invalidate_rkey = le32_to_cpu(cqe->rkey); break; default: wc->status = IB_WC_GENERAL_ERR; break; } + if ((wc->qp->qp_type == IB_QPT_RC || + wc->qp->qp_type == IB_QPT_UC) && + (opcode == HNS_ROCE_V2_OPCODE_SEND || + opcode == HNS_ROCE_V2_OPCODE_SEND_WITH_IMM || + opcode == HNS_ROCE_V2_OPCODE_SEND_WITH_INV) && + (roce_get_bit(cqe->byte_4, V2_CQE_BYTE_4_RQ_INLINE_S))) { + ret = hns_roce_handle_recv_inl_wqe(cqe, cur_qp, wc); + if (ret) + return -EAGAIN; + } + /* Update tail pointer, record wr_id */ wq = &(*cur_qp)->rq; wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; @@ -1670,6 +1911,21 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, wc->wc_flags |= (roce_get_bit(cqe->byte_32, V2_CQE_BYTE_32_GRH_S) ? IB_WC_GRH : 0); + wc->port_num = roce_get_field(cqe->byte_32, + V2_CQE_BYTE_32_PORTN_M, V2_CQE_BYTE_32_PORTN_S); + wc->pkey_index = 0; + memcpy(wc->smac, cqe->smac, 4); + wc->smac[4] = roce_get_field(cqe->byte_28, + V2_CQE_BYTE_28_SMAC_4_M, + V2_CQE_BYTE_28_SMAC_4_S); + wc->smac[5] = roce_get_field(cqe->byte_28, + V2_CQE_BYTE_28_SMAC_5_M, + V2_CQE_BYTE_28_SMAC_5_S); + wc->vlan_id = 0xffff; + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); + wc->network_hdr_type = roce_get_field(cqe->byte_28, + V2_CQE_BYTE_28_PORT_TYPE_M, + V2_CQE_BYTE_28_PORT_TYPE_S); } return 0; @@ -1859,8 +2115,39 @@ static int hns_roce_v2_qp_modify(struct hns_roce_dev *hr_dev, return ret; } +static void set_access_flags(struct hns_roce_qp *hr_qp, + struct hns_roce_v2_qp_context *context, + struct hns_roce_v2_qp_context *qpc_mask, + const struct ib_qp_attr *attr, int attr_mask) +{ + u8 dest_rd_atomic; + u32 access_flags; + + dest_rd_atomic = !!(attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) ? + attr->max_dest_rd_atomic : hr_qp->resp_depth; + + access_flags = !!(attr_mask & IB_QP_ACCESS_FLAGS) ? + attr->qp_access_flags : hr_qp->atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S, + !!(access_flags & IB_ACCESS_REMOTE_READ)); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S, 0); + + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RWE_S, + !!(access_flags & IB_ACCESS_REMOTE_WRITE)); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RWE_S, 0); + + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, + !!(access_flags & IB_ACCESS_REMOTE_ATOMIC)); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, 0); +} + static void modify_qp_reset_to_init(struct ib_qp *ibqp, const struct ib_qp_attr *attr, + int attr_mask, struct hns_roce_v2_qp_context *context, struct hns_roce_v2_qp_context *qpc_mask) { @@ -1877,9 +2164,18 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M, V2_QPC_BYTE_4_TST_S, 0); - roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M, - V2_QPC_BYTE_4_SGE_SHIFT_S, hr_qp->sq.max_gs > 2 ? - ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0); + if (ibqp->qp_type == IB_QPT_GSI) + roce_set_field(context->byte_4_sqpn_tst, + V2_QPC_BYTE_4_SGE_SHIFT_M, + V2_QPC_BYTE_4_SGE_SHIFT_S, + ilog2((unsigned int)hr_qp->sge.sge_cnt)); + else + roce_set_field(context->byte_4_sqpn_tst, + V2_QPC_BYTE_4_SGE_SHIFT_M, + V2_QPC_BYTE_4_SGE_SHIFT_S, + hr_qp->sq.max_gs > 2 ? + ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0); + roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M, V2_QPC_BYTE_4_SGE_SHIFT_S, 0); @@ -1944,18 +2240,13 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CNP_TX_FLAG_S, 0); roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CE_FLAG_S, 0); - roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S, - !!(attr->qp_access_flags & IB_ACCESS_REMOTE_READ)); - roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S, 0); - - roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RWE_S, - !!(attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)); - roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RWE_S, 0); - - roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, - !!(attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)); - roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, 0); + if (attr_mask & IB_QP_QKEY) { + context->qkey_xrcd = attr->qkey; + qpc_mask->qkey_xrcd = 0; + hr_qp->qkey = attr->qkey; + } + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S, 1); roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S, 0); roce_set_field(context->byte_80_rnr_rx_cqn, V2_QPC_BYTE_80_RX_CQN_M, @@ -2176,9 +2467,17 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M, V2_QPC_BYTE_4_TST_S, 0); - roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M, - V2_QPC_BYTE_4_SGE_SHIFT_S, hr_qp->sq.max_gs > 2 ? - ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0); + if (ibqp->qp_type == IB_QPT_GSI) + roce_set_field(context->byte_4_sqpn_tst, + V2_QPC_BYTE_4_SGE_SHIFT_M, + V2_QPC_BYTE_4_SGE_SHIFT_S, + ilog2((unsigned int)hr_qp->sge.sge_cnt)); + else + roce_set_field(context->byte_4_sqpn_tst, + V2_QPC_BYTE_4_SGE_SHIFT_M, + V2_QPC_BYTE_4_SGE_SHIFT_S, hr_qp->sq.max_gs > 2 ? + ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0); + roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M, V2_QPC_BYTE_4_SGE_SHIFT_S, 0); @@ -2239,7 +2538,7 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp, V2_QPC_BYTE_80_RX_CQN_S, 0); roce_set_field(context->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M, - V2_QPC_BYTE_252_TX_CQN_S, to_hr_cq(ibqp->recv_cq)->cqn); + V2_QPC_BYTE_252_TX_CQN_S, to_hr_cq(ibqp->send_cq)->cqn); roce_set_field(qpc_mask->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M, V2_QPC_BYTE_252_TX_CQN_S, 0); @@ -2255,10 +2554,10 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp, V2_QPC_BYTE_76_SRQN_M, V2_QPC_BYTE_76_SRQN_S, 0); } - if (attr_mask & IB_QP_PKEY_INDEX) - context->qkey_xrcd = attr->pkey_index; - else - context->qkey_xrcd = hr_qp->pkey_index; + if (attr_mask & IB_QP_QKEY) { + context->qkey_xrcd = attr->qkey; + qpc_mask->qkey_xrcd = 0; + } roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M, V2_QPC_BYTE_4_SQPN_S, hr_qp->qpn); @@ -2354,7 +2653,8 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(context->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_SGE_HOP_NUM_M, V2_QPC_BYTE_20_SGE_HOP_NUM_S, - hr_qp->sq.max_gs > 2 ? hr_dev->caps.mtt_hop_num : 0); + ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ? + hr_dev->caps.mtt_hop_num : 0); roce_set_field(qpc_mask->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_SGE_HOP_NUM_M, V2_QPC_BYTE_20_SGE_HOP_NUM_S, 0); @@ -2463,11 +2763,14 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_LBI_S, 0); } - roce_set_field(context->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M, - V2_QPC_BYTE_140_RR_MAX_S, - ilog2((unsigned int)attr->max_dest_rd_atomic)); - roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M, - V2_QPC_BYTE_140_RR_MAX_S, 0); + if ((attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) && + attr->max_dest_rd_atomic) { + roce_set_field(context->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M, + V2_QPC_BYTE_140_RR_MAX_S, + fls(attr->max_dest_rd_atomic - 1)); + roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M, + V2_QPC_BYTE_140_RR_MAX_S, 0); + } roce_set_field(context->byte_56_dqpn_err, V2_QPC_BYTE_56_DQPN_M, V2_QPC_BYTE_56_DQPN_S, attr->dest_qp_num); @@ -2511,8 +2814,13 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M, V2_QPC_BYTE_24_TC_S, 0); - roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M, - V2_QPC_BYTE_24_MTU_S, attr->path_mtu); + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_UD) + roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M, + V2_QPC_BYTE_24_MTU_S, IB_MTU_4096); + else + roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M, + V2_QPC_BYTE_24_MTU_S, attr->path_mtu); + roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M, V2_QPC_BYTE_24_MTU_S, 0); @@ -2557,12 +2865,6 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, V2_QPC_BYTE_168_LP_SGEN_INI_M, V2_QPC_BYTE_168_LP_SGEN_INI_S, 0); - roce_set_field(context->byte_208_irrl, V2_QPC_BYTE_208_SR_MAX_M, - V2_QPC_BYTE_208_SR_MAX_S, - ilog2((unsigned int)attr->max_rd_atomic)); - roce_set_field(qpc_mask->byte_208_irrl, V2_QPC_BYTE_208_SR_MAX_M, - V2_QPC_BYTE_208_SR_MAX_S, 0); - roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_SL_M, V2_QPC_BYTE_28_SL_S, rdma_ah_get_sl(&attr->ah_attr)); roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_SL_M, @@ -2625,13 +2927,14 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, 0); page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); - context->sq_cur_sge_blk_addr = hr_qp->sq.max_gs > 2 ? + context->sq_cur_sge_blk_addr = + ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ? ((u32)(mtts[hr_qp->sge.offset / page_size] >> PAGE_ADDR_SHIFT)) : 0; roce_set_field(context->byte_184_irrl_idx, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, - hr_qp->sq.max_gs > 2 ? + ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ? (mtts[hr_qp->sge.offset / page_size] >> (32 + PAGE_ADDR_SHIFT)) : 0); qpc_mask->sq_cur_sge_blk_addr = 0; @@ -2766,6 +3069,14 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_196_sq_psn, V2_QPC_BYTE_196_SQ_MAX_PSN_M, V2_QPC_BYTE_196_SQ_MAX_PSN_S, 0); + if ((attr_mask & IB_QP_MAX_QP_RD_ATOMIC) && attr->max_rd_atomic) { + roce_set_field(context->byte_208_irrl, V2_QPC_BYTE_208_SR_MAX_M, + V2_QPC_BYTE_208_SR_MAX_S, + fls(attr->max_rd_atomic - 1)); + roce_set_field(qpc_mask->byte_208_irrl, + V2_QPC_BYTE_208_SR_MAX_M, + V2_QPC_BYTE_208_SR_MAX_S, 0); + } return 0; } @@ -2794,7 +3105,8 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, */ memset(qpc_mask, 0xff, sizeof(*qpc_mask)); if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { - modify_qp_reset_to_init(ibqp, attr, context, qpc_mask); + modify_qp_reset_to_init(ibqp, attr, attr_mask, context, + qpc_mask); } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) { modify_qp_init_to_init(ibqp, attr, attr_mask, context, qpc_mask); @@ -2829,6 +3141,9 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, goto out; } + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) + set_access_flags(hr_qp, context, qpc_mask, attr, attr_mask); + /* Every status migrate must change state */ roce_set_field(context->byte_60_qpst_mapid, V2_QPC_BYTE_60_QP_ST_M, V2_QPC_BYTE_60_QP_ST_S, new_state); @@ -2845,6 +3160,9 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, hr_qp->state = new_state; + if (attr_mask & IB_QP_ACCESS_FLAGS) + hr_qp->atomic_rd_en = attr->qp_access_flags; + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) hr_qp->resp_depth = attr->max_dest_rd_atomic; if (attr_mask & IB_QP_PORT) { @@ -3098,6 +3416,11 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf); } + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) { + kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list); + kfree(hr_qp->rq_inl_buf.wqe_list); + } + return 0; } @@ -3162,6 +3485,1146 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) return ret; } +static void set_eq_cons_index_v2(struct hns_roce_eq *eq) +{ + u32 doorbell[2]; + + doorbell[0] = 0; + doorbell[1] = 0; + + if (eq->type_flag == HNS_ROCE_AEQ) { + roce_set_field(doorbell[0], HNS_ROCE_V2_EQ_DB_CMD_M, + HNS_ROCE_V2_EQ_DB_CMD_S, + eq->arm_st == HNS_ROCE_V2_EQ_ALWAYS_ARMED ? + HNS_ROCE_EQ_DB_CMD_AEQ : + HNS_ROCE_EQ_DB_CMD_AEQ_ARMED); + } else { + roce_set_field(doorbell[0], HNS_ROCE_V2_EQ_DB_TAG_M, + HNS_ROCE_V2_EQ_DB_TAG_S, eq->eqn); + + roce_set_field(doorbell[0], HNS_ROCE_V2_EQ_DB_CMD_M, + HNS_ROCE_V2_EQ_DB_CMD_S, + eq->arm_st == HNS_ROCE_V2_EQ_ALWAYS_ARMED ? + HNS_ROCE_EQ_DB_CMD_CEQ : + HNS_ROCE_EQ_DB_CMD_CEQ_ARMED); + } + + roce_set_field(doorbell[1], HNS_ROCE_V2_EQ_DB_PARA_M, + HNS_ROCE_V2_EQ_DB_PARA_S, + (eq->cons_index & HNS_ROCE_V2_CONS_IDX_M)); + + hns_roce_write64_k(doorbell, eq->doorbell); +} + +static void hns_roce_v2_wq_catas_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, + u32 qpn) +{ + struct device *dev = hr_dev->dev; + int sub_type; + + dev_warn(dev, "Local work queue catastrophic error.\n"); + sub_type = roce_get_field(aeqe->asyn, HNS_ROCE_V2_AEQE_SUB_TYPE_M, + HNS_ROCE_V2_AEQE_SUB_TYPE_S); + switch (sub_type) { + case HNS_ROCE_LWQCE_QPC_ERROR: + dev_warn(dev, "QP %d, QPC error.\n", qpn); + break; + case HNS_ROCE_LWQCE_MTU_ERROR: + dev_warn(dev, "QP %d, MTU error.\n", qpn); + break; + case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR: + dev_warn(dev, "QP %d, WQE BA addr error.\n", qpn); + break; + case HNS_ROCE_LWQCE_WQE_ADDR_ERROR: + dev_warn(dev, "QP %d, WQE addr error.\n", qpn); + break; + case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR: + dev_warn(dev, "QP %d, WQE shift error.\n", qpn); + break; + default: + dev_err(dev, "Unhandled sub_event type %d.\n", sub_type); + break; + } +} + +static void hns_roce_v2_local_wq_access_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, u32 qpn) +{ + struct device *dev = hr_dev->dev; + int sub_type; + + dev_warn(dev, "Local access violation work queue error.\n"); + sub_type = roce_get_field(aeqe->asyn, HNS_ROCE_V2_AEQE_SUB_TYPE_M, + HNS_ROCE_V2_AEQE_SUB_TYPE_S); + switch (sub_type) { + case HNS_ROCE_LAVWQE_R_KEY_VIOLATION: + dev_warn(dev, "QP %d, R_key violation.\n", qpn); + break; + case HNS_ROCE_LAVWQE_LENGTH_ERROR: + dev_warn(dev, "QP %d, length error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_VA_ERROR: + dev_warn(dev, "QP %d, VA error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_PD_ERROR: + dev_err(dev, "QP %d, PD error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_RW_ACC_ERROR: + dev_warn(dev, "QP %d, rw acc error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_KEY_STATE_ERROR: + dev_warn(dev, "QP %d, key state error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR: + dev_warn(dev, "QP %d, MR operation error.\n", qpn); + break; + default: + dev_err(dev, "Unhandled sub_event type %d.\n", sub_type); + break; + } +} + +static void hns_roce_v2_qp_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, + int event_type) +{ + struct device *dev = hr_dev->dev; + u32 qpn; + + qpn = roce_get_field(aeqe->event.qp_event.qp, + HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M, + HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S); + + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_COMM_EST: + dev_warn(dev, "Communication established.\n"); + break; + case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: + dev_warn(dev, "Send queue drained.\n"); + break; + case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + hns_roce_v2_wq_catas_err_handle(hr_dev, aeqe, qpn); + break; + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + dev_warn(dev, "Invalid request local work queue error.\n"); + break; + case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + hns_roce_v2_local_wq_access_err_handle(hr_dev, aeqe, qpn); + break; + default: + break; + } + + hns_roce_qp_event(hr_dev, qpn, event_type); +} + +static void hns_roce_v2_cq_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, + int event_type) +{ + struct device *dev = hr_dev->dev; + u32 cqn; + + cqn = roce_get_field(aeqe->event.cq_event.cq, + HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M, + HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S); + + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: + dev_warn(dev, "CQ 0x%x access err.\n", cqn); + break; + case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: + dev_warn(dev, "CQ 0x%x overflow\n", cqn); + break; + default: + break; + } + + hns_roce_cq_event(hr_dev, cqn, event_type); +} + +static struct hns_roce_aeqe *get_aeqe_v2(struct hns_roce_eq *eq, u32 entry) +{ + u32 buf_chk_sz; + unsigned long off; + + buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT); + off = (entry & (eq->entries - 1)) * HNS_ROCE_AEQ_ENTRY_SIZE; + + return (struct hns_roce_aeqe *)((char *)(eq->buf_list->buf) + + off % buf_chk_sz); +} + +static struct hns_roce_aeqe *mhop_get_aeqe(struct hns_roce_eq *eq, u32 entry) +{ + u32 buf_chk_sz; + unsigned long off; + + buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT); + + off = (entry & (eq->entries - 1)) * HNS_ROCE_AEQ_ENTRY_SIZE; + + if (eq->hop_num == HNS_ROCE_HOP_NUM_0) + return (struct hns_roce_aeqe *)((u8 *)(eq->bt_l0) + + off % buf_chk_sz); + else + return (struct hns_roce_aeqe *)((u8 *) + (eq->buf[off / buf_chk_sz]) + off % buf_chk_sz); +} + +static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq) +{ + struct hns_roce_aeqe *aeqe; + + if (!eq->hop_num) + aeqe = get_aeqe_v2(eq, eq->cons_index); + else + aeqe = mhop_get_aeqe(eq, eq->cons_index); + + return (roce_get_bit(aeqe->asyn, HNS_ROCE_V2_AEQ_AEQE_OWNER_S) ^ + !!(eq->cons_index & eq->entries)) ? aeqe : NULL; +} + +static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + struct device *dev = hr_dev->dev; + struct hns_roce_aeqe *aeqe; + int aeqe_found = 0; + int event_type; + + while ((aeqe = next_aeqe_sw_v2(eq))) { + + /* Make sure we read AEQ entry after we have checked the + * ownership bit + */ + dma_rmb(); + + event_type = roce_get_field(aeqe->asyn, + HNS_ROCE_V2_AEQE_EVENT_TYPE_M, + HNS_ROCE_V2_AEQE_EVENT_TYPE_S); + + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_PATH_MIG: + dev_warn(dev, "Path migrated succeeded.\n"); + break; + case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: + dev_warn(dev, "Path migration failed.\n"); + break; + case HNS_ROCE_EVENT_TYPE_COMM_EST: + case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: + case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + hns_roce_v2_qp_err_handle(hr_dev, aeqe, event_type); + break; + case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: + case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: + case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: + dev_warn(dev, "SRQ not support.\n"); + break; + case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: + case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: + hns_roce_v2_cq_err_handle(hr_dev, aeqe, event_type); + break; + case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: + dev_warn(dev, "DB overflow.\n"); + break; + case HNS_ROCE_EVENT_TYPE_MB: + hns_roce_cmd_event(hr_dev, + le16_to_cpu(aeqe->event.cmd.token), + aeqe->event.cmd.status, + le64_to_cpu(aeqe->event.cmd.out_param)); + break; + case HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW: + dev_warn(dev, "CEQ overflow.\n"); + break; + case HNS_ROCE_EVENT_TYPE_FLR: + dev_warn(dev, "Function level reset.\n"); + break; + default: + dev_err(dev, "Unhandled event %d on EQ %d at idx %u.\n", + event_type, eq->eqn, eq->cons_index); + break; + }; + + ++eq->cons_index; + aeqe_found = 1; + + if (eq->cons_index > (2 * eq->entries - 1)) { + dev_warn(dev, "cons_index overflow, set back to 0.\n"); + eq->cons_index = 0; + } + } + + set_eq_cons_index_v2(eq); + return aeqe_found; +} + +static struct hns_roce_ceqe *get_ceqe_v2(struct hns_roce_eq *eq, u32 entry) +{ + u32 buf_chk_sz; + unsigned long off; + + buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT); + off = (entry & (eq->entries - 1)) * HNS_ROCE_CEQ_ENTRY_SIZE; + + return (struct hns_roce_ceqe *)((char *)(eq->buf_list->buf) + + off % buf_chk_sz); +} + +static struct hns_roce_ceqe *mhop_get_ceqe(struct hns_roce_eq *eq, u32 entry) +{ + u32 buf_chk_sz; + unsigned long off; + + buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT); + + off = (entry & (eq->entries - 1)) * HNS_ROCE_CEQ_ENTRY_SIZE; + + if (eq->hop_num == HNS_ROCE_HOP_NUM_0) + return (struct hns_roce_ceqe *)((u8 *)(eq->bt_l0) + + off % buf_chk_sz); + else + return (struct hns_roce_ceqe *)((u8 *)(eq->buf[off / + buf_chk_sz]) + off % buf_chk_sz); +} + +static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) +{ + struct hns_roce_ceqe *ceqe; + + if (!eq->hop_num) + ceqe = get_ceqe_v2(eq, eq->cons_index); + else + ceqe = mhop_get_ceqe(eq, eq->cons_index); + + return (!!(roce_get_bit(ceqe->comp, HNS_ROCE_V2_CEQ_CEQE_OWNER_S))) ^ + (!!(eq->cons_index & eq->entries)) ? ceqe : NULL; +} + +static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + struct device *dev = hr_dev->dev; + struct hns_roce_ceqe *ceqe; + int ceqe_found = 0; + u32 cqn; + + while ((ceqe = next_ceqe_sw_v2(eq))) { + + /* Make sure we read CEQ entry after we have checked the + * ownership bit + */ + dma_rmb(); + + cqn = roce_get_field(ceqe->comp, + HNS_ROCE_V2_CEQE_COMP_CQN_M, + HNS_ROCE_V2_CEQE_COMP_CQN_S); + + hns_roce_cq_completion(hr_dev, cqn); + + ++eq->cons_index; + ceqe_found = 1; + + if (eq->cons_index > (2 * eq->entries - 1)) { + dev_warn(dev, "cons_index overflow, set back to 0.\n"); + eq->cons_index = 0; + } + } + + set_eq_cons_index_v2(eq); + + return ceqe_found; +} + +static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) +{ + struct hns_roce_eq *eq = eq_ptr; + struct hns_roce_dev *hr_dev = eq->hr_dev; + int int_work = 0; + + if (eq->type_flag == HNS_ROCE_CEQ) + /* Completion event interrupt */ + int_work = hns_roce_v2_ceq_int(hr_dev, eq); + else + /* Asychronous event interrupt */ + int_work = hns_roce_v2_aeq_int(hr_dev, eq); + + return IRQ_RETVAL(int_work); +} + +static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) +{ + struct hns_roce_dev *hr_dev = dev_id; + struct device *dev = hr_dev->dev; + int int_work = 0; + u32 int_st; + u32 int_en; + + /* Abnormal interrupt */ + int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG); + int_en = roce_read(hr_dev, ROCEE_VF_ABN_INT_EN_REG); + + if (roce_get_bit(int_st, HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S)) { + dev_err(dev, "AEQ overflow!\n"); + + roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S, 1); + roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); + + roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1); + roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); + + int_work = 1; + } else if (roce_get_bit(int_st, HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S)) { + dev_err(dev, "BUS ERR!\n"); + + roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S, 1); + roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); + + roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1); + roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); + + int_work = 1; + } else if (roce_get_bit(int_st, HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S)) { + dev_err(dev, "OTHER ERR!\n"); + + roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S, 1); + roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); + + roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1); + roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); + + int_work = 1; + } else + dev_err(dev, "There is no abnormal irq found!\n"); + + return IRQ_RETVAL(int_work); +} + +static void hns_roce_v2_int_mask_enable(struct hns_roce_dev *hr_dev, + int eq_num, int enable_flag) +{ + int i; + + if (enable_flag == EQ_ENABLE) { + for (i = 0; i < eq_num; i++) + roce_write(hr_dev, ROCEE_VF_EVENT_INT_EN_REG + + i * EQ_REG_OFFSET, + HNS_ROCE_V2_VF_EVENT_INT_EN_M); + + roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, + HNS_ROCE_V2_VF_ABN_INT_EN_M); + roce_write(hr_dev, ROCEE_VF_ABN_INT_CFG_REG, + HNS_ROCE_V2_VF_ABN_INT_CFG_M); + } else { + for (i = 0; i < eq_num; i++) + roce_write(hr_dev, ROCEE_VF_EVENT_INT_EN_REG + + i * EQ_REG_OFFSET, + HNS_ROCE_V2_VF_EVENT_INT_EN_M & 0x0); + + roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, + HNS_ROCE_V2_VF_ABN_INT_EN_M & 0x0); + roce_write(hr_dev, ROCEE_VF_ABN_INT_CFG_REG, + HNS_ROCE_V2_VF_ABN_INT_CFG_M & 0x0); + } +} + +static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, int eqn) +{ + struct device *dev = hr_dev->dev; + int ret; + + if (eqn < hr_dev->caps.num_comp_vectors) + ret = hns_roce_cmd_mbox(hr_dev, 0, 0, eqn & HNS_ROCE_V2_EQN_M, + 0, HNS_ROCE_CMD_DESTROY_CEQC, + HNS_ROCE_CMD_TIMEOUT_MSECS); + else + ret = hns_roce_cmd_mbox(hr_dev, 0, 0, eqn & HNS_ROCE_V2_EQN_M, + 0, HNS_ROCE_CMD_DESTROY_AEQC, + HNS_ROCE_CMD_TIMEOUT_MSECS); + if (ret) + dev_err(dev, "[mailbox cmd] destroy eqc(%d) failed.\n", eqn); +} + +static void hns_roce_mhop_free_eq(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + struct device *dev = hr_dev->dev; + u64 idx; + u64 size; + u32 buf_chk_sz; + u32 bt_chk_sz; + u32 mhop_num; + int eqe_alloc; + int ba_num; + int i = 0; + int j = 0; + + mhop_num = hr_dev->caps.eqe_hop_num; + buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT); + bt_chk_sz = 1 << (hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT); + ba_num = (PAGE_ALIGN(eq->entries * eq->eqe_size) + buf_chk_sz - 1) / + buf_chk_sz; + + /* hop_num = 0 */ + if (mhop_num == HNS_ROCE_HOP_NUM_0) { + dma_free_coherent(dev, (unsigned int)(eq->entries * + eq->eqe_size), eq->bt_l0, eq->l0_dma); + return; + } + + /* hop_num = 1 or hop = 2 */ + dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma); + if (mhop_num == 1) { + for (i = 0; i < eq->l0_last_num; i++) { + if (i == eq->l0_last_num - 1) { + eqe_alloc = i * (buf_chk_sz / eq->eqe_size); + size = (eq->entries - eqe_alloc) * eq->eqe_size; + dma_free_coherent(dev, size, eq->buf[i], + eq->buf_dma[i]); + break; + } + dma_free_coherent(dev, buf_chk_sz, eq->buf[i], + eq->buf_dma[i]); + } + } else if (mhop_num == 2) { + for (i = 0; i < eq->l0_last_num; i++) { + dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i], + eq->l1_dma[i]); + + for (j = 0; j < bt_chk_sz / 8; j++) { + idx = i * (bt_chk_sz / 8) + j; + if ((i == eq->l0_last_num - 1) + && j == eq->l1_last_num - 1) { + eqe_alloc = (buf_chk_sz / eq->eqe_size) + * idx; + size = (eq->entries - eqe_alloc) + * eq->eqe_size; + dma_free_coherent(dev, size, + eq->buf[idx], + eq->buf_dma[idx]); + break; + } + dma_free_coherent(dev, buf_chk_sz, eq->buf[idx], + eq->buf_dma[idx]); + } + } + } + kfree(eq->buf_dma); + kfree(eq->buf); + kfree(eq->l1_dma); + kfree(eq->bt_l1); + eq->buf_dma = NULL; + eq->buf = NULL; + eq->l1_dma = NULL; + eq->bt_l1 = NULL; +} + +static void hns_roce_v2_free_eq(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + u32 buf_chk_sz; + + buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT); + + if (hr_dev->caps.eqe_hop_num) { + hns_roce_mhop_free_eq(hr_dev, eq); + return; + } + + if (eq->buf_list) + dma_free_coherent(hr_dev->dev, buf_chk_sz, + eq->buf_list->buf, eq->buf_list->map); +} + +static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq, + void *mb_buf) +{ + struct hns_roce_eq_context *eqc; + + eqc = mb_buf; + memset(eqc, 0, sizeof(struct hns_roce_eq_context)); + + /* init eqc */ + eq->doorbell = hr_dev->reg_base + ROCEE_VF_EQ_DB_CFG0_REG; + eq->hop_num = hr_dev->caps.eqe_hop_num; + eq->cons_index = 0; + eq->over_ignore = HNS_ROCE_V2_EQ_OVER_IGNORE_0; + eq->coalesce = HNS_ROCE_V2_EQ_COALESCE_0; + eq->arm_st = HNS_ROCE_V2_EQ_ALWAYS_ARMED; + eq->eqe_ba_pg_sz = hr_dev->caps.eqe_ba_pg_sz; + eq->eqe_buf_pg_sz = hr_dev->caps.eqe_buf_pg_sz; + eq->shift = ilog2((unsigned int)eq->entries); + + if (!eq->hop_num) + eq->eqe_ba = eq->buf_list->map; + else + eq->eqe_ba = eq->l0_dma; + + /* set eqc state */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_EQ_ST_M, + HNS_ROCE_EQC_EQ_ST_S, + HNS_ROCE_V2_EQ_STATE_VALID); + + /* set eqe hop num */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_HOP_NUM_M, + HNS_ROCE_EQC_HOP_NUM_S, eq->hop_num); + + /* set eqc over_ignore */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_OVER_IGNORE_M, + HNS_ROCE_EQC_OVER_IGNORE_S, eq->over_ignore); + + /* set eqc coalesce */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_COALESCE_M, + HNS_ROCE_EQC_COALESCE_S, eq->coalesce); + + /* set eqc arm_state */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_ARM_ST_M, + HNS_ROCE_EQC_ARM_ST_S, eq->arm_st); + + /* set eqn */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_EQN_M, + HNS_ROCE_EQC_EQN_S, eq->eqn); + + /* set eqe_cnt */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_EQE_CNT_M, + HNS_ROCE_EQC_EQE_CNT_S, + HNS_ROCE_EQ_INIT_EQE_CNT); + + /* set eqe_ba_pg_sz */ + roce_set_field(eqc->byte_8, + HNS_ROCE_EQC_BA_PG_SZ_M, + HNS_ROCE_EQC_BA_PG_SZ_S, eq->eqe_ba_pg_sz); + + /* set eqe_buf_pg_sz */ + roce_set_field(eqc->byte_8, + HNS_ROCE_EQC_BUF_PG_SZ_M, + HNS_ROCE_EQC_BUF_PG_SZ_S, eq->eqe_buf_pg_sz); + + /* set eq_producer_idx */ + roce_set_field(eqc->byte_8, + HNS_ROCE_EQC_PROD_INDX_M, + HNS_ROCE_EQC_PROD_INDX_S, + HNS_ROCE_EQ_INIT_PROD_IDX); + + /* set eq_max_cnt */ + roce_set_field(eqc->byte_12, + HNS_ROCE_EQC_MAX_CNT_M, + HNS_ROCE_EQC_MAX_CNT_S, eq->eq_max_cnt); + + /* set eq_period */ + roce_set_field(eqc->byte_12, + HNS_ROCE_EQC_PERIOD_M, + HNS_ROCE_EQC_PERIOD_S, eq->eq_period); + + /* set eqe_report_timer */ + roce_set_field(eqc->eqe_report_timer, + HNS_ROCE_EQC_REPORT_TIMER_M, + HNS_ROCE_EQC_REPORT_TIMER_S, + HNS_ROCE_EQ_INIT_REPORT_TIMER); + + /* set eqe_ba [34:3] */ + roce_set_field(eqc->eqe_ba0, + HNS_ROCE_EQC_EQE_BA_L_M, + HNS_ROCE_EQC_EQE_BA_L_S, eq->eqe_ba >> 3); + + /* set eqe_ba [64:35] */ + roce_set_field(eqc->eqe_ba1, + HNS_ROCE_EQC_EQE_BA_H_M, + HNS_ROCE_EQC_EQE_BA_H_S, eq->eqe_ba >> 35); + + /* set eq shift */ + roce_set_field(eqc->byte_28, + HNS_ROCE_EQC_SHIFT_M, + HNS_ROCE_EQC_SHIFT_S, eq->shift); + + /* set eq MSI_IDX */ + roce_set_field(eqc->byte_28, + HNS_ROCE_EQC_MSI_INDX_M, + HNS_ROCE_EQC_MSI_INDX_S, + HNS_ROCE_EQ_INIT_MSI_IDX); + + /* set cur_eqe_ba [27:12] */ + roce_set_field(eqc->byte_28, + HNS_ROCE_EQC_CUR_EQE_BA_L_M, + HNS_ROCE_EQC_CUR_EQE_BA_L_S, eq->cur_eqe_ba >> 12); + + /* set cur_eqe_ba [59:28] */ + roce_set_field(eqc->byte_32, + HNS_ROCE_EQC_CUR_EQE_BA_M_M, + HNS_ROCE_EQC_CUR_EQE_BA_M_S, eq->cur_eqe_ba >> 28); + + /* set cur_eqe_ba [63:60] */ + roce_set_field(eqc->byte_36, + HNS_ROCE_EQC_CUR_EQE_BA_H_M, + HNS_ROCE_EQC_CUR_EQE_BA_H_S, eq->cur_eqe_ba >> 60); + + /* set eq consumer idx */ + roce_set_field(eqc->byte_36, + HNS_ROCE_EQC_CONS_INDX_M, + HNS_ROCE_EQC_CONS_INDX_S, + HNS_ROCE_EQ_INIT_CONS_IDX); + + /* set nex_eqe_ba[43:12] */ + roce_set_field(eqc->nxt_eqe_ba0, + HNS_ROCE_EQC_NXT_EQE_BA_L_M, + HNS_ROCE_EQC_NXT_EQE_BA_L_S, eq->nxt_eqe_ba >> 12); + + /* set nex_eqe_ba[63:44] */ + roce_set_field(eqc->nxt_eqe_ba1, + HNS_ROCE_EQC_NXT_EQE_BA_H_M, + HNS_ROCE_EQC_NXT_EQE_BA_H_S, eq->nxt_eqe_ba >> 44); +} + +static int hns_roce_mhop_alloc_eq(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + struct device *dev = hr_dev->dev; + int eq_alloc_done = 0; + int eq_buf_cnt = 0; + int eqe_alloc; + u32 buf_chk_sz; + u32 bt_chk_sz; + u32 mhop_num; + u64 size; + u64 idx; + int ba_num; + int bt_num; + int record_i; + int record_j; + int i = 0; + int j = 0; + + mhop_num = hr_dev->caps.eqe_hop_num; + buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT); + bt_chk_sz = 1 << (hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT); + + ba_num = (PAGE_ALIGN(eq->entries * eq->eqe_size) + buf_chk_sz - 1) + / buf_chk_sz; + bt_num = (ba_num + bt_chk_sz / 8 - 1) / (bt_chk_sz / 8); + + /* hop_num = 0 */ + if (mhop_num == HNS_ROCE_HOP_NUM_0) { + if (eq->entries > buf_chk_sz / eq->eqe_size) { + dev_err(dev, "eq entries %d is larger than buf_pg_sz!", + eq->entries); + return -EINVAL; + } + eq->bt_l0 = dma_alloc_coherent(dev, eq->entries * eq->eqe_size, + &(eq->l0_dma), GFP_KERNEL); + if (!eq->bt_l0) + return -ENOMEM; + + eq->cur_eqe_ba = eq->l0_dma; + eq->nxt_eqe_ba = 0; + + memset(eq->bt_l0, 0, eq->entries * eq->eqe_size); + + return 0; + } + + eq->buf_dma = kcalloc(ba_num, sizeof(*eq->buf_dma), GFP_KERNEL); + if (!eq->buf_dma) + return -ENOMEM; + eq->buf = kcalloc(ba_num, sizeof(*eq->buf), GFP_KERNEL); + if (!eq->buf) + goto err_kcalloc_buf; + + if (mhop_num == 2) { + eq->l1_dma = kcalloc(bt_num, sizeof(*eq->l1_dma), GFP_KERNEL); + if (!eq->l1_dma) + goto err_kcalloc_l1_dma; + + eq->bt_l1 = kcalloc(bt_num, sizeof(*eq->bt_l1), GFP_KERNEL); + if (!eq->bt_l1) + goto err_kcalloc_bt_l1; + } + + /* alloc L0 BT */ + eq->bt_l0 = dma_alloc_coherent(dev, bt_chk_sz, &eq->l0_dma, GFP_KERNEL); + if (!eq->bt_l0) + goto err_dma_alloc_l0; + + if (mhop_num == 1) { + if (ba_num > (bt_chk_sz / 8)) + dev_err(dev, "ba_num %d is too large for 1 hop\n", + ba_num); + + /* alloc buf */ + for (i = 0; i < bt_chk_sz / 8; i++) { + if (eq_buf_cnt + 1 < ba_num) { + size = buf_chk_sz; + } else { + eqe_alloc = i * (buf_chk_sz / eq->eqe_size); + size = (eq->entries - eqe_alloc) * eq->eqe_size; + } + eq->buf[i] = dma_alloc_coherent(dev, size, + &(eq->buf_dma[i]), + GFP_KERNEL); + if (!eq->buf[i]) + goto err_dma_alloc_buf; + + memset(eq->buf[i], 0, size); + *(eq->bt_l0 + i) = eq->buf_dma[i]; + + eq_buf_cnt++; + if (eq_buf_cnt >= ba_num) + break; + } + eq->cur_eqe_ba = eq->buf_dma[0]; + eq->nxt_eqe_ba = eq->buf_dma[1]; + + } else if (mhop_num == 2) { + /* alloc L1 BT and buf */ + for (i = 0; i < bt_chk_sz / 8; i++) { + eq->bt_l1[i] = dma_alloc_coherent(dev, bt_chk_sz, + &(eq->l1_dma[i]), + GFP_KERNEL); + if (!eq->bt_l1[i]) + goto err_dma_alloc_l1; + *(eq->bt_l0 + i) = eq->l1_dma[i]; + + for (j = 0; j < bt_chk_sz / 8; j++) { + idx = i * bt_chk_sz / 8 + j; + if (eq_buf_cnt + 1 < ba_num) { + size = buf_chk_sz; + } else { + eqe_alloc = (buf_chk_sz / eq->eqe_size) + * idx; + size = (eq->entries - eqe_alloc) + * eq->eqe_size; + } + eq->buf[idx] = dma_alloc_coherent(dev, size, + &(eq->buf_dma[idx]), + GFP_KERNEL); + if (!eq->buf[idx]) + goto err_dma_alloc_buf; + + memset(eq->buf[idx], 0, size); + *(eq->bt_l1[i] + j) = eq->buf_dma[idx]; + + eq_buf_cnt++; + if (eq_buf_cnt >= ba_num) { + eq_alloc_done = 1; + break; + } + } + + if (eq_alloc_done) + break; + } + eq->cur_eqe_ba = eq->buf_dma[0]; + eq->nxt_eqe_ba = eq->buf_dma[1]; + } + + eq->l0_last_num = i + 1; + if (mhop_num == 2) + eq->l1_last_num = j + 1; + + return 0; + +err_dma_alloc_l1: + dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma); + eq->bt_l0 = NULL; + eq->l0_dma = 0; + for (i -= 1; i >= 0; i--) { + dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i], + eq->l1_dma[i]); + + for (j = 0; j < bt_chk_sz / 8; j++) { + idx = i * bt_chk_sz / 8 + j; + dma_free_coherent(dev, buf_chk_sz, eq->buf[idx], + eq->buf_dma[idx]); + } + } + goto err_dma_alloc_l0; + +err_dma_alloc_buf: + dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma); + eq->bt_l0 = NULL; + eq->l0_dma = 0; + + if (mhop_num == 1) + for (i -= i; i >= 0; i--) + dma_free_coherent(dev, buf_chk_sz, eq->buf[i], + eq->buf_dma[i]); + else if (mhop_num == 2) { + record_i = i; + record_j = j; + for (; i >= 0; i--) { + dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i], + eq->l1_dma[i]); + + for (j = 0; j < bt_chk_sz / 8; j++) { + if (i == record_i && j >= record_j) + break; + + idx = i * bt_chk_sz / 8 + j; + dma_free_coherent(dev, buf_chk_sz, + eq->buf[idx], + eq->buf_dma[idx]); + } + } + } + +err_dma_alloc_l0: + kfree(eq->bt_l1); + eq->bt_l1 = NULL; + +err_kcalloc_bt_l1: + kfree(eq->l1_dma); + eq->l1_dma = NULL; + +err_kcalloc_l1_dma: + kfree(eq->buf); + eq->buf = NULL; + +err_kcalloc_buf: + kfree(eq->buf_dma); + eq->buf_dma = NULL; + + return -ENOMEM; +} + +static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq, + unsigned int eq_cmd) +{ + struct device *dev = hr_dev->dev; + struct hns_roce_cmd_mailbox *mailbox; + u32 buf_chk_sz = 0; + int ret; + + /* Allocate mailbox memory */ + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + if (!hr_dev->caps.eqe_hop_num) { + buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT); + + eq->buf_list = kzalloc(sizeof(struct hns_roce_buf_list), + GFP_KERNEL); + if (!eq->buf_list) { + ret = -ENOMEM; + goto free_cmd_mbox; + } + + eq->buf_list->buf = dma_alloc_coherent(dev, buf_chk_sz, + &(eq->buf_list->map), + GFP_KERNEL); + if (!eq->buf_list->buf) { + ret = -ENOMEM; + goto err_alloc_buf; + } + + memset(eq->buf_list->buf, 0, buf_chk_sz); + } else { + ret = hns_roce_mhop_alloc_eq(hr_dev, eq); + if (ret) { + ret = -ENOMEM; + goto free_cmd_mbox; + } + } + + hns_roce_config_eqc(hr_dev, eq, mailbox->buf); + + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, eq->eqn, 0, + eq_cmd, HNS_ROCE_CMD_TIMEOUT_MSECS); + if (ret) { + dev_err(dev, "[mailbox cmd] creat eqc failed.\n"); + goto err_cmd_mbox; + } + + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + + return 0; + +err_cmd_mbox: + if (!hr_dev->caps.eqe_hop_num) + dma_free_coherent(dev, buf_chk_sz, eq->buf_list->buf, + eq->buf_list->map); + else { + hns_roce_mhop_free_eq(hr_dev, eq); + goto free_cmd_mbox; + } + +err_alloc_buf: + kfree(eq->buf_list); + +free_cmd_mbox: + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + + return ret; +} + +static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_eq_table *eq_table = &hr_dev->eq_table; + struct device *dev = hr_dev->dev; + struct hns_roce_eq *eq; + unsigned int eq_cmd; + int irq_num; + int eq_num; + int other_num; + int comp_num; + int aeq_num; + int i, j, k; + int ret; + + other_num = hr_dev->caps.num_other_vectors; + comp_num = hr_dev->caps.num_comp_vectors; + aeq_num = hr_dev->caps.num_aeq_vectors; + + eq_num = comp_num + aeq_num; + irq_num = eq_num + other_num; + + eq_table->eq = kcalloc(eq_num, sizeof(*eq_table->eq), GFP_KERNEL); + if (!eq_table->eq) + return -ENOMEM; + + for (i = 0; i < irq_num; i++) { + hr_dev->irq_names[i] = kzalloc(HNS_ROCE_INT_NAME_LEN, + GFP_KERNEL); + if (!hr_dev->irq_names[i]) { + ret = -ENOMEM; + goto err_failed_kzalloc; + } + } + + /* create eq */ + for (j = 0; j < eq_num; j++) { + eq = &eq_table->eq[j]; + eq->hr_dev = hr_dev; + eq->eqn = j; + if (j < comp_num) { + /* CEQ */ + eq_cmd = HNS_ROCE_CMD_CREATE_CEQC; + eq->type_flag = HNS_ROCE_CEQ; + eq->entries = hr_dev->caps.ceqe_depth; + eq->eqe_size = HNS_ROCE_CEQ_ENTRY_SIZE; + eq->irq = hr_dev->irq[j + other_num + aeq_num]; + eq->eq_max_cnt = HNS_ROCE_CEQ_DEFAULT_BURST_NUM; + eq->eq_period = HNS_ROCE_CEQ_DEFAULT_INTERVAL; + } else { + /* AEQ */ + eq_cmd = HNS_ROCE_CMD_CREATE_AEQC; + eq->type_flag = HNS_ROCE_AEQ; + eq->entries = hr_dev->caps.aeqe_depth; + eq->eqe_size = HNS_ROCE_AEQ_ENTRY_SIZE; + eq->irq = hr_dev->irq[j - comp_num + other_num]; + eq->eq_max_cnt = HNS_ROCE_AEQ_DEFAULT_BURST_NUM; + eq->eq_period = HNS_ROCE_AEQ_DEFAULT_INTERVAL; + } + + ret = hns_roce_v2_create_eq(hr_dev, eq, eq_cmd); + if (ret) { + dev_err(dev, "eq create failed.\n"); + goto err_create_eq_fail; + } + } + + /* enable irq */ + hns_roce_v2_int_mask_enable(hr_dev, eq_num, EQ_ENABLE); + + /* irq contains: abnormal + AEQ + CEQ*/ + for (k = 0; k < irq_num; k++) + if (k < other_num) + snprintf((char *)hr_dev->irq_names[k], + HNS_ROCE_INT_NAME_LEN, "hns-abn-%d", k); + else if (k < (other_num + aeq_num)) + snprintf((char *)hr_dev->irq_names[k], + HNS_ROCE_INT_NAME_LEN, "hns-aeq-%d", + k - other_num); + else + snprintf((char *)hr_dev->irq_names[k], + HNS_ROCE_INT_NAME_LEN, "hns-ceq-%d", + k - other_num - aeq_num); + + for (k = 0; k < irq_num; k++) { + if (k < other_num) + ret = request_irq(hr_dev->irq[k], + hns_roce_v2_msix_interrupt_abn, + 0, hr_dev->irq_names[k], hr_dev); + + else if (k < (other_num + comp_num)) + ret = request_irq(eq_table->eq[k - other_num].irq, + hns_roce_v2_msix_interrupt_eq, + 0, hr_dev->irq_names[k + aeq_num], + &eq_table->eq[k - other_num]); + else + ret = request_irq(eq_table->eq[k - other_num].irq, + hns_roce_v2_msix_interrupt_eq, + 0, hr_dev->irq_names[k - comp_num], + &eq_table->eq[k - other_num]); + if (ret) { + dev_err(dev, "Request irq error!\n"); + goto err_request_irq_fail; + } + } + + return 0; + +err_request_irq_fail: + for (k -= 1; k >= 0; k--) + if (k < other_num) + free_irq(hr_dev->irq[k], hr_dev); + else + free_irq(eq_table->eq[k - other_num].irq, + &eq_table->eq[k - other_num]); + +err_create_eq_fail: + for (j -= 1; j >= 0; j--) + hns_roce_v2_free_eq(hr_dev, &eq_table->eq[j]); + +err_failed_kzalloc: + for (i -= 1; i >= 0; i--) + kfree(hr_dev->irq_names[i]); + kfree(eq_table->eq); + + return ret; +} + +static void hns_roce_v2_cleanup_eq_table(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_eq_table *eq_table = &hr_dev->eq_table; + int irq_num; + int eq_num; + int i; + + eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors; + irq_num = eq_num + hr_dev->caps.num_other_vectors; + + /* Disable irq */ + hns_roce_v2_int_mask_enable(hr_dev, eq_num, EQ_DISABLE); + + for (i = 0; i < hr_dev->caps.num_other_vectors; i++) + free_irq(hr_dev->irq[i], hr_dev); + + for (i = 0; i < eq_num; i++) { + hns_roce_v2_destroy_eqc(hr_dev, i); + + free_irq(eq_table->eq[i].irq, &eq_table->eq[i]); + + hns_roce_v2_free_eq(hr_dev, &eq_table->eq[i]); + } + + for (i = 0; i < irq_num; i++) + kfree(hr_dev->irq_names[i]); + + kfree(eq_table->eq); +} + static const struct hns_roce_hw hns_roce_hw_v2 = { .cmq_init = hns_roce_v2_cmq_init, .cmq_exit = hns_roce_v2_cmq_exit, @@ -3183,6 +4646,8 @@ static const struct hns_roce_hw hns_roce_hw_v2 = { .post_recv = hns_roce_v2_post_recv, .req_notify_cq = hns_roce_v2_req_notify_cq, .poll_cq = hns_roce_v2_poll_cq, + .init_eq = hns_roce_v2_init_eq_table, + .cleanup_eq = hns_roce_v2_cleanup_eq_table, }; static const struct pci_device_id hns_roce_hw_v2_pci_tbl[] = { @@ -3197,6 +4662,7 @@ static int hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev, struct hnae3_handle *handle) { const struct pci_device_id *id; + int i; id = pci_match_id(hns_roce_hw_v2_pci_tbl, hr_dev->pci_dev); if (!id) { @@ -3214,8 +4680,15 @@ static int hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev, hr_dev->iboe.netdevs[0] = handle->rinfo.netdev; hr_dev->iboe.phy_port[0] = 0; + addrconf_addr_eui48((u8 *)&hr_dev->ib_dev.node_guid, + hr_dev->iboe.netdevs[0]->dev_addr); + + for (i = 0; i < HNS_ROCE_V2_MAX_IRQ_NUM; i++) + hr_dev->irq[i] = pci_irq_vector(handle->pdev, + i + handle->rinfo.base_vector); + /* cmd issue mode: 0 is poll, 1 is event */ - hr_dev->cmd_mod = 0; + hr_dev->cmd_mod = 1; hr_dev->loop_idc = 0; return 0; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 04b7a51b8efb..960df095392a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -53,6 +53,10 @@ #define HNS_ROCE_V2_MAX_SQ_INLINE 0x20 #define HNS_ROCE_V2_UAR_NUM 256 #define HNS_ROCE_V2_PHY_UAR_NUM 1 +#define HNS_ROCE_V2_MAX_IRQ_NUM 65 +#define HNS_ROCE_V2_COMP_VEC_NUM 63 +#define HNS_ROCE_V2_AEQE_VEC_NUM 1 +#define HNS_ROCE_V2_ABNORMAL_VEC_NUM 1 #define HNS_ROCE_V2_MAX_MTPT_NUM 0x8000 #define HNS_ROCE_V2_MAX_MTT_SEGS 0x1000000 #define HNS_ROCE_V2_MAX_CQE_SEGS 0x1000000 @@ -78,6 +82,8 @@ #define HNS_ROCE_MTT_HOP_NUM 1 #define HNS_ROCE_CQE_HOP_NUM 1 #define HNS_ROCE_PBL_HOP_NUM 2 +#define HNS_ROCE_EQE_HOP_NUM 2 + #define HNS_ROCE_V2_GID_INDEX_NUM 256 #define HNS_ROCE_V2_TABLE_CHUNK_SIZE (1 << 18) @@ -105,6 +111,12 @@ (step_idx == 1 && hop_num == 1) || \ (step_idx == 2 && hop_num == 2)) +enum { + NO_ARMED = 0x0, + REG_NXT_CEQE = 0x2, + REG_NXT_SE_CEQE = 0x3 +}; + #define V2_CQ_DB_REQ_NOT_SOL 0 #define V2_CQ_DB_REQ_NOT 1 @@ -229,6 +241,9 @@ struct hns_roce_v2_cq_context { u32 cqe_report_timer; u32 byte_64_se_cqe_idx; }; +#define HNS_ROCE_V2_CQ_DEFAULT_BURST_NUM 0x0 +#define HNS_ROCE_V2_CQ_DEFAULT_INTERVAL 0x0 + #define V2_CQC_BYTE_4_CQ_ST_S 0 #define V2_CQC_BYTE_4_CQ_ST_M GENMASK(1, 0) @@ -747,11 +762,14 @@ struct hns_roce_v2_qp_context { struct hns_roce_v2_cqe { u32 byte_4; - u32 rkey_immtdata; + union { + __le32 rkey; + __be32 immtdata; + }; u32 byte_12; u32 byte_16; u32 byte_cnt; - u32 smac; + u8 smac[4]; u32 byte_28; u32 byte_32; }; @@ -901,6 +919,90 @@ struct hns_roce_v2_cq_db { #define V2_CQ_DB_PARAMETER_NOTIFY_S 24 +struct hns_roce_v2_ud_send_wqe { + u32 byte_4; + u32 msg_len; + u32 immtdata; + u32 byte_16; + u32 byte_20; + u32 byte_24; + u32 qkey; + u32 byte_32; + u32 byte_36; + u32 byte_40; + u32 dmac; + u32 byte_48; + u8 dgid[GID_LEN_V2]; + +}; +#define V2_UD_SEND_WQE_BYTE_4_OPCODE_S 0 +#define V2_UD_SEND_WQE_BYTE_4_OPCODE_M GENMASK(4, 0) + +#define V2_UD_SEND_WQE_BYTE_4_OWNER_S 7 + +#define V2_UD_SEND_WQE_BYTE_4_CQE_S 8 + +#define V2_UD_SEND_WQE_BYTE_4_SE_S 11 + +#define V2_UD_SEND_WQE_BYTE_16_PD_S 0 +#define V2_UD_SEND_WQE_BYTE_16_PD_M GENMASK(23, 0) + +#define V2_UD_SEND_WQE_BYTE_16_SGE_NUM_S 24 +#define V2_UD_SEND_WQE_BYTE_16_SGE_NUM_M GENMASK(31, 24) + +#define V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S 0 +#define V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M GENMASK(23, 0) + +#define V2_UD_SEND_WQE_BYTE_24_UDPSPN_S 16 +#define V2_UD_SEND_WQE_BYTE_24_UDPSPN_M GENMASK(31, 16) + +#define V2_UD_SEND_WQE_BYTE_32_DQPN_S 0 +#define V2_UD_SEND_WQE_BYTE_32_DQPN_M GENMASK(23, 0) + +#define V2_UD_SEND_WQE_BYTE_36_VLAN_S 0 +#define V2_UD_SEND_WQE_BYTE_36_VLAN_M GENMASK(15, 0) + +#define V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_S 16 +#define V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_M GENMASK(23, 16) + +#define V2_UD_SEND_WQE_BYTE_36_TCLASS_S 24 +#define V2_UD_SEND_WQE_BYTE_36_TCLASS_M GENMASK(31, 24) + +#define V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_S 0 +#define V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_M GENMASK(19, 0) + +#define V2_UD_SEND_WQE_BYTE_40_SL_S 20 +#define V2_UD_SEND_WQE_BYTE_40_SL_M GENMASK(23, 20) + +#define V2_UD_SEND_WQE_BYTE_40_PORTN_S 24 +#define V2_UD_SEND_WQE_BYTE_40_PORTN_M GENMASK(26, 24) + +#define V2_UD_SEND_WQE_BYTE_40_LBI_S 31 + +#define V2_UD_SEND_WQE_DMAC_0_S 0 +#define V2_UD_SEND_WQE_DMAC_0_M GENMASK(7, 0) + +#define V2_UD_SEND_WQE_DMAC_1_S 8 +#define V2_UD_SEND_WQE_DMAC_1_M GENMASK(15, 8) + +#define V2_UD_SEND_WQE_DMAC_2_S 16 +#define V2_UD_SEND_WQE_DMAC_2_M GENMASK(23, 16) + +#define V2_UD_SEND_WQE_DMAC_3_S 24 +#define V2_UD_SEND_WQE_DMAC_3_M GENMASK(31, 24) + +#define V2_UD_SEND_WQE_BYTE_48_DMAC_4_S 0 +#define V2_UD_SEND_WQE_BYTE_48_DMAC_4_M GENMASK(7, 0) + +#define V2_UD_SEND_WQE_BYTE_48_DMAC_5_S 8 +#define V2_UD_SEND_WQE_BYTE_48_DMAC_5_M GENMASK(15, 8) + +#define V2_UD_SEND_WQE_BYTE_48_SGID_INDX_S 16 +#define V2_UD_SEND_WQE_BYTE_48_SGID_INDX_M GENMASK(23, 16) + +#define V2_UD_SEND_WQE_BYTE_48_SMAC_INDX_S 24 +#define V2_UD_SEND_WQE_BYTE_48_SMAC_INDX_M GENMASK(31, 24) + struct hns_roce_v2_rc_send_wqe { u32 byte_4; u32 msg_len; @@ -1129,9 +1231,6 @@ struct hns_roce_cmq_desc { u32 data[6]; }; -#define ROCEE_VF_MB_CFG0_REG 0x40 -#define ROCEE_VF_MB_STATUS_REG 0x58 - #define HNS_ROCE_V2_GO_BIT_TIMEOUT_MSECS 10000 #define HNS_ROCE_HW_RUN_BIT_SHIFT 31 @@ -1174,4 +1273,178 @@ struct hns_roce_v2_priv { struct hns_roce_v2_cmq cmq; }; +struct hns_roce_eq_context { + u32 byte_4; + u32 byte_8; + u32 byte_12; + u32 eqe_report_timer; + u32 eqe_ba0; + u32 eqe_ba1; + u32 byte_28; + u32 byte_32; + u32 byte_36; + u32 nxt_eqe_ba0; + u32 nxt_eqe_ba1; + u32 rsv[5]; +}; + +#define HNS_ROCE_AEQ_DEFAULT_BURST_NUM 0x0 +#define HNS_ROCE_AEQ_DEFAULT_INTERVAL 0x0 +#define HNS_ROCE_CEQ_DEFAULT_BURST_NUM 0x0 +#define HNS_ROCE_CEQ_DEFAULT_INTERVAL 0x0 + +#define HNS_ROCE_V2_EQ_STATE_INVALID 0 +#define HNS_ROCE_V2_EQ_STATE_VALID 1 +#define HNS_ROCE_V2_EQ_STATE_OVERFLOW 2 +#define HNS_ROCE_V2_EQ_STATE_FAILURE 3 + +#define HNS_ROCE_V2_EQ_OVER_IGNORE_0 0 +#define HNS_ROCE_V2_EQ_OVER_IGNORE_1 1 + +#define HNS_ROCE_V2_EQ_COALESCE_0 0 +#define HNS_ROCE_V2_EQ_COALESCE_1 1 + +#define HNS_ROCE_V2_EQ_FIRED 0 +#define HNS_ROCE_V2_EQ_ARMED 1 +#define HNS_ROCE_V2_EQ_ALWAYS_ARMED 3 + +#define HNS_ROCE_EQ_INIT_EQE_CNT 0 +#define HNS_ROCE_EQ_INIT_PROD_IDX 0 +#define HNS_ROCE_EQ_INIT_REPORT_TIMER 0 +#define HNS_ROCE_EQ_INIT_MSI_IDX 0 +#define HNS_ROCE_EQ_INIT_CONS_IDX 0 +#define HNS_ROCE_EQ_INIT_NXT_EQE_BA 0 + +#define HNS_ROCE_V2_CEQ_CEQE_OWNER_S 31 +#define HNS_ROCE_V2_AEQ_AEQE_OWNER_S 31 + +#define HNS_ROCE_V2_COMP_EQE_NUM 0x1000 +#define HNS_ROCE_V2_ASYNC_EQE_NUM 0x1000 + +#define HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S 0 +#define HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S 1 +#define HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S 2 + +#define HNS_ROCE_EQ_DB_CMD_AEQ 0x0 +#define HNS_ROCE_EQ_DB_CMD_AEQ_ARMED 0x1 +#define HNS_ROCE_EQ_DB_CMD_CEQ 0x2 +#define HNS_ROCE_EQ_DB_CMD_CEQ_ARMED 0x3 + +#define EQ_ENABLE 1 +#define EQ_DISABLE 0 + +#define EQ_REG_OFFSET 0x4 + +#define HNS_ROCE_INT_NAME_LEN 32 +#define HNS_ROCE_V2_EQN_M GENMASK(23, 0) + +#define HNS_ROCE_V2_CONS_IDX_M GENMASK(23, 0) + +#define HNS_ROCE_V2_VF_ABN_INT_EN_S 0 +#define HNS_ROCE_V2_VF_ABN_INT_EN_M GENMASK(0, 0) +#define HNS_ROCE_V2_VF_ABN_INT_ST_M GENMASK(2, 0) +#define HNS_ROCE_V2_VF_ABN_INT_CFG_M GENMASK(2, 0) +#define HNS_ROCE_V2_VF_EVENT_INT_EN_M GENMASK(0, 0) + +/* WORD0 */ +#define HNS_ROCE_EQC_EQ_ST_S 0 +#define HNS_ROCE_EQC_EQ_ST_M GENMASK(1, 0) + +#define HNS_ROCE_EQC_HOP_NUM_S 2 +#define HNS_ROCE_EQC_HOP_NUM_M GENMASK(3, 2) + +#define HNS_ROCE_EQC_OVER_IGNORE_S 4 +#define HNS_ROCE_EQC_OVER_IGNORE_M GENMASK(4, 4) + +#define HNS_ROCE_EQC_COALESCE_S 5 +#define HNS_ROCE_EQC_COALESCE_M GENMASK(5, 5) + +#define HNS_ROCE_EQC_ARM_ST_S 6 +#define HNS_ROCE_EQC_ARM_ST_M GENMASK(7, 6) + +#define HNS_ROCE_EQC_EQN_S 8 +#define HNS_ROCE_EQC_EQN_M GENMASK(15, 8) + +#define HNS_ROCE_EQC_EQE_CNT_S 16 +#define HNS_ROCE_EQC_EQE_CNT_M GENMASK(31, 16) + +/* WORD1 */ +#define HNS_ROCE_EQC_BA_PG_SZ_S 0 +#define HNS_ROCE_EQC_BA_PG_SZ_M GENMASK(3, 0) + +#define HNS_ROCE_EQC_BUF_PG_SZ_S 4 +#define HNS_ROCE_EQC_BUF_PG_SZ_M GENMASK(7, 4) + +#define HNS_ROCE_EQC_PROD_INDX_S 8 +#define HNS_ROCE_EQC_PROD_INDX_M GENMASK(31, 8) + +/* WORD2 */ +#define HNS_ROCE_EQC_MAX_CNT_S 0 +#define HNS_ROCE_EQC_MAX_CNT_M GENMASK(15, 0) + +#define HNS_ROCE_EQC_PERIOD_S 16 +#define HNS_ROCE_EQC_PERIOD_M GENMASK(31, 16) + +/* WORD3 */ +#define HNS_ROCE_EQC_REPORT_TIMER_S 0 +#define HNS_ROCE_EQC_REPORT_TIMER_M GENMASK(31, 0) + +/* WORD4 */ +#define HNS_ROCE_EQC_EQE_BA_L_S 0 +#define HNS_ROCE_EQC_EQE_BA_L_M GENMASK(31, 0) + +/* WORD5 */ +#define HNS_ROCE_EQC_EQE_BA_H_S 0 +#define HNS_ROCE_EQC_EQE_BA_H_M GENMASK(28, 0) + +/* WORD6 */ +#define HNS_ROCE_EQC_SHIFT_S 0 +#define HNS_ROCE_EQC_SHIFT_M GENMASK(7, 0) + +#define HNS_ROCE_EQC_MSI_INDX_S 8 +#define HNS_ROCE_EQC_MSI_INDX_M GENMASK(15, 8) + +#define HNS_ROCE_EQC_CUR_EQE_BA_L_S 16 +#define HNS_ROCE_EQC_CUR_EQE_BA_L_M GENMASK(31, 16) + +/* WORD7 */ +#define HNS_ROCE_EQC_CUR_EQE_BA_M_S 0 +#define HNS_ROCE_EQC_CUR_EQE_BA_M_M GENMASK(31, 0) + +/* WORD8 */ +#define HNS_ROCE_EQC_CUR_EQE_BA_H_S 0 +#define HNS_ROCE_EQC_CUR_EQE_BA_H_M GENMASK(3, 0) + +#define HNS_ROCE_EQC_CONS_INDX_S 8 +#define HNS_ROCE_EQC_CONS_INDX_M GENMASK(31, 8) + +/* WORD9 */ +#define HNS_ROCE_EQC_NXT_EQE_BA_L_S 0 +#define HNS_ROCE_EQC_NXT_EQE_BA_L_M GENMASK(31, 0) + +/* WORD10 */ +#define HNS_ROCE_EQC_NXT_EQE_BA_H_S 0 +#define HNS_ROCE_EQC_NXT_EQE_BA_H_M GENMASK(19, 0) + +#define HNS_ROCE_V2_CEQE_COMP_CQN_S 0 +#define HNS_ROCE_V2_CEQE_COMP_CQN_M GENMASK(23, 0) + +#define HNS_ROCE_V2_AEQE_EVENT_TYPE_S 0 +#define HNS_ROCE_V2_AEQE_EVENT_TYPE_M GENMASK(7, 0) + +#define HNS_ROCE_V2_AEQE_SUB_TYPE_S 8 +#define HNS_ROCE_V2_AEQE_SUB_TYPE_M GENMASK(15, 8) + +#define HNS_ROCE_V2_EQ_DB_CMD_S 16 +#define HNS_ROCE_V2_EQ_DB_CMD_M GENMASK(17, 16) + +#define HNS_ROCE_V2_EQ_DB_TAG_S 0 +#define HNS_ROCE_V2_EQ_DB_TAG_M GENMASK(7, 0) + +#define HNS_ROCE_V2_EQ_DB_PARA_S 0 +#define HNS_ROCE_V2_EQ_DB_PARA_M GENMASK(23, 0) + +#define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S 0 +#define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M GENMASK(23, 0) + #endif diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index cf02ac2d3596..aa0c242ddc50 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -748,12 +748,10 @@ int hns_roce_init(struct hns_roce_dev *hr_dev) goto error_failed_cmd_init; } - if (hr_dev->cmd_mod) { - ret = hns_roce_init_eq_table(hr_dev); - if (ret) { - dev_err(dev, "eq init failed!\n"); - goto error_failed_eq_table; - } + ret = hr_dev->hw->init_eq(hr_dev); + if (ret) { + dev_err(dev, "eq init failed!\n"); + goto error_failed_eq_table; } if (hr_dev->cmd_mod) { @@ -805,8 +803,7 @@ error_failed_init_hem: hns_roce_cmd_use_polling(hr_dev); error_failed_use_event: - if (hr_dev->cmd_mod) - hns_roce_cleanup_eq_table(hr_dev); + hr_dev->hw->cleanup_eq(hr_dev); error_failed_eq_table: hns_roce_cmd_cleanup(hr_dev); @@ -837,8 +834,7 @@ void hns_roce_exit(struct hns_roce_dev *hr_dev) if (hr_dev->cmd_mod) hns_roce_cmd_use_polling(hr_dev); - if (hr_dev->cmd_mod) - hns_roce_cleanup_eq_table(hr_dev); + hr_dev->hw->cleanup_eq(hr_dev); hns_roce_cmd_cleanup(hr_dev); if (hr_dev->hw->cmq_exit) hr_dev->hw->cmq_exit(hr_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 49586ec8126a..4414cea9ef56 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -65,6 +65,7 @@ void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type) if (atomic_dec_and_test(&qp->refcount)) complete(&qp->free); } +EXPORT_SYMBOL_GPL(hns_roce_qp_event); static void hns_roce_ib_qp_event(struct hns_roce_qp *hr_qp, enum hns_roce_event type) @@ -454,6 +455,13 @@ static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev, hr_qp->sge.sge_shift = 4; } + /* ud sqwqe's sge use extend sge */ + if (hr_dev->caps.max_sq_sg > 2 && hr_qp->ibqp.qp_type == IB_QPT_GSI) { + hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt * + hr_qp->sq.max_gs); + hr_qp->sge.sge_shift = 4; + } + /* Get buf size, SQ and RQ are aligned to PAGE_SIZE */ page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); hr_qp->sq.offset = 0; @@ -493,6 +501,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, int ret = 0; u32 page_shift; u32 npages; + int i; mutex_init(&hr_qp->mutex); spin_lock_init(&hr_qp->sq.lock); @@ -500,6 +509,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, hr_qp->state = IB_QPS_RESET; + hr_qp->ibqp.qp_type = init_attr->qp_type; + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) hr_qp->sq_signal_bits = IB_SIGNAL_ALL_WR; else @@ -512,18 +523,48 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, goto err_out; } + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) { + /* allocate recv inline buf */ + hr_qp->rq_inl_buf.wqe_list = kcalloc(hr_qp->rq.wqe_cnt, + sizeof(struct hns_roce_rinl_wqe), + GFP_KERNEL); + if (!hr_qp->rq_inl_buf.wqe_list) { + ret = -ENOMEM; + goto err_out; + } + + hr_qp->rq_inl_buf.wqe_cnt = hr_qp->rq.wqe_cnt; + + /* Firstly, allocate a list of sge space buffer */ + hr_qp->rq_inl_buf.wqe_list[0].sg_list = + kcalloc(hr_qp->rq_inl_buf.wqe_cnt, + init_attr->cap.max_recv_sge * + sizeof(struct hns_roce_rinl_sge), + GFP_KERNEL); + if (!hr_qp->rq_inl_buf.wqe_list[0].sg_list) { + ret = -ENOMEM; + goto err_wqe_list; + } + + for (i = 1; i < hr_qp->rq_inl_buf.wqe_cnt; i++) + /* Secondly, reallocate the buffer */ + hr_qp->rq_inl_buf.wqe_list[i].sg_list = + &hr_qp->rq_inl_buf.wqe_list[0].sg_list[i * + init_attr->cap.max_recv_sge]; + } + if (ib_pd->uobject) { if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { dev_err(dev, "ib_copy_from_udata error for create qp\n"); ret = -EFAULT; - goto err_out; + goto err_rq_sge_list; } ret = hns_roce_set_user_sq_size(hr_dev, &init_attr->cap, hr_qp, &ucmd); if (ret) { dev_err(dev, "hns_roce_set_user_sq_size error for create qp\n"); - goto err_out; + goto err_rq_sge_list; } hr_qp->umem = ib_umem_get(ib_pd->uobject->context, @@ -532,7 +573,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, if (IS_ERR(hr_qp->umem)) { dev_err(dev, "ib_umem_get error for create qp\n"); ret = PTR_ERR(hr_qp->umem); - goto err_out; + goto err_rq_sge_list; } hr_qp->mtt.mtt_type = MTT_TYPE_WQE; @@ -566,13 +607,13 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { dev_err(dev, "init_attr->create_flags error!\n"); ret = -EINVAL; - goto err_out; + goto err_rq_sge_list; } if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) { dev_err(dev, "init_attr->create_flags error!\n"); ret = -EINVAL; - goto err_out; + goto err_rq_sge_list; } /* Set SQ size */ @@ -580,7 +621,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, hr_qp); if (ret) { dev_err(dev, "hns_roce_set_kernel_sq_size error!\n"); - goto err_out; + goto err_rq_sge_list; } /* QP doorbell register address */ @@ -596,7 +637,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, &hr_qp->hr_buf, page_shift)) { dev_err(dev, "hns_roce_buf_alloc error!\n"); ret = -ENOMEM; - goto err_out; + goto err_rq_sge_list; } hr_qp->mtt.mtt_type = MTT_TYPE_WQE; @@ -678,6 +719,14 @@ err_buf: else hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf); +err_rq_sge_list: + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) + kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list); + +err_wqe_list: + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) + kfree(hr_qp->rq_inl_buf.wqe_list); + err_out: return ret; } @@ -724,8 +773,13 @@ struct ib_qp *hns_roce_create_qp(struct ib_pd *pd, hr_qp = &hr_sqp->hr_qp; hr_qp->port = init_attr->port_num - 1; hr_qp->phy_port = hr_dev->iboe.phy_port[hr_qp->port]; - hr_qp->ibqp.qp_num = HNS_ROCE_MAX_PORTS + - hr_dev->iboe.phy_port[hr_qp->port]; + + /* when hw version is v1, the sqpn is allocated */ + if (hr_dev->caps.max_sq_sg <= 2) + hr_qp->ibqp.qp_num = HNS_ROCE_MAX_PORTS + + hr_dev->iboe.phy_port[hr_qp->port]; + else + hr_qp->ibqp.qp_num = 1; ret = hns_roce_create_qp_common(hr_dev, pd, init_attr, udata, hr_qp->ibqp.qp_num, hr_qp); diff --git a/drivers/infiniband/hw/i40iw/Kconfig b/drivers/infiniband/hw/i40iw/Kconfig index f6d20ba88c03..2962979c06e9 100644 --- a/drivers/infiniband/hw/i40iw/Kconfig +++ b/drivers/infiniband/hw/i40iw/Kconfig @@ -5,4 +5,3 @@ config INFINIBAND_I40IW select GENERIC_ALLOCATOR ---help--- Intel(R) Ethernet X722 iWARP Driver - INET && I40IW && INFINIBAND && I40E diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h index 4ae9131b6350..bcddd7061fc0 100644 --- a/drivers/infiniband/hw/i40iw/i40iw.h +++ b/drivers/infiniband/hw/i40iw/i40iw.h @@ -587,5 +587,8 @@ int i40iw_inet6addr_event(struct notifier_block *notifier, int i40iw_net_event(struct notifier_block *notifier, unsigned long event, void *ptr); +int i40iw_netdevice_event(struct notifier_block *notifier, + unsigned long event, + void *ptr); #endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 77870f9e1736..abf4cd897849 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -92,14 +92,9 @@ void i40iw_free_sqbuf(struct i40iw_sc_vsi *vsi, void *bufp) static u8 i40iw_derive_hw_ird_setting(u16 cm_ird) { u8 encoded_ird_size; - u8 pof2_cm_ird = 1; - - /* round-off to next powerof2 */ - while (pof2_cm_ird < cm_ird) - pof2_cm_ird *= 2; /* ird_size field is encoded in qp_ctx */ - switch (pof2_cm_ird) { + switch (cm_ird ? roundup_pow_of_two(cm_ird) : 0) { case I40IW_HW_IRD_SETTING_64: encoded_ird_size = 3; break; @@ -125,13 +120,16 @@ static u8 i40iw_derive_hw_ird_setting(u16 cm_ird) * @conn_ird: connection IRD * @conn_ord: connection ORD */ -static void i40iw_record_ird_ord(struct i40iw_cm_node *cm_node, u16 conn_ird, u16 conn_ord) +static void i40iw_record_ird_ord(struct i40iw_cm_node *cm_node, u32 conn_ird, + u32 conn_ord) { if (conn_ird > I40IW_MAX_IRD_SIZE) conn_ird = I40IW_MAX_IRD_SIZE; if (conn_ord > I40IW_MAX_ORD_SIZE) conn_ord = I40IW_MAX_ORD_SIZE; + else if (!conn_ord && cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO) + conn_ord = 1; cm_node->ird_size = conn_ird; cm_node->ord_size = conn_ord; @@ -2878,15 +2876,13 @@ static struct i40iw_cm_listener *i40iw_make_listen_node( * i40iw_create_cm_node - make a connection node with params * @cm_core: cm's core * @iwdev: iwarp device structure - * @private_data_len: len to provate data for mpa request - * @private_data: pointer to private data for connection + * @conn_param: upper layer connection parameters * @cm_info: quad info for connection */ static struct i40iw_cm_node *i40iw_create_cm_node( struct i40iw_cm_core *cm_core, struct i40iw_device *iwdev, - u16 private_data_len, - void *private_data, + struct iw_cm_conn_param *conn_param, struct i40iw_cm_info *cm_info) { struct i40iw_cm_node *cm_node; @@ -2894,6 +2890,9 @@ static struct i40iw_cm_node *i40iw_create_cm_node( struct i40iw_cm_node *loopback_remotenode; struct i40iw_cm_info loopback_cm_info; + u16 private_data_len = conn_param->private_data_len; + const void *private_data = conn_param->private_data; + /* create a CM connection node */ cm_node = i40iw_make_cm_node(cm_core, iwdev, cm_info, NULL); if (!cm_node) @@ -2902,6 +2901,8 @@ static struct i40iw_cm_node *i40iw_create_cm_node( cm_node->tcp_cntxt.client = 1; cm_node->tcp_cntxt.rcv_wscale = I40IW_CM_DEFAULT_RCV_WND_SCALE; + i40iw_record_ird_ord(cm_node, conn_param->ird, conn_param->ord); + if (!memcmp(cm_info->loc_addr, cm_info->rem_addr, sizeof(cm_info->loc_addr))) { loopback_remotelistener = i40iw_find_listener( cm_core, @@ -2935,6 +2936,10 @@ static struct i40iw_cm_node *i40iw_create_cm_node( private_data_len); loopback_remotenode->pdata.size = private_data_len; + if (loopback_remotenode->ord_size > cm_node->ird_size) + loopback_remotenode->ord_size = + cm_node->ird_size; + cm_node->state = I40IW_CM_STATE_OFFLOADED; cm_node->tcp_cntxt.rcv_nxt = loopback_remotenode->tcp_cntxt.loc_seq_num; @@ -3691,7 +3696,7 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_node->qhash_set = false; i40iw_modify_qp(&iwqp->ibqp, &attr, IB_QP_STATE, NULL); - cm_node->accelerated = 1; + cm_node->accelerated = true; status = i40iw_send_cm_event(cm_node, cm_id, IW_CM_EVENT_ESTABLISHED, 0); if (status) @@ -3815,9 +3820,7 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) __func__, cm_id->tos, cm_info.user_pri); cm_id->add_ref(cm_id); cm_node = i40iw_create_cm_node(&iwdev->cm_core, iwdev, - conn_param->private_data_len, - (void *)conn_param->private_data, - &cm_info); + conn_param, &cm_info); if (IS_ERR(cm_node)) { ret = PTR_ERR(cm_node); @@ -3849,11 +3852,6 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) } cm_node->apbvt_set = true; - i40iw_record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); - if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO && - !cm_node->ord_size) - cm_node->ord_size = 1; - iwqp->cm_node = cm_node; cm_node->iwqp = iwqp; iwqp->cm_id = cm_id; @@ -4058,7 +4056,7 @@ static void i40iw_cm_event_connected(struct i40iw_cm_event *event) cm_node->qhash_set = false; i40iw_modify_qp(&iwqp->ibqp, &attr, IB_QP_STATE, NULL); - cm_node->accelerated = 1; + cm_node->accelerated = true; status = i40iw_send_cm_event(cm_node, cm_id, IW_CM_EVENT_CONNECT_REPLY, 0); if (status) @@ -4242,10 +4240,16 @@ set_qhash: } /** - * i40iw_cm_disconnect_all - disconnect all connected qp's + * i40iw_cm_teardown_connections - teardown QPs * @iwdev: device pointer + * @ipaddr: Pointer to IPv4 or IPv6 address + * @ipv4: flag indicating IPv4 when true + * @disconnect_all: flag indicating disconnect all QPs + * teardown QPs where source or destination addr matches ip addr */ -void i40iw_cm_disconnect_all(struct i40iw_device *iwdev) +void i40iw_cm_teardown_connections(struct i40iw_device *iwdev, u32 *ipaddr, + struct i40iw_cm_info *nfo, + bool disconnect_all) { struct i40iw_cm_core *cm_core = &iwdev->cm_core; struct list_head *list_core_temp; @@ -4259,8 +4263,13 @@ void i40iw_cm_disconnect_all(struct i40iw_device *iwdev) spin_lock_irqsave(&cm_core->ht_lock, flags); list_for_each_safe(list_node, list_core_temp, &cm_core->connected_nodes) { cm_node = container_of(list_node, struct i40iw_cm_node, list); - atomic_inc(&cm_node->ref_count); - list_add(&cm_node->connected_entry, &connected_list); + if (disconnect_all || + (nfo->vlan_id == cm_node->vlan_id && + (!memcmp(cm_node->loc_addr, ipaddr, nfo->ipv4 ? 4 : 16) || + !memcmp(cm_node->rem_addr, ipaddr, nfo->ipv4 ? 4 : 16)))) { + atomic_inc(&cm_node->ref_count); + list_add(&cm_node->connected_entry, &connected_list); + } } spin_unlock_irqrestore(&cm_core->ht_lock, flags); @@ -4294,6 +4303,9 @@ void i40iw_if_notify(struct i40iw_device *iwdev, struct net_device *netdev, enum i40iw_quad_hash_manage_type op = ifup ? I40IW_QHASH_MANAGE_TYPE_ADD : I40IW_QHASH_MANAGE_TYPE_DELETE; + nfo.vlan_id = vlan_id; + nfo.ipv4 = ipv4; + /* Disable or enable qhash for listeners */ spin_lock_irqsave(&cm_core->listen_list_lock, flags); list_for_each_entry(listen_node, &cm_core->listen_nodes, list) { @@ -4303,8 +4315,6 @@ void i40iw_if_notify(struct i40iw_device *iwdev, struct net_device *netdev, memcpy(nfo.loc_addr, listen_node->loc_addr, sizeof(nfo.loc_addr)); nfo.loc_port = listen_node->loc_port; - nfo.ipv4 = listen_node->ipv4; - nfo.vlan_id = listen_node->vlan_id; nfo.user_pri = listen_node->user_pri; if (!list_empty(&listen_node->child_listen_list)) { i40iw_qhash_ctrl(iwdev, @@ -4326,7 +4336,7 @@ void i40iw_if_notify(struct i40iw_device *iwdev, struct net_device *netdev, } spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); - /* disconnect any connected qp's on ifdown */ + /* teardown connected qp's on ifdown */ if (!ifup) - i40iw_cm_disconnect_all(iwdev); + i40iw_cm_teardown_connections(iwdev, ipaddr, &nfo, false); } diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h index 0d5840d2c4fc..cf60c451e071 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.h +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.h @@ -276,8 +276,6 @@ struct i40iw_cm_tcp_context { u32 mss; u8 snd_wscale; u8 rcv_wscale; - - struct timeval sent_ts; }; enum i40iw_cm_listener_state { @@ -337,7 +335,7 @@ struct i40iw_cm_node { u16 mpav2_ird_ord; struct iw_cm_id *cm_id; struct list_head list; - int accelerated; + bool accelerated; struct i40iw_cm_listener *listener; int apbvt_set; int accept_pend; @@ -455,5 +453,7 @@ int i40iw_arp_table(struct i40iw_device *iwdev, void i40iw_if_notify(struct i40iw_device *iwdev, struct net_device *netdev, u32 *ipaddr, bool ipv4, bool ifup); -void i40iw_cm_disconnect_all(struct i40iw_device *iwdev); +void i40iw_cm_teardown_connections(struct i40iw_device *iwdev, u32 *ipaddr, + struct i40iw_cm_info *nfo, + bool disconnect_all); #endif /* I40IW_CM_H */ diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c index da9821a10e0d..c74fd3309b93 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c +++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c @@ -1893,8 +1893,6 @@ static enum i40iw_status_code i40iw_sc_get_next_aeqe(struct i40iw_sc_aeq *aeq, static enum i40iw_status_code i40iw_sc_repost_aeq_entries(struct i40iw_sc_dev *dev, u32 count) { - if (count > I40IW_MAX_AEQ_ALLOCATE_COUNT) - return I40IW_ERR_INVALID_SIZE; if (dev->is_pf) i40iw_wr32(dev->hw, I40E_PFPE_AEQALLOC, count); @@ -3872,7 +3870,6 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_ struct i40iw_virt_mem virt_mem; u32 i, mem_size; u32 qpwantedoriginal, qpwanted, mrwanted, pblewanted; - u32 powerof2; u64 sd_needed; u32 loop_count = 0; @@ -3928,8 +3925,10 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_ hmc_info->hmc_obj[I40IW_HMC_IW_APBVT_ENTRY].cnt = 1; hmc_info->hmc_obj[I40IW_HMC_IW_MR].cnt = mrwanted; - hmc_info->hmc_obj[I40IW_HMC_IW_XF].cnt = I40IW_MAX_WQ_ENTRIES * qpwanted; - hmc_info->hmc_obj[I40IW_HMC_IW_Q1].cnt = 4 * I40IW_MAX_IRD_SIZE * qpwanted; + hmc_info->hmc_obj[I40IW_HMC_IW_XF].cnt = + roundup_pow_of_two(I40IW_MAX_WQ_ENTRIES * qpwanted); + hmc_info->hmc_obj[I40IW_HMC_IW_Q1].cnt = + roundup_pow_of_two(2 * I40IW_MAX_IRD_SIZE * qpwanted); hmc_info->hmc_obj[I40IW_HMC_IW_XFFL].cnt = hmc_info->hmc_obj[I40IW_HMC_IW_XF].cnt / hmc_fpm_misc->xf_block_size; hmc_info->hmc_obj[I40IW_HMC_IW_Q1FL].cnt = @@ -3945,16 +3944,10 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_ if ((loop_count > 1000) || ((!(loop_count % 10)) && (qpwanted > qpwantedoriginal * 2 / 3))) { - if (qpwanted > FPM_MULTIPLIER) { - qpwanted -= FPM_MULTIPLIER; - powerof2 = 1; - while (powerof2 < qpwanted) - powerof2 *= 2; - powerof2 /= 2; - qpwanted = powerof2; - } else { - qpwanted /= 2; - } + if (qpwanted > FPM_MULTIPLIER) + qpwanted = roundup_pow_of_two(qpwanted - + FPM_MULTIPLIER); + qpwanted >>= 1; } if (mrwanted > FPM_MULTIPLIER * 10) mrwanted -= FPM_MULTIPLIER * 10; @@ -3962,8 +3955,6 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_ pblewanted -= FPM_MULTIPLIER * 1000; } while (sd_needed > hmc_fpm_misc->max_sds && loop_count < 2000); - sd_needed = i40iw_est_sd(dev, hmc_info); - i40iw_debug(dev, I40IW_DEBUG_HMC, "loop_cnt=%d, sd_needed=%lld, qpcnt = %d, cqcnt=%d, mrcnt=%d, pblecnt=%d\n", loop_count, sd_needed, diff --git a/drivers/infiniband/hw/i40iw/i40iw_d.h b/drivers/infiniband/hw/i40iw/i40iw_d.h index 029083cb81d5..4b65e4140bd7 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_d.h +++ b/drivers/infiniband/hw/i40iw/i40iw_d.h @@ -97,6 +97,7 @@ #define RDMA_OPCODE_MASK 0x0f #define RDMA_READ_REQ_OPCODE 1 #define Q2_BAD_FRAME_OFFSET 72 +#define Q2_FPSN_OFFSET 64 #define CQE_MAJOR_DRV 0x8000 #define I40IW_TERM_SENT 0x01 diff --git a/drivers/infiniband/hw/i40iw/i40iw_hw.c b/drivers/infiniband/hw/i40iw/i40iw_hw.c index e96bdafbcbb3..61540e14e4b9 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_hw.c +++ b/drivers/infiniband/hw/i40iw/i40iw_hw.c @@ -385,6 +385,8 @@ void i40iw_process_aeq(struct i40iw_device *iwdev) iwcq->ibcq.event_handler(&ibevent, iwcq->ibcq.cq_context); } break; + case I40IW_AE_LLP_DOUBT_REACHABILITY: + break; case I40IW_AE_PRIV_OPERATION_DENIED: case I40IW_AE_STAG_ZERO_INVALID: case I40IW_AE_IB_RREQ_AND_Q1_FULL: @@ -403,7 +405,6 @@ void i40iw_process_aeq(struct i40iw_device *iwdev) case I40IW_AE_LLP_SEGMENT_TOO_SMALL: case I40IW_AE_LLP_SYN_RECEIVED: case I40IW_AE_LLP_TOO_MANY_RETRIES: - case I40IW_AE_LLP_DOUBT_REACHABILITY: case I40IW_AE_LCE_QP_CATASTROPHIC: case I40IW_AE_LCE_FUNCTION_CATASTROPHIC: case I40IW_AE_LCE_CQ_CATASTROPHIC: diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index e824296713e2..b08862978de8 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -99,6 +99,10 @@ static struct notifier_block i40iw_net_notifier = { .notifier_call = i40iw_net_event }; +static struct notifier_block i40iw_netdevice_notifier = { + .notifier_call = i40iw_netdevice_event +}; + /** * i40iw_find_i40e_handler - find a handler given a client info * @ldev: pointer to a client info @@ -483,6 +487,7 @@ static enum i40iw_status_code i40iw_create_hmc_objs(struct i40iw_device *iwdev, for (i = 0; i < IW_HMC_OBJ_TYPE_NUM; i++) { info.rsrc_type = iw_hmc_obj_types[i]; info.count = dev->hmc_info->hmc_obj[info.rsrc_type].cnt; + info.add_sd_cnt = 0; status = i40iw_create_hmc_obj_type(dev, &info); if (status) { i40iw_pr_err("create obj type %d status = %d\n", @@ -607,7 +612,7 @@ static enum i40iw_status_code i40iw_create_cqp(struct i40iw_device *iwdev) INIT_LIST_HEAD(&cqp->cqp_avail_reqs); INIT_LIST_HEAD(&cqp->cqp_pending_reqs); /* init the waitq of the cqp_requests and add them to the list */ - for (i = 0; i < I40IW_CQP_SW_SQSIZE_2048; i++) { + for (i = 0; i < sqsize; i++) { init_waitqueue_head(&cqp->cqp_requests[i].waitq); list_add_tail(&cqp->cqp_requests[i].list, &cqp->cqp_avail_reqs); } @@ -1285,7 +1290,7 @@ static void i40iw_wait_pe_ready(struct i40iw_hw *hw) __LINE__, statuscpu2); if ((statuscpu0 == 0x80) && (statuscpu1 == 0x80) && (statuscpu2 == 0x80)) break; /* SUCCESS */ - mdelay(1000); + msleep(1000); retrycount++; } while (retrycount < 14); i40iw_wr32(hw, 0xb4040, 0x4C104C5); @@ -1393,6 +1398,7 @@ static void i40iw_register_notifiers(void) register_inetaddr_notifier(&i40iw_inetaddr_notifier); register_inet6addr_notifier(&i40iw_inetaddr6_notifier); register_netevent_notifier(&i40iw_net_notifier); + register_netdevice_notifier(&i40iw_netdevice_notifier); } /** @@ -1404,6 +1410,7 @@ static void i40iw_unregister_notifiers(void) unregister_netevent_notifier(&i40iw_net_notifier); unregister_inetaddr_notifier(&i40iw_inetaddr_notifier); unregister_inet6addr_notifier(&i40iw_inetaddr6_notifier); + unregister_netdevice_notifier(&i40iw_netdevice_notifier); } /** @@ -1793,7 +1800,7 @@ static void i40iw_close(struct i40e_info *ldev, struct i40e_client *client, bool if (reset) iwdev->reset = true; - i40iw_cm_disconnect_all(iwdev); + i40iw_cm_teardown_connections(iwdev, NULL, NULL, true); destroy_workqueue(iwdev->virtchnl_wq); i40iw_deinit_device(iwdev); } diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.c b/drivers/infiniband/hw/i40iw/i40iw_puda.c index 796a815b53fd..4c21197830b3 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_puda.c +++ b/drivers/infiniband/hw/i40iw/i40iw_puda.c @@ -48,7 +48,6 @@ static void i40iw_ieq_tx_compl(struct i40iw_sc_vsi *vsi, void *sqwrid); static void i40iw_ilq_putback_rcvbuf(struct i40iw_sc_qp *qp, u32 wqe_idx); static enum i40iw_status_code i40iw_puda_replenish_rq(struct i40iw_puda_rsrc *rsrc, bool initial); -static void i40iw_ieq_cleanup_qp(struct i40iw_puda_rsrc *ieq, struct i40iw_sc_qp *qp); /** * i40iw_puda_get_listbuf - get buffer from puda list * @list: list to use for buffers (ILQ or IEQ) @@ -1378,7 +1377,7 @@ static void i40iw_ieq_handle_exception(struct i40iw_puda_rsrc *ieq, u32 *hw_host_ctx = (u32 *)qp->hw_host_ctx; u32 rcv_wnd = hw_host_ctx[23]; /* first partial seq # in q2 */ - u32 fps = qp->q2_buf[16]; + u32 fps = *(u32 *)(qp->q2_buf + Q2_FPSN_OFFSET); struct list_head *rxlist = &pfpdu->rxlist; struct list_head *plist; @@ -1483,7 +1482,7 @@ static void i40iw_ieq_tx_compl(struct i40iw_sc_vsi *vsi, void *sqwrid) * @ieq: ieq resource * @qp: all pending fpdu buffers */ -static void i40iw_ieq_cleanup_qp(struct i40iw_puda_rsrc *ieq, struct i40iw_sc_qp *qp) +void i40iw_ieq_cleanup_qp(struct i40iw_puda_rsrc *ieq, struct i40iw_sc_qp *qp) { struct i40iw_puda_buf *buf; struct i40iw_pfpdu *pfpdu = &qp->pfpdu; diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.h b/drivers/infiniband/hw/i40iw/i40iw_puda.h index 660aa3edae56..53a7d58c84b5 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_puda.h +++ b/drivers/infiniband/hw/i40iw/i40iw_puda.h @@ -184,4 +184,5 @@ enum i40iw_status_code i40iw_cqp_qp_create_cmd(struct i40iw_sc_dev *dev, struct enum i40iw_status_code i40iw_cqp_cq_create_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_cq *cq); void i40iw_cqp_qp_destroy_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp); void i40iw_cqp_cq_destroy_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_cq *cq); +void i40iw_ieq_cleanup_qp(struct i40iw_puda_rsrc *ieq, struct i40iw_sc_qp *qp); #endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_uk.c b/drivers/infiniband/hw/i40iw/i40iw_uk.c index 3ec5389a81a1..8afa5a67a86b 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_uk.c +++ b/drivers/infiniband/hw/i40iw/i40iw_uk.c @@ -894,20 +894,6 @@ exit: } /** - * i40iw_qp_roundup - return round up QP WQ depth - * @wqdepth: WQ depth in quantas to round up - */ -static int i40iw_qp_round_up(u32 wqdepth) -{ - int scount = 1; - - for (wqdepth--; scount <= 16; scount *= 2) - wqdepth |= wqdepth >> scount; - - return ++wqdepth; -} - -/** * i40iw_get_wqe_shift - get shift count for maximum wqe size * @sge: Maximum Scatter Gather Elements wqe * @inline_data: Maximum inline data size @@ -934,7 +920,7 @@ void i40iw_get_wqe_shift(u32 sge, u32 inline_data, u8 *shift) */ enum i40iw_status_code i40iw_get_sqdepth(u32 sq_size, u8 shift, u32 *sqdepth) { - *sqdepth = i40iw_qp_round_up((sq_size << shift) + I40IW_SQ_RSVD); + *sqdepth = roundup_pow_of_two((sq_size << shift) + I40IW_SQ_RSVD); if (*sqdepth < (I40IW_QP_SW_MIN_WQSIZE << shift)) *sqdepth = I40IW_QP_SW_MIN_WQSIZE << shift; @@ -953,7 +939,7 @@ enum i40iw_status_code i40iw_get_sqdepth(u32 sq_size, u8 shift, u32 *sqdepth) */ enum i40iw_status_code i40iw_get_rqdepth(u32 rq_size, u8 shift, u32 *rqdepth) { - *rqdepth = i40iw_qp_round_up((rq_size << shift) + I40IW_RQ_RSVD); + *rqdepth = roundup_pow_of_two((rq_size << shift) + I40IW_RQ_RSVD); if (*rqdepth < (I40IW_QP_SW_MIN_WQSIZE << shift)) *rqdepth = I40IW_QP_SW_MIN_WQSIZE << shift; diff --git a/drivers/infiniband/hw/i40iw/i40iw_user.h b/drivers/infiniband/hw/i40iw/i40iw_user.h index e73efc59a0ab..b125925641e0 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_user.h +++ b/drivers/infiniband/hw/i40iw/i40iw_user.h @@ -59,7 +59,6 @@ enum i40iw_device_capabilities_const { I40IW_MAX_CEQ_ENTRIES = 131071, I40IW_MIN_CQ_SIZE = 1, I40IW_MAX_CQ_SIZE = 1048575, - I40IW_MAX_AEQ_ALLOCATE_COUNT = 255, I40IW_DB_ID_ZERO = 0, I40IW_MAX_WQ_FRAGMENT_COUNT = 3, I40IW_MAX_SGE_RD = 1, @@ -72,7 +71,7 @@ enum i40iw_device_capabilities_const { I40IW_MAX_SQ_PAYLOAD_SIZE = 2145386496, I40IW_MAX_INLINE_DATA_SIZE = 48, I40IW_MAX_PUSHMODE_INLINE_DATA_SIZE = 48, - I40IW_MAX_IRD_SIZE = 63, + I40IW_MAX_IRD_SIZE = 64, I40IW_MAX_ORD_SIZE = 127, I40IW_MAX_WQ_ENTRIES = 2048, I40IW_Q2_BUFFER_SIZE = (248 + 100), diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c index 8845dba7c438..ddc1056b0b4e 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_utils.c +++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c @@ -137,7 +137,7 @@ inline u32 i40iw_rd32(struct i40iw_hw *hw, u32 reg) } /** - * i40iw_inetaddr_event - system notifier for netdev events + * i40iw_inetaddr_event - system notifier for ipv4 addr events * @notfier: not used * @event: event for notifier * @ptr: if address @@ -200,7 +200,7 @@ int i40iw_inetaddr_event(struct notifier_block *notifier, } /** - * i40iw_inet6addr_event - system notifier for ipv6 netdev events + * i40iw_inet6addr_event - system notifier for ipv6 addr events * @notfier: not used * @event: event for notifier * @ptr: if address @@ -252,7 +252,7 @@ int i40iw_inet6addr_event(struct notifier_block *notifier, } /** - * i40iw_net_event - system notifier for net events + * i40iw_net_event - system notifier for netevents * @notfier: not used * @event: event for notifier * @ptr: neighbor @@ -297,6 +297,50 @@ int i40iw_net_event(struct notifier_block *notifier, unsigned long event, void * } /** + * i40iw_netdevice_event - system notifier for netdev events + * @notfier: not used + * @event: event for notifier + * @ptr: netdev + */ +int i40iw_netdevice_event(struct notifier_block *notifier, + unsigned long event, + void *ptr) +{ + struct net_device *event_netdev; + struct net_device *netdev; + struct i40iw_device *iwdev; + struct i40iw_handler *hdl; + + event_netdev = netdev_notifier_info_to_dev(ptr); + + hdl = i40iw_find_netdev(event_netdev); + if (!hdl) + return NOTIFY_DONE; + + iwdev = &hdl->device; + if (iwdev->init_state < RDMA_DEV_REGISTERED || iwdev->closing) + return NOTIFY_DONE; + + netdev = iwdev->ldev->netdev; + if (netdev != event_netdev) + return NOTIFY_DONE; + + iwdev->iw_status = 1; + + switch (event) { + case NETDEV_DOWN: + iwdev->iw_status = 0; + /* Fall through */ + case NETDEV_UP: + i40iw_port_ibevent(iwdev); + break; + default: + break; + } + return NOTIFY_DONE; +} + +/** * i40iw_get_cqp_request - get cqp struct * @cqp: device cqp ptr * @wait: cqp to be used in wait mode diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 3c6f3ce88f89..70024e8e2692 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -412,6 +412,7 @@ void i40iw_free_qp_resources(struct i40iw_device *iwdev, { struct i40iw_pbl *iwpbl = &iwqp->iwpbl; + i40iw_ieq_cleanup_qp(iwdev->vsi.ieq, &iwqp->sc_qp); i40iw_dealloc_push_page(iwdev, &iwqp->sc_qp); if (qp_num) i40iw_free_resource(iwdev, iwdev->allocated_qps, qp_num); @@ -1637,6 +1638,7 @@ static struct ib_mr *i40iw_alloc_mr(struct ib_pd *pd, err_code = -EOVERFLOW; goto err; } + stag &= ~I40IW_CQPSQ_STAG_KEY_MASK; iwmr->stag = stag; iwmr->ibmr.rkey = stag; iwmr->ibmr.lkey = stag; @@ -2242,14 +2244,12 @@ static int i40iw_post_send(struct ib_qp *ibqp, info.op.inline_rdma_write.len = ib_wr->sg_list[0].length; info.op.inline_rdma_write.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr; info.op.inline_rdma_write.rem_addr.stag = rdma_wr(ib_wr)->rkey; - info.op.inline_rdma_write.rem_addr.len = ib_wr->sg_list->length; ret = ukqp->ops.iw_inline_rdma_write(ukqp, &info, false); } else { info.op.rdma_write.lo_sg_list = (void *)ib_wr->sg_list; info.op.rdma_write.num_lo_sges = ib_wr->num_sge; info.op.rdma_write.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr; info.op.rdma_write.rem_addr.stag = rdma_wr(ib_wr)->rkey; - info.op.rdma_write.rem_addr.len = ib_wr->sg_list->length; ret = ukqp->ops.iw_rdma_write(ukqp, &info, false); } @@ -2271,7 +2271,6 @@ static int i40iw_post_send(struct ib_qp *ibqp, info.op_type = I40IW_OP_TYPE_RDMA_READ; info.op.rdma_read.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr; info.op.rdma_read.rem_addr.stag = rdma_wr(ib_wr)->rkey; - info.op.rdma_read.rem_addr.len = ib_wr->sg_list->length; info.op.rdma_read.lo_addr.tag_off = ib_wr->sg_list->addr; info.op.rdma_read.lo_addr.stag = ib_wr->sg_list->lkey; info.op.rdma_read.lo_addr.len = ib_wr->sg_list->length; diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index bf4f14a1b4fc..9a566ee3ceff 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -170,7 +170,7 @@ err_buf: return err; } -#define CQ_CREATE_FLAGS_SUPPORTED IB_CQ_FLAGS_TIMESTAMP_COMPLETION +#define CQ_CREATE_FLAGS_SUPPORTED IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_ucontext *context, @@ -246,7 +246,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, cq->db.dma, &cq->mcq, vector, 0, - !!(cq->create_flags & IB_CQ_FLAGS_TIMESTAMP_COMPLETION)); + !!(cq->create_flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION)); if (err) goto err_dbmap; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 8c8a16791a3f..8d2ee9322f2e 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -589,6 +589,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, if (props->rss_caps.supported_qpts) { resp.rss_caps.rx_hash_function = MLX4_IB_RX_HASH_FUNC_TOEPLITZ; + resp.rss_caps.rx_hash_fields_mask = MLX4_IB_RX_HASH_SRC_IPV4 | MLX4_IB_RX_HASH_DST_IPV4 | @@ -598,6 +599,11 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, MLX4_IB_RX_HASH_DST_PORT_TCP | MLX4_IB_RX_HASH_SRC_PORT_UDP | MLX4_IB_RX_HASH_DST_PORT_UDP; + + if (dev->dev->caps.tunnel_offload_mode == + MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) + resp.rss_caps.rx_hash_fields_mask |= + MLX4_IB_RX_HASH_INNER; } } @@ -2995,9 +3001,8 @@ err_steer_free_bitmap: kfree(ibdev->ib_uc_qpns_bitmap); err_steer_qp_release: - if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) - mlx4_qp_release_range(dev, ibdev->steer_qpn_base, - ibdev->steer_qpn_count); + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); err_counter: for (i = 0; i < ibdev->num_ports; ++i) mlx4_ib_delete_counters_table(ibdev, &ibdev->counters_table[i]); @@ -3102,11 +3107,9 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) ibdev->iboe.nb.notifier_call = NULL; } - if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) { - mlx4_qp_release_range(dev, ibdev->steer_qpn_base, - ibdev->steer_qpn_count); - kfree(ibdev->ib_uc_qpns_bitmap); - } + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); + kfree(ibdev->ib_uc_qpns_bitmap); iounmap(ibdev->uar_map); for (p = 0; p < ibdev->num_ports; ++p) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index caf490ab24c8..f045491f2c14 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -734,10 +734,24 @@ static int set_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_rss *rss_ctx, return (-EOPNOTSUPP); } + if (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_INNER) { + if (dev->dev->caps.tunnel_offload_mode == + MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { + /* + * Hash according to inner headers if exist, otherwise + * according to outer headers. + */ + rss_ctx->flags |= MLX4_RSS_BY_INNER_HEADERS_IPONLY; + } else { + pr_debug("RSS Hash for inner headers isn't supported\n"); + return (-EOPNOTSUPP); + } + } + return 0; } -static int create_qp_rss(struct mlx4_ib_dev *dev, struct ib_pd *ibpd, +static int create_qp_rss(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *init_attr, struct mlx4_ib_create_qp_rss *ucmd, struct mlx4_ib_qp *qp) @@ -860,7 +874,7 @@ static struct ib_qp *_mlx4_ib_create_qp_rss(struct ib_pd *pd, qp->pri.vid = 0xFFFF; qp->alt.vid = 0xFFFF; - err = create_qp_rss(to_mdev(pd->device), pd, init_attr, &ucmd, qp); + err = create_qp_rss(to_mdev(pd->device), init_attr, &ucmd, qp); if (err) { kfree(qp); return ERR_PTR(err); @@ -1836,6 +1850,8 @@ static int _mlx4_set_path(struct mlx4_ib_dev *dev, mlx4_ib_gid_index_to_real_index(dev, port, grh->sgid_index); + if (real_sgid_index < 0) + return real_sgid_index; if (real_sgid_index >= dev->dev->caps.gid_table_len[port]) { pr_err("sgid_index (%u) too large. max is %d\n", real_sgid_index, dev->dev->caps.gid_table_len[port] - 1); diff --git a/drivers/infiniband/hw/mlx5/cong.c b/drivers/infiniband/hw/mlx5/cong.c index 2d32b519bb61..985fa2637390 100644 --- a/drivers/infiniband/hw/mlx5/cong.c +++ b/drivers/infiniband/hw/mlx5/cong.c @@ -247,21 +247,30 @@ static void mlx5_ib_set_cc_param_mask_val(void *field, int offset, } } -static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, int offset, u32 *var) +static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, u8 port_num, + int offset, u32 *var) { int outlen = MLX5_ST_SZ_BYTES(query_cong_params_out); void *out; void *field; int err; enum mlx5_ib_cong_node_type node; + struct mlx5_core_dev *mdev; + + /* Takes a 1-based port number */ + mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL); + if (!mdev) + return -ENODEV; out = kvzalloc(outlen, GFP_KERNEL); - if (!out) - return -ENOMEM; + if (!out) { + err = -ENOMEM; + goto alloc_err; + } node = mlx5_ib_param_to_node(offset); - err = mlx5_cmd_query_cong_params(dev->mdev, node, out, outlen); + err = mlx5_cmd_query_cong_params(mdev, node, out, outlen); if (err) goto free; @@ -270,21 +279,32 @@ static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, int offset, u32 *var) free: kvfree(out); +alloc_err: + mlx5_ib_put_native_port_mdev(dev, port_num + 1); return err; } -static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, int offset, u32 var) +static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, u8 port_num, + int offset, u32 var) { int inlen = MLX5_ST_SZ_BYTES(modify_cong_params_in); void *in; void *field; enum mlx5_ib_cong_node_type node; + struct mlx5_core_dev *mdev; u32 attr_mask = 0; int err; + /* Takes a 1-based port number */ + mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL); + if (!mdev) + return -ENODEV; + in = kvzalloc(inlen, GFP_KERNEL); - if (!in) - return -ENOMEM; + if (!in) { + err = -ENOMEM; + goto alloc_err; + } MLX5_SET(modify_cong_params_in, in, opcode, MLX5_CMD_OP_MODIFY_CONG_PARAMS); @@ -299,8 +319,10 @@ static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, int offset, u32 var) MLX5_SET(field_select_r_roce_rp, field, field_select_r_roce_rp, attr_mask); - err = mlx5_cmd_modify_cong_params(dev->mdev, in, inlen); + err = mlx5_cmd_modify_cong_params(mdev, in, inlen); kvfree(in); +alloc_err: + mlx5_ib_put_native_port_mdev(dev, port_num + 1); return err; } @@ -324,7 +346,7 @@ static ssize_t set_param(struct file *filp, const char __user *buf, if (kstrtou32(lbuf, 0, &var)) return -EINVAL; - ret = mlx5_ib_set_cc_params(param->dev, offset, var); + ret = mlx5_ib_set_cc_params(param->dev, param->port_num, offset, var); return ret ? ret : count; } @@ -340,7 +362,7 @@ static ssize_t get_param(struct file *filp, char __user *buf, size_t count, if (*pos) return 0; - ret = mlx5_ib_get_cc_params(param->dev, offset, &var); + ret = mlx5_ib_get_cc_params(param->dev, param->port_num, offset, &var); if (ret) return ret; @@ -362,44 +384,51 @@ static const struct file_operations dbg_cc_fops = { .read = get_param, }; -void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev) +void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num) { if (!mlx5_debugfs_root || - !dev->dbg_cc_params || - !dev->dbg_cc_params->root) + !dev->port[port_num].dbg_cc_params || + !dev->port[port_num].dbg_cc_params->root) return; - debugfs_remove_recursive(dev->dbg_cc_params->root); - kfree(dev->dbg_cc_params); - dev->dbg_cc_params = NULL; + debugfs_remove_recursive(dev->port[port_num].dbg_cc_params->root); + kfree(dev->port[port_num].dbg_cc_params); + dev->port[port_num].dbg_cc_params = NULL; } -int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev) +int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num) { struct mlx5_ib_dbg_cc_params *dbg_cc_params; + struct mlx5_core_dev *mdev; int i; if (!mlx5_debugfs_root) goto out; - if (!MLX5_CAP_GEN(dev->mdev, cc_query_allowed) || - !MLX5_CAP_GEN(dev->mdev, cc_modify_allowed)) + /* Takes a 1-based port number */ + mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL); + if (!mdev) goto out; + if (!MLX5_CAP_GEN(mdev, cc_query_allowed) || + !MLX5_CAP_GEN(mdev, cc_modify_allowed)) + goto put_mdev; + dbg_cc_params = kzalloc(sizeof(*dbg_cc_params), GFP_KERNEL); if (!dbg_cc_params) - goto out; + goto err; - dev->dbg_cc_params = dbg_cc_params; + dev->port[port_num].dbg_cc_params = dbg_cc_params; dbg_cc_params->root = debugfs_create_dir("cc_params", - dev->mdev->priv.dbg_root); + mdev->priv.dbg_root); if (!dbg_cc_params->root) goto err; for (i = 0; i < MLX5_IB_DBG_CC_MAX; i++) { dbg_cc_params->params[i].offset = i; dbg_cc_params->params[i].dev = dev; + dbg_cc_params->params[i].port_num = port_num; dbg_cc_params->params[i].dentry = debugfs_create_file(mlx5_ib_dbg_cc_name[i], 0600, dbg_cc_params->root, @@ -408,11 +437,17 @@ int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev) if (!dbg_cc_params->params[i].dentry) goto err; } -out: return 0; + +put_mdev: + mlx5_ib_put_native_port_mdev(dev, port_num + 1); +out: + return 0; err: mlx5_ib_warn(dev, "cong debugfs failure\n"); - mlx5_ib_cleanup_cong_debugfs(dev); + mlx5_ib_cleanup_cong_debugfs(dev, port_num); + mlx5_ib_put_native_port_mdev(dev, port_num + 1); + /* * We don't want to fail driver if debugfs failed to initialize, * so we are not forwarding error to the user. diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 18705cbcdc8c..5b974fb97611 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -1010,7 +1010,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, MLX5_SET(cqc, cqc, uar_page, index); MLX5_SET(cqc, cqc, c_eqn, eqn); MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma); - if (cq->create_flags & IB_CQ_FLAGS_IGNORE_OVERRUN) + if (cq->create_flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN) MLX5_SET(cqc, cqc, oi, 1); err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen); diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 1003b0133a49..32a9e9228b13 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -197,10 +197,9 @@ static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt, vl_15_dropped); } -static int process_pma_cmd(struct ib_device *ibdev, u8 port_num, +static int process_pma_cmd(struct mlx5_core_dev *mdev, u8 port_num, const struct ib_mad *in_mad, struct ib_mad *out_mad) { - struct mlx5_ib_dev *dev = to_mdev(ibdev); int err; void *out_cnt; @@ -222,7 +221,7 @@ static int process_pma_cmd(struct ib_device *ibdev, u8 port_num, if (!out_cnt) return IB_MAD_RESULT_FAILURE; - err = mlx5_core_query_vport_counter(dev->mdev, 0, 0, + err = mlx5_core_query_vport_counter(mdev, 0, 0, port_num, out_cnt, sz); if (!err) pma_cnt_ext_assign(pma_cnt_ext, out_cnt); @@ -235,7 +234,7 @@ static int process_pma_cmd(struct ib_device *ibdev, u8 port_num, if (!out_cnt) return IB_MAD_RESULT_FAILURE; - err = mlx5_core_query_ib_ppcnt(dev->mdev, port_num, + err = mlx5_core_query_ib_ppcnt(mdev, port_num, out_cnt, sz); if (!err) pma_cnt_assign(pma_cnt, out_cnt); @@ -255,9 +254,11 @@ int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, u16 *out_mad_pkey_index) { struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_core_dev *mdev = dev->mdev; const struct ib_mad *in_mad = (const struct ib_mad *)in; struct ib_mad *out_mad = (struct ib_mad *)out; + struct mlx5_core_dev *mdev; + u8 mdev_port_num; + int ret; if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || *out_mad_size != sizeof(*out_mad))) @@ -265,14 +266,20 @@ int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, memset(out_mad->data, 0, sizeof(out_mad->data)); + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); + if (!mdev) + return IB_MAD_RESULT_FAILURE; + if (MLX5_CAP_GEN(mdev, vport_counters) && in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT && in_mad->mad_hdr.method == IB_MGMT_METHOD_GET) { - return process_pma_cmd(ibdev, port_num, in_mad, out_mad); + ret = process_pma_cmd(mdev, mdev_port_num, in_mad, out_mad); } else { - return process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, + ret = process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, in_mad, out_mad); } + mlx5_ib_put_native_port_mdev(dev, port_num); + return ret; } int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port) @@ -519,7 +526,7 @@ int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port, int ext_active_speed; int err = -ENOMEM; - if (port < 1 || port > MLX5_CAP_GEN(mdev, num_ports)) { + if (port < 1 || port > dev->num_ports) { mlx5_ib_warn(dev, "invalid port number %d\n", port); return -EINVAL; } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 262c1aa2e028..4236c8086820 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -50,16 +50,14 @@ #include <rdma/ib_cache.h> #include <linux/mlx5/port.h> #include <linux/mlx5/vport.h> +#include <linux/mlx5/fs.h> #include <linux/list.h> #include <rdma/ib_smi.h> #include <rdma/ib_umem.h> #include <linux/in.h> #include <linux/etherdevice.h> -#include <linux/mlx5/fs.h> -#include <linux/mlx5/vport.h> #include "mlx5_ib.h" #include "cmd.h" -#include <linux/mlx5/vport.h> #define DRIVER_NAME "mlx5_ib" #define DRIVER_VERSION "5.0-0" @@ -72,10 +70,36 @@ static char mlx5_version[] = DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" DRIVER_VERSION "\n"; +struct mlx5_ib_event_work { + struct work_struct work; + struct mlx5_core_dev *dev; + void *context; + enum mlx5_dev_event event; + unsigned long param; +}; + enum { MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3, }; +static struct workqueue_struct *mlx5_ib_event_wq; +static LIST_HEAD(mlx5_ib_unaffiliated_port_list); +static LIST_HEAD(mlx5_ib_dev_list); +/* + * This mutex should be held when accessing either of the above lists + */ +static DEFINE_MUTEX(mlx5_ib_multiport_mutex); + +struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi) +{ + struct mlx5_ib_dev *dev; + + mutex_lock(&mlx5_ib_multiport_mutex); + dev = mpi->ibdev; + mutex_unlock(&mlx5_ib_multiport_mutex); + return dev; +} + static enum rdma_link_layer mlx5_port_type_cap_to_rdma_ll(int port_type_cap) { @@ -115,24 +139,32 @@ static int get_port_state(struct ib_device *ibdev, static int mlx5_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { + struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb); struct net_device *ndev = netdev_notifier_info_to_dev(ptr); - struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev, - roce.nb); + u8 port_num = roce->native_port_num; + struct mlx5_core_dev *mdev; + struct mlx5_ib_dev *ibdev; + + ibdev = roce->dev; + mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL); + if (!mdev) + return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_UNREGISTER: - write_lock(&ibdev->roce.netdev_lock); - if (ndev->dev.parent == &ibdev->mdev->pdev->dev) - ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ? - NULL : ndev; - write_unlock(&ibdev->roce.netdev_lock); + write_lock(&roce->netdev_lock); + + if (ndev->dev.parent == &mdev->pdev->dev) + roce->netdev = (event == NETDEV_UNREGISTER) ? + NULL : ndev; + write_unlock(&roce->netdev_lock); break; case NETDEV_CHANGE: case NETDEV_UP: case NETDEV_DOWN: { - struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(ibdev->mdev); + struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev); struct net_device *upper = NULL; if (lag_ndev) { @@ -140,27 +172,28 @@ static int mlx5_netdev_event(struct notifier_block *this, dev_put(lag_ndev); } - if ((upper == ndev || (!upper && ndev == ibdev->roce.netdev)) + if ((upper == ndev || (!upper && ndev == roce->netdev)) && ibdev->ib_active) { struct ib_event ibev = { }; enum ib_port_state port_state; - if (get_port_state(&ibdev->ib_dev, 1, &port_state)) - return NOTIFY_DONE; + if (get_port_state(&ibdev->ib_dev, port_num, + &port_state)) + goto done; - if (ibdev->roce.last_port_state == port_state) - return NOTIFY_DONE; + if (roce->last_port_state == port_state) + goto done; - ibdev->roce.last_port_state = port_state; + roce->last_port_state = port_state; ibev.device = &ibdev->ib_dev; if (port_state == IB_PORT_DOWN) ibev.event = IB_EVENT_PORT_ERR; else if (port_state == IB_PORT_ACTIVE) ibev.event = IB_EVENT_PORT_ACTIVE; else - return NOTIFY_DONE; + goto done; - ibev.element.port_num = 1; + ibev.element.port_num = port_num; ib_dispatch_event(&ibev); } break; @@ -169,7 +202,8 @@ static int mlx5_netdev_event(struct notifier_block *this, default: break; } - +done: + mlx5_ib_put_native_port_mdev(ibdev, port_num); return NOTIFY_DONE; } @@ -178,22 +212,88 @@ static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, { struct mlx5_ib_dev *ibdev = to_mdev(device); struct net_device *ndev; + struct mlx5_core_dev *mdev; - ndev = mlx5_lag_get_roce_netdev(ibdev->mdev); + mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL); + if (!mdev) + return NULL; + + ndev = mlx5_lag_get_roce_netdev(mdev); if (ndev) - return ndev; + goto out; /* Ensure ndev does not disappear before we invoke dev_hold() */ - read_lock(&ibdev->roce.netdev_lock); - ndev = ibdev->roce.netdev; + read_lock(&ibdev->roce[port_num - 1].netdev_lock); + ndev = ibdev->roce[port_num - 1].netdev; if (ndev) dev_hold(ndev); - read_unlock(&ibdev->roce.netdev_lock); + read_unlock(&ibdev->roce[port_num - 1].netdev_lock); +out: + mlx5_ib_put_native_port_mdev(ibdev, port_num); return ndev; } +struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev, + u8 ib_port_num, + u8 *native_port_num) +{ + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev, + ib_port_num); + struct mlx5_core_dev *mdev = NULL; + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_port *port; + + if (native_port_num) + *native_port_num = 1; + + if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET) + return ibdev->mdev; + + port = &ibdev->port[ib_port_num - 1]; + if (!port) + return NULL; + + spin_lock(&port->mp.mpi_lock); + mpi = ibdev->port[ib_port_num - 1].mp.mpi; + if (mpi && !mpi->unaffiliate) { + mdev = mpi->mdev; + /* If it's the master no need to refcount, it'll exist + * as long as the ib_dev exists. + */ + if (!mpi->is_master) + mpi->mdev_refcnt++; + } + spin_unlock(&port->mp.mpi_lock); + + return mdev; +} + +void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *ibdev, u8 port_num) +{ + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev, + port_num); + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_port *port; + + if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET) + return; + + port = &ibdev->port[port_num - 1]; + + spin_lock(&port->mp.mpi_lock); + mpi = ibdev->port[port_num - 1].mp.mpi; + if (mpi->is_master) + goto out; + + mpi->mdev_refcnt--; + if (mpi->unaffiliate) + complete(&mpi->unref_comp); +out: + spin_unlock(&port->mp.mpi_lock); +} + static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed, u8 *active_width) { @@ -256,19 +356,33 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, struct ib_port_attr *props) { struct mlx5_ib_dev *dev = to_mdev(device); - struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_core_dev *mdev; struct net_device *ndev, *upper; enum ib_mtu ndev_ib_mtu; + bool put_mdev = true; u16 qkey_viol_cntr; u32 eth_prot_oper; + u8 mdev_port_num; int err; + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); + if (!mdev) { + /* This means the port isn't affiliated yet. Get the + * info for the master port instead. + */ + put_mdev = false; + mdev = dev->mdev; + mdev_port_num = 1; + port_num = 1; + } + /* Possible bad flows are checked before filling out props so in case * of an error it will still be zeroed out. */ - err = mlx5_query_port_eth_proto_oper(mdev, ð_prot_oper, port_num); + err = mlx5_query_port_eth_proto_oper(mdev, ð_prot_oper, + mdev_port_num); if (err) - return err; + goto out; translate_eth_proto_oper(eth_prot_oper, &props->active_speed, &props->active_width); @@ -284,12 +398,16 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, props->state = IB_PORT_DOWN; props->phys_state = 3; - mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr); + mlx5_query_nic_vport_qkey_viol_cntr(mdev, &qkey_viol_cntr); props->qkey_viol_cntr = qkey_viol_cntr; + /* If this is a stub query for an unaffiliated port stop here */ + if (!put_mdev) + goto out; + ndev = mlx5_ib_get_netdev(device, port_num); if (!ndev) - return 0; + goto out; if (mlx5_lag_is_active(dev->mdev)) { rcu_read_lock(); @@ -312,7 +430,10 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, dev_put(ndev); props->active_mtu = min(props->max_mtu, ndev_ib_mtu); - return 0; +out: + if (put_mdev) + mlx5_ib_put_native_port_mdev(dev, port_num); + return err; } static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num, @@ -354,7 +475,7 @@ static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num, return mlx5_core_roce_gid_set(dev->mdev, index, roce_version, roce_l3_type, gid->raw, mac, vlan, - vlan_id); + vlan_id, port_num); } static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num, @@ -438,11 +559,11 @@ static int mlx5_get_vport_access_method(struct ib_device *ibdev) } static void get_atomic_caps(struct mlx5_ib_dev *dev, + u8 atomic_size_qp, struct ib_device_attr *props) { u8 tmp; u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); - u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); u8 atomic_req_8B_endianness_mode = MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianness_mode); @@ -459,6 +580,29 @@ static void get_atomic_caps(struct mlx5_ib_dev *dev, } } +static void get_atomic_caps_qp(struct mlx5_ib_dev *dev, + struct ib_device_attr *props) +{ + u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); + + get_atomic_caps(dev, atomic_size_qp, props); +} + +static void get_atomic_caps_dc(struct mlx5_ib_dev *dev, + struct ib_device_attr *props) +{ + u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); + + get_atomic_caps(dev, atomic_size_qp, props); +} + +bool mlx5_ib_dc_atomic_is_supported(struct mlx5_ib_dev *dev) +{ + struct ib_device_attr props = {}; + + get_atomic_caps_dc(dev, &props); + return (props.atomic_cap == IB_ATOMIC_HCA) ? true : false; +} static int mlx5_query_system_image_guid(struct ib_device *ibdev, __be64 *sys_image_guid) { @@ -587,6 +731,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, int max_rq_sg; int max_sq_sg; u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz); + bool raw_support = !mlx5_core_mp_enabled(mdev); struct mlx5_ib_query_device_resp resp = {}; size_t resp_len; u64 max_tso; @@ -650,7 +795,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (MLX5_CAP_GEN(mdev, block_lb_mc)) props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; - if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) { + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && raw_support) { if (MLX5_CAP_ETH(mdev, csum_cap)) { /* Legacy bit to support old userspace libraries */ props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; @@ -682,7 +827,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, MLX5_RX_HASH_SRC_PORT_TCP | MLX5_RX_HASH_DST_PORT_TCP | MLX5_RX_HASH_SRC_PORT_UDP | - MLX5_RX_HASH_DST_PORT_UDP; + MLX5_RX_HASH_DST_PORT_UDP | + MLX5_RX_HASH_INNER; resp.response_length += sizeof(resp.rss_caps); } } else { @@ -698,7 +844,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) && - MLX5_CAP_GEN(dev->mdev, general_notification_event)) + MLX5_CAP_GEN(dev->mdev, general_notification_event) && + raw_support) props->raw_packet_caps |= IB_RAW_PACKET_CAP_DELAY_DROP; if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) && @@ -706,7 +853,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && - MLX5_CAP_ETH(dev->mdev, scatter_fcs)) { + MLX5_CAP_ETH(dev->mdev, scatter_fcs) && + raw_support) { /* Legacy bit to support old userspace libraries */ props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS; props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS; @@ -746,7 +894,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->max_srq_sge = max_rq_sg - 1; props->max_fast_reg_page_list_len = 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size); - get_atomic_caps(dev, props); + get_atomic_caps_qp(dev, props); props->masked_atomic_cap = IB_ATOMIC_NONE; props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg); @@ -770,7 +918,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION; if (mlx5_ib_port_link_layer(ibdev, 1) == - IB_LINK_LAYER_ETHERNET) { + IB_LINK_LAYER_ETHERNET && raw_support) { props->rss_caps.max_rwq_indirection_tables = 1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt); props->rss_caps.max_rwq_indirection_table_size = @@ -807,7 +955,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, resp.response_length += sizeof(resp.cqe_comp_caps); } - if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen)) { + if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen) && + raw_support) { if (MLX5_CAP_QOS(mdev, packet_pacing) && MLX5_CAP_GEN(mdev, qos)) { resp.packet_pacing_caps.qp_rate_limit_max = @@ -866,7 +1015,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (field_avail(typeof(resp), striding_rq_caps, uhw->outlen)) { + if (field_avail(typeof(resp), striding_rq_caps, uhw->outlen) && + raw_support) { resp.response_length += sizeof(resp.striding_rq_caps); if (MLX5_CAP_GEN(mdev, striding_rq)) { resp.striding_rq_caps.min_single_stride_log_num_of_bytes = @@ -1097,7 +1247,22 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, } if (!ret && props) { - count = mlx5_core_reserved_gids_count(to_mdev(ibdev)->mdev); + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev; + bool put_mdev = true; + + mdev = mlx5_ib_get_native_port_mdev(dev, port, NULL); + if (!mdev) { + /* If the port isn't affiliated yet query the master. + * The master and slave will have the same values. + */ + mdev = dev->mdev; + port = 1; + put_mdev = false; + } + count = mlx5_core_reserved_gids_count(mdev); + if (put_mdev) + mlx5_ib_put_native_port_mdev(dev, port); props->gid_tbl_len -= count; } return ret; @@ -1122,20 +1287,43 @@ static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index, } -static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, - u16 *pkey) +static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u8 port, + u16 index, u16 *pkey) { struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_core_dev *mdev; + bool put_mdev = true; + u8 mdev_port_num; + int err; + + mdev = mlx5_ib_get_native_port_mdev(dev, port, &mdev_port_num); + if (!mdev) { + /* The port isn't affiliated yet, get the PKey from the master + * port. For RoCE the PKey tables will be the same. + */ + put_mdev = false; + mdev = dev->mdev; + mdev_port_num = 1; + } + + err = mlx5_query_hca_vport_pkey(mdev, 0, mdev_port_num, 0, + index, pkey); + if (put_mdev) + mlx5_ib_put_native_port_mdev(dev, port); + return err; +} + +static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey); case MLX5_VPORT_ACCESS_METHOD_HCA: case MLX5_VPORT_ACCESS_METHOD_NIC: - return mlx5_query_hca_vport_pkey(mdev, 0, port, 0, index, - pkey); + return mlx5_query_hca_nic_pkey(ibdev, port, index, pkey); default: return -EINVAL; } @@ -1174,23 +1362,32 @@ static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u8 port_num, u32 mask, u32 value) { struct mlx5_hca_vport_context ctx = {}; + struct mlx5_core_dev *mdev; + u8 mdev_port_num; int err; - err = mlx5_query_hca_vport_context(dev->mdev, 0, - port_num, 0, &ctx); + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); + if (!mdev) + return -ENODEV; + + err = mlx5_query_hca_vport_context(mdev, 0, mdev_port_num, 0, &ctx); if (err) - return err; + goto out; if (~ctx.cap_mask1_perm & mask) { mlx5_ib_warn(dev, "trying to change bitmask 0x%X but change supported 0x%X\n", mask, ctx.cap_mask1_perm); - return -EINVAL; + err = -EINVAL; + goto out; } ctx.cap_mask1 = value; ctx.cap_mask1_perm = mask; - err = mlx5_core_modify_hca_vport_context(dev->mdev, 0, - port_num, 0, &ctx); + err = mlx5_core_modify_hca_vport_context(mdev, 0, mdev_port_num, + 0, &ctx); + +out: + mlx5_ib_put_native_port_mdev(dev, port_num); return err; } @@ -1241,9 +1438,18 @@ static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps) caps & MLX5_LIB_CAP_4K_UAR ? "y" : "n"); } +static u16 calc_dynamic_bfregs(int uars_per_sys_page) +{ + /* Large page with non 4k uar support might limit the dynamic size */ + if (uars_per_sys_page == 1 && PAGE_SIZE > 4096) + return MLX5_MIN_DYN_BFREGS; + + return MLX5_MAX_DYN_BFREGS; +} + static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k, struct mlx5_ib_alloc_ucontext_req_v2 *req, - u32 *num_sys_pages) + struct mlx5_bfreg_info *bfregi) { int uars_per_sys_page; int bfregs_per_sys_page; @@ -1260,16 +1466,21 @@ static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k, uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k); bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR; + /* This holds the required static allocation asked by the user */ req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page); - *num_sys_pages = req->total_num_bfregs / bfregs_per_sys_page; - if (req->num_low_latency_bfregs > req->total_num_bfregs - 1) return -EINVAL; - mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, using %d sys pages\n", + bfregi->num_static_sys_pages = req->total_num_bfregs / bfregs_per_sys_page; + bfregi->num_dyn_bfregs = ALIGN(calc_dynamic_bfregs(uars_per_sys_page), bfregs_per_sys_page); + bfregi->total_num_bfregs = req->total_num_bfregs + bfregi->num_dyn_bfregs; + bfregi->num_sys_pages = bfregi->total_num_bfregs / bfregs_per_sys_page; + + mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, total bfregs %d, using %d sys pages\n", MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no", lib_uar_4k ? "yes" : "no", ref_bfregs, - req->total_num_bfregs, *num_sys_pages); + req->total_num_bfregs, bfregi->total_num_bfregs, + bfregi->num_sys_pages); return 0; } @@ -1281,13 +1492,17 @@ static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *conte int i; bfregi = &context->bfregi; - for (i = 0; i < bfregi->num_sys_pages; i++) { + for (i = 0; i < bfregi->num_static_sys_pages; i++) { err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]); if (err) goto error; mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]); } + + for (i = bfregi->num_static_sys_pages; i < bfregi->num_sys_pages; i++) + bfregi->sys_pages[i] = MLX5_IB_INVALID_UAR_INDEX; + return 0; error: @@ -1306,12 +1521,16 @@ static int deallocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *con bfregi = &context->bfregi; for (i = 0; i < bfregi->num_sys_pages; i++) { - err = mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]); - if (err) { - mlx5_ib_warn(dev, "failed to free uar %d\n", i); - return err; + if (i < bfregi->num_static_sys_pages || + bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX) { + err = mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]); + if (err) { + mlx5_ib_warn(dev, "failed to free uar %d, err=%d\n", i, err); + return err; + } } } + return 0; } @@ -1362,6 +1581,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_alloc_ucontext_req_v2 req = {}; struct mlx5_ib_alloc_ucontext_resp resp = {}; + struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_ib_ucontext *context; struct mlx5_bfreg_info *bfregi; int ver; @@ -1422,13 +1642,13 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, bfregi = &context->bfregi; /* updates req->total_num_bfregs */ - err = calc_total_bfregs(dev, lib_uar_4k, &req, &bfregi->num_sys_pages); + err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi); if (err) goto out_ctx; mutex_init(&bfregi->lock); bfregi->lib_uar_4k = lib_uar_4k; - bfregi->count = kcalloc(req.total_num_bfregs, sizeof(*bfregi->count), + bfregi->count = kcalloc(bfregi->total_num_bfregs, sizeof(*bfregi->count), GFP_KERNEL); if (!bfregi->count) { err = -ENOMEM; @@ -1470,7 +1690,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, mutex_init(&context->db_page_mutex); resp.tot_bfregs = req.total_num_bfregs; - resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports); + resp.num_ports = dev->num_ports; if (field_avail(typeof(resp), cqe_version, udata->outlen)) resp.response_length += sizeof(resp.cqe_version); @@ -1489,6 +1709,12 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, resp.response_length += sizeof(resp.eth_min_inline); } + if (field_avail(typeof(resp), clock_info_versions, udata->outlen)) { + if (mdev->clock_info) + resp.clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1); + resp.response_length += sizeof(resp.clock_info_versions); + } + /* * We don't want to expose information from the PCI bar that is located * after 4096 bytes, so if the arch only supports larger pages, let's @@ -1502,8 +1728,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, resp.hca_core_clock_offset = offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE; } - resp.response_length += sizeof(resp.hca_core_clock_offset) + - sizeof(resp.reserved2); + resp.response_length += sizeof(resp.hca_core_clock_offset); } if (field_avail(typeof(resp), log_uar_size, udata->outlen)) @@ -1512,6 +1737,11 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (field_avail(typeof(resp), num_uars_per_page, udata->outlen)) resp.response_length += sizeof(resp.num_uars_per_page); + if (field_avail(typeof(resp), num_dyn_bfregs, udata->outlen)) { + resp.num_dyn_bfregs = bfregi->num_dyn_bfregs; + resp.response_length += sizeof(resp.num_dyn_bfregs); + } + err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) goto out_td; @@ -1566,15 +1796,13 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) } static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, - struct mlx5_bfreg_info *bfregi, - int idx) + int uar_idx) { int fw_uars_per_page; fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1; - return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + - bfregi->sys_pages[idx] / fw_uars_per_page; + return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + uar_idx / fw_uars_per_page; } static int get_command(unsigned long offset) @@ -1592,6 +1820,12 @@ static int get_index(unsigned long offset) return get_arg(offset); } +/* Index resides in an extra byte to enable larger values than 255 */ +static int get_extended_index(unsigned long offset) +{ + return get_arg(offset) | ((offset >> 16) & 0xff) << 8; +} + static void mlx5_ib_vma_open(struct vm_area_struct *area) { /* vma_open is called when a new VMA is created on top of our VMA. This @@ -1733,6 +1967,38 @@ static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) } } +static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, + struct vm_area_struct *vma, + struct mlx5_ib_ucontext *context) +{ + phys_addr_t pfn; + int err; + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1) + return -EOPNOTSUPP; + + if (vma->vm_flags & VM_WRITE) + return -EPERM; + + if (!dev->mdev->clock_info_page) + return -EOPNOTSUPP; + + pfn = page_to_pfn(dev->mdev->clock_info_page); + err = remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, + vma->vm_page_prot); + if (err) + return err; + + mlx5_ib_dbg(dev, "mapped clock info at 0x%lx, PA 0x%llx\n", + vma->vm_start, + (unsigned long long)pfn << PAGE_SHIFT); + + return mlx5_ib_set_vma_data(vma, context); +} + static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, struct vm_area_struct *vma, struct mlx5_ib_ucontext *context) @@ -1742,21 +2008,29 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, unsigned long idx; phys_addr_t pfn, pa; pgprot_t prot; - int uars_per_page; + u32 bfreg_dyn_idx = 0; + u32 uar_index; + int dyn_uar = (cmd == MLX5_IB_MMAP_ALLOC_WC); + int max_valid_idx = dyn_uar ? bfregi->num_sys_pages : + bfregi->num_static_sys_pages; if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; - uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k); - idx = get_index(vma->vm_pgoff); - if (idx % uars_per_page || - idx * uars_per_page >= bfregi->num_sys_pages) { - mlx5_ib_warn(dev, "invalid uar index %lu\n", idx); + if (dyn_uar) + idx = get_extended_index(vma->vm_pgoff) + bfregi->num_static_sys_pages; + else + idx = get_index(vma->vm_pgoff); + + if (idx >= max_valid_idx) { + mlx5_ib_warn(dev, "invalid uar index %lu, max=%d\n", + idx, max_valid_idx); return -EINVAL; } switch (cmd) { case MLX5_IB_MMAP_WC_PAGE: + case MLX5_IB_MMAP_ALLOC_WC: /* Some architectures don't support WC memory */ #if defined(CONFIG_X86) if (!pat_enabled()) @@ -1776,7 +2050,40 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, return -EINVAL; } - pfn = uar_index2pfn(dev, bfregi, idx); + if (dyn_uar) { + int uars_per_page; + + uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k); + bfreg_dyn_idx = idx * (uars_per_page * MLX5_NON_FP_BFREGS_PER_UAR); + if (bfreg_dyn_idx >= bfregi->total_num_bfregs) { + mlx5_ib_warn(dev, "invalid bfreg_dyn_idx %u, max=%u\n", + bfreg_dyn_idx, bfregi->total_num_bfregs); + return -EINVAL; + } + + mutex_lock(&bfregi->lock); + /* Fail if uar already allocated, first bfreg index of each + * page holds its count. + */ + if (bfregi->count[bfreg_dyn_idx]) { + mlx5_ib_warn(dev, "wrong offset, idx %lu is busy, bfregn=%u\n", idx, bfreg_dyn_idx); + mutex_unlock(&bfregi->lock); + return -EINVAL; + } + + bfregi->count[bfreg_dyn_idx]++; + mutex_unlock(&bfregi->lock); + + err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index); + if (err) { + mlx5_ib_warn(dev, "UAR alloc failed\n"); + goto free_bfreg; + } + } else { + uar_index = bfregi->sys_pages[idx]; + } + + pfn = uar_index2pfn(dev, uar_index); mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn); vma->vm_page_prot = prot; @@ -1785,14 +2092,32 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, if (err) { mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%lx, pfn=%pa, mmap_cmd=%s\n", err, vma->vm_start, &pfn, mmap_cmd2str(cmd)); - return -EAGAIN; + err = -EAGAIN; + goto err; } pa = pfn << PAGE_SHIFT; mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd), vma->vm_start, &pa); - return mlx5_ib_set_vma_data(vma, context); + err = mlx5_ib_set_vma_data(vma, context); + if (err) + goto err; + + if (dyn_uar) + bfregi->sys_pages[idx] = uar_index; + return 0; + +err: + if (!dyn_uar) + return err; + + mlx5_cmd_free_uar(dev->mdev, idx); + +free_bfreg: + mlx5_ib_free_bfreg(dev, bfregi, bfreg_dyn_idx); + + return err; } static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) @@ -1807,6 +2132,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm case MLX5_IB_MMAP_WC_PAGE: case MLX5_IB_MMAP_NC_PAGE: case MLX5_IB_MMAP_REGULAR_PAGE: + case MLX5_IB_MMAP_ALLOC_WC: return uar_mmap(dev, command, vma, context); case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: @@ -1835,6 +2161,8 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm vma->vm_start, (unsigned long long)pfn << PAGE_SHIFT); break; + case MLX5_IB_MMAP_CLOCK_INFO: + return mlx5_ib_mmap_clock_info_page(dev, vma, context); default: return -EINVAL; @@ -2663,7 +2991,7 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, return ERR_PTR(-ENOMEM); if (domain != IB_FLOW_DOMAIN_USER || - flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) || + flow_attr->port > dev->num_ports || (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP)) return ERR_PTR(-EINVAL); @@ -2928,15 +3256,24 @@ static void delay_drop_handler(struct work_struct *work) mutex_unlock(&delay_drop->lock); } -static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, - enum mlx5_dev_event event, unsigned long param) +static void mlx5_ib_handle_event(struct work_struct *_work) { - struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context; + struct mlx5_ib_event_work *work = + container_of(_work, struct mlx5_ib_event_work, work); + struct mlx5_ib_dev *ibdev; struct ib_event ibev; bool fatal = false; u8 port = 0; - switch (event) { + if (mlx5_core_is_mp_slave(work->dev)) { + ibdev = mlx5_ib_get_ibdev_from_mpi(work->context); + if (!ibdev) + goto out; + } else { + ibdev = work->context; + } + + switch (work->event) { case MLX5_DEV_EVENT_SYS_ERROR: ibev.event = IB_EVENT_DEVICE_FATAL; mlx5_ib_handle_internal_error(ibdev); @@ -2946,39 +3283,39 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, case MLX5_DEV_EVENT_PORT_UP: case MLX5_DEV_EVENT_PORT_DOWN: case MLX5_DEV_EVENT_PORT_INITIALIZED: - port = (u8)param; + port = (u8)work->param; /* In RoCE, port up/down events are handled in * mlx5_netdev_event(). */ if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) == IB_LINK_LAYER_ETHERNET) - return; + goto out; - ibev.event = (event == MLX5_DEV_EVENT_PORT_UP) ? + ibev.event = (work->event == MLX5_DEV_EVENT_PORT_UP) ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; break; case MLX5_DEV_EVENT_LID_CHANGE: ibev.event = IB_EVENT_LID_CHANGE; - port = (u8)param; + port = (u8)work->param; break; case MLX5_DEV_EVENT_PKEY_CHANGE: ibev.event = IB_EVENT_PKEY_CHANGE; - port = (u8)param; + port = (u8)work->param; schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work); break; case MLX5_DEV_EVENT_GUID_CHANGE: ibev.event = IB_EVENT_GID_CHANGE; - port = (u8)param; + port = (u8)work->param; break; case MLX5_DEV_EVENT_CLIENT_REREG: ibev.event = IB_EVENT_CLIENT_REREGISTER; - port = (u8)param; + port = (u8)work->param; break; case MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT: schedule_work(&ibdev->delay_drop.delay_drop_work); @@ -3000,9 +3337,26 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, if (fatal) ibdev->ib_active = false; - out: - return; + kfree(work); +} + +static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, + enum mlx5_dev_event event, unsigned long param) +{ + struct mlx5_ib_event_work *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (!work) + return; + + INIT_WORK(&work->work, mlx5_ib_handle_event); + work->dev = dev; + work->param = param; + work->context = context; + work->event = event; + + queue_work(mlx5_ib_event_wq, &work->work); } static int set_has_smi_cap(struct mlx5_ib_dev *dev) @@ -3011,7 +3365,7 @@ static int set_has_smi_cap(struct mlx5_ib_dev *dev) int err; int port; - for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) { + for (port = 1; port <= dev->num_ports; port++) { dev->mdev->port_caps[port - 1].has_smi = false; if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB) { @@ -3038,16 +3392,15 @@ static void get_ext_port_caps(struct mlx5_ib_dev *dev) { int port; - for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) + for (port = 1; port <= dev->num_ports; port++) mlx5_query_ext_port_caps(dev, port); } -static int get_port_caps(struct mlx5_ib_dev *dev) +static int get_port_caps(struct mlx5_ib_dev *dev, u8 port) { struct ib_device_attr *dprops = NULL; struct ib_port_attr *pprops = NULL; int err = -ENOMEM; - int port; struct ib_udata uhw = {.inlen = 0, .outlen = 0}; pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); @@ -3068,22 +3421,21 @@ static int get_port_caps(struct mlx5_ib_dev *dev) goto out; } - for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) { - memset(pprops, 0, sizeof(*pprops)); - err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); - if (err) { - mlx5_ib_warn(dev, "query_port %d failed %d\n", - port, err); - break; - } - dev->mdev->port_caps[port - 1].pkey_table_len = - dprops->max_pkeys; - dev->mdev->port_caps[port - 1].gid_table_len = - pprops->gid_tbl_len; - mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n", - dprops->max_pkeys, pprops->gid_tbl_len); + memset(pprops, 0, sizeof(*pprops)); + err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); + if (err) { + mlx5_ib_warn(dev, "query_port %d failed %d\n", + port, err); + goto out; } + dev->mdev->port_caps[port - 1].pkey_table_len = + dprops->max_pkeys; + dev->mdev->port_caps[port - 1].gid_table_len = + pprops->gid_tbl_len; + mlx5_ib_dbg(dev, "port %d: pkey_table_len %d, gid_table_len %d\n", + port, dprops->max_pkeys, pprops->gid_tbl_len); + out: kfree(pprops); kfree(dprops); @@ -3373,12 +3725,14 @@ static u32 get_core_cap_flags(struct ib_device *ibdev) enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1); u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type); u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version); + bool raw_support = !mlx5_core_mp_enabled(dev->mdev); u32 ret = 0; if (ll == IB_LINK_LAYER_INFINIBAND) return RDMA_CORE_PORT_IBA_IB; - ret = RDMA_CORE_PORT_RAW_PACKET; + if (raw_support) + ret = RDMA_CORE_PORT_RAW_PACKET; if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP)) return ret; @@ -3468,33 +3822,33 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev) } } -static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev) +static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num) { int err; - dev->roce.nb.notifier_call = mlx5_netdev_event; - err = register_netdevice_notifier(&dev->roce.nb); + dev->roce[port_num].nb.notifier_call = mlx5_netdev_event; + err = register_netdevice_notifier(&dev->roce[port_num].nb); if (err) { - dev->roce.nb.notifier_call = NULL; + dev->roce[port_num].nb.notifier_call = NULL; return err; } return 0; } -static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev) +static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num) { - if (dev->roce.nb.notifier_call) { - unregister_netdevice_notifier(&dev->roce.nb); - dev->roce.nb.notifier_call = NULL; + if (dev->roce[port_num].nb.notifier_call) { + unregister_netdevice_notifier(&dev->roce[port_num].nb); + dev->roce[port_num].nb.notifier_call = NULL; } } -static int mlx5_enable_eth(struct mlx5_ib_dev *dev) +static int mlx5_enable_eth(struct mlx5_ib_dev *dev, u8 port_num) { int err; - err = mlx5_add_netdev_notifier(dev); + err = mlx5_add_netdev_notifier(dev, port_num); if (err) return err; @@ -3515,7 +3869,7 @@ err_disable_roce: mlx5_nic_vport_disable_roce(dev->mdev); err_unregister_netdevice_notifier: - mlx5_remove_netdev_notifier(dev); + mlx5_remove_netdev_notifier(dev, port_num); return err; } @@ -3577,11 +3931,12 @@ static const struct mlx5_ib_counter extended_err_cnts[] = { static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) { - unsigned int i; + int i; for (i = 0; i < dev->num_ports; i++) { - mlx5_core_dealloc_q_counter(dev->mdev, - dev->port[i].cnts.set_id); + if (dev->port[i].cnts.set_id) + mlx5_core_dealloc_q_counter(dev->mdev, + dev->port[i].cnts.set_id); kfree(dev->port[i].cnts.names); kfree(dev->port[i].cnts.offsets); } @@ -3623,6 +3978,7 @@ static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev, err_names: kfree(cnts->names); + cnts->names = NULL; return -ENOMEM; } @@ -3669,37 +4025,33 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev) { + int err = 0; int i; - int ret; for (i = 0; i < dev->num_ports; i++) { - struct mlx5_ib_port *port = &dev->port[i]; + err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts); + if (err) + goto err_alloc; + + mlx5_ib_fill_counters(dev, dev->port[i].cnts.names, + dev->port[i].cnts.offsets); - ret = mlx5_core_alloc_q_counter(dev->mdev, - &port->cnts.set_id); - if (ret) { + err = mlx5_core_alloc_q_counter(dev->mdev, + &dev->port[i].cnts.set_id); + if (err) { mlx5_ib_warn(dev, "couldn't allocate queue counter for port %d, err %d\n", - i + 1, ret); - goto dealloc_counters; + i + 1, err); + goto err_alloc; } - - ret = __mlx5_ib_alloc_counters(dev, &port->cnts); - if (ret) - goto dealloc_counters; - - mlx5_ib_fill_counters(dev, port->cnts.names, - port->cnts.offsets); + dev->port[i].cnts.set_id_valid = true; } return 0; -dealloc_counters: - while (--i >= 0) - mlx5_core_dealloc_q_counter(dev->mdev, - dev->port[i].cnts.set_id); - - return ret; +err_alloc: + mlx5_ib_dealloc_counters(dev); + return err; } static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev, @@ -3718,7 +4070,7 @@ static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev, RDMA_HW_STATS_DEFAULT_LIFESPAN); } -static int mlx5_ib_query_q_counters(struct mlx5_ib_dev *dev, +static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev, struct mlx5_ib_port *port, struct rdma_hw_stats *stats) { @@ -3731,7 +4083,7 @@ static int mlx5_ib_query_q_counters(struct mlx5_ib_dev *dev, if (!out) return -ENOMEM; - ret = mlx5_core_query_q_counter(dev->mdev, + ret = mlx5_core_query_q_counter(mdev, port->cnts.set_id, 0, out, outlen); if (ret) @@ -3753,28 +4105,43 @@ static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_port *port = &dev->port[port_num - 1]; + struct mlx5_core_dev *mdev; int ret, num_counters; + u8 mdev_port_num; if (!stats) return -EINVAL; - ret = mlx5_ib_query_q_counters(dev, port, stats); + num_counters = port->cnts.num_q_counters + port->cnts.num_cong_counters; + + /* q_counters are per IB device, query the master mdev */ + ret = mlx5_ib_query_q_counters(dev->mdev, port, stats); if (ret) return ret; - num_counters = port->cnts.num_q_counters; if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, + &mdev_port_num); + if (!mdev) { + /* If port is not affiliated yet, its in down state + * which doesn't have any counters yet, so it would be + * zero. So no need to read from the HCA. + */ + goto done; + } ret = mlx5_lag_query_cong_counters(dev->mdev, stats->value + port->cnts.num_q_counters, port->cnts.num_cong_counters, port->cnts.offsets + port->cnts.num_q_counters); + + mlx5_ib_put_native_port_mdev(dev, port_num); if (ret) return ret; - num_counters += port->cnts.num_cong_counters; } +done: return num_counters; } @@ -3936,36 +4303,250 @@ mlx5_ib_get_vector_affinity(struct ib_device *ibdev, int comp_vector) return mlx5_get_vector_affinity(dev->mdev, comp_vector); } -static void *mlx5_ib_add(struct mlx5_core_dev *mdev) +/* The mlx5_ib_multiport_mutex should be held when calling this function */ +static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev, + struct mlx5_ib_multiport_info *mpi) { - struct mlx5_ib_dev *dev; - enum rdma_link_layer ll; - int port_type_cap; - const char *name; + u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1; + struct mlx5_ib_port *port = &ibdev->port[port_num]; + int comps; int err; int i; - port_type_cap = MLX5_CAP_GEN(mdev, port_type); - ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + mlx5_ib_cleanup_cong_debugfs(ibdev, port_num); - printk_once(KERN_INFO "%s", mlx5_version); + spin_lock(&port->mp.mpi_lock); + if (!mpi->ibdev) { + spin_unlock(&port->mp.mpi_lock); + return; + } + mpi->ibdev = NULL; - dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); - if (!dev) - return NULL; + spin_unlock(&port->mp.mpi_lock); + mlx5_remove_netdev_notifier(ibdev, port_num); + spin_lock(&port->mp.mpi_lock); - dev->mdev = mdev; + comps = mpi->mdev_refcnt; + if (comps) { + mpi->unaffiliate = true; + init_completion(&mpi->unref_comp); + spin_unlock(&port->mp.mpi_lock); + + for (i = 0; i < comps; i++) + wait_for_completion(&mpi->unref_comp); + + spin_lock(&port->mp.mpi_lock); + mpi->unaffiliate = false; + } + + port->mp.mpi = NULL; + + list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list); + + spin_unlock(&port->mp.mpi_lock); + + err = mlx5_nic_vport_unaffiliate_multiport(mpi->mdev); + + mlx5_ib_dbg(ibdev, "unaffiliated port %d\n", port_num + 1); + /* Log an error, still needed to cleanup the pointers and add + * it back to the list. + */ + if (err) + mlx5_ib_err(ibdev, "Failed to unaffiliate port %u\n", + port_num + 1); + + ibdev->roce[port_num].last_port_state = IB_PORT_DOWN; +} + +/* The mlx5_ib_multiport_mutex should be held when calling this function */ +static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev, + struct mlx5_ib_multiport_info *mpi) +{ + u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1; + int err; + + spin_lock(&ibdev->port[port_num].mp.mpi_lock); + if (ibdev->port[port_num].mp.mpi) { + mlx5_ib_warn(ibdev, "port %d already affiliated.\n", + port_num + 1); + spin_unlock(&ibdev->port[port_num].mp.mpi_lock); + return false; + } + + ibdev->port[port_num].mp.mpi = mpi; + mpi->ibdev = ibdev; + spin_unlock(&ibdev->port[port_num].mp.mpi_lock); + + err = mlx5_nic_vport_affiliate_multiport(ibdev->mdev, mpi->mdev); + if (err) + goto unbind; + + err = get_port_caps(ibdev, mlx5_core_native_port_num(mpi->mdev)); + if (err) + goto unbind; + + err = mlx5_add_netdev_notifier(ibdev, port_num); + if (err) { + mlx5_ib_err(ibdev, "failed adding netdev notifier for port %u\n", + port_num + 1); + goto unbind; + } + + err = mlx5_ib_init_cong_debugfs(ibdev, port_num); + if (err) + goto unbind; + + return true; + +unbind: + mlx5_ib_unbind_slave_port(ibdev, mpi); + return false; +} + +static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) +{ + int port_num = mlx5_core_native_port_num(dev->mdev) - 1; + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, + port_num + 1); + struct mlx5_ib_multiport_info *mpi; + int err; + int i; + + if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET) + return 0; + + err = mlx5_query_nic_vport_system_image_guid(dev->mdev, + &dev->sys_image_guid); + if (err) + return err; - dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port), + err = mlx5_nic_vport_enable_roce(dev->mdev); + if (err) + return err; + + mutex_lock(&mlx5_ib_multiport_mutex); + for (i = 0; i < dev->num_ports; i++) { + bool bound = false; + + /* build a stub multiport info struct for the native port. */ + if (i == port_num) { + mpi = kzalloc(sizeof(*mpi), GFP_KERNEL); + if (!mpi) { + mutex_unlock(&mlx5_ib_multiport_mutex); + mlx5_nic_vport_disable_roce(dev->mdev); + return -ENOMEM; + } + + mpi->is_master = true; + mpi->mdev = dev->mdev; + mpi->sys_image_guid = dev->sys_image_guid; + dev->port[i].mp.mpi = mpi; + mpi->ibdev = dev; + mpi = NULL; + continue; + } + + list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list, + list) { + if (dev->sys_image_guid == mpi->sys_image_guid && + (mlx5_core_native_port_num(mpi->mdev) - 1) == i) { + bound = mlx5_ib_bind_slave_port(dev, mpi); + } + + if (bound) { + dev_dbg(&mpi->mdev->pdev->dev, "removing port from unaffiliated list.\n"); + mlx5_ib_dbg(dev, "port %d bound\n", i + 1); + list_del(&mpi->list); + break; + } + } + if (!bound) { + get_port_caps(dev, i + 1); + mlx5_ib_dbg(dev, "no free port found for port %d\n", + i + 1); + } + } + + list_add_tail(&dev->ib_dev_list, &mlx5_ib_dev_list); + mutex_unlock(&mlx5_ib_multiport_mutex); + return err; +} + +static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev) +{ + int port_num = mlx5_core_native_port_num(dev->mdev) - 1; + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, + port_num + 1); + int i; + + if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET) + return; + + mutex_lock(&mlx5_ib_multiport_mutex); + for (i = 0; i < dev->num_ports; i++) { + if (dev->port[i].mp.mpi) { + /* Destroy the native port stub */ + if (i == port_num) { + kfree(dev->port[i].mp.mpi); + dev->port[i].mp.mpi = NULL; + } else { + mlx5_ib_dbg(dev, "unbinding port_num: %d\n", i + 1); + mlx5_ib_unbind_slave_port(dev, dev->port[i].mp.mpi); + } + } + } + + mlx5_ib_dbg(dev, "removing from devlist\n"); + list_del(&dev->ib_dev_list); + mutex_unlock(&mlx5_ib_multiport_mutex); + + mlx5_nic_vport_disable_roce(dev->mdev); +} + +static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_ib_cleanup_multiport_master(dev); +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + cleanup_srcu_struct(&dev->mr_srcu); +#endif + kfree(dev->port); +} + +static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + const char *name; + int err; + int i; + + dev->port = kcalloc(dev->num_ports, sizeof(*dev->port), GFP_KERNEL); if (!dev->port) - goto err_dealloc; + return -ENOMEM; + + for (i = 0; i < dev->num_ports; i++) { + spin_lock_init(&dev->port[i].mp.mpi_lock); + rwlock_init(&dev->roce[i].netdev_lock); + } - rwlock_init(&dev->roce.netdev_lock); - err = get_port_caps(dev); + err = mlx5_ib_init_multiport_master(dev); if (err) goto err_free_port; + if (!mlx5_core_mp_enabled(mdev)) { + int i; + + for (i = 1; i <= dev->num_ports; i++) { + err = get_port_caps(dev, i); + if (err) + break; + } + } else { + err = get_port_caps(dev, mlx5_core_native_port_num(mdev)); + } + if (err) + goto err_mp; + if (mlx5_use_mad_ifc(dev)) get_ext_port_caps(dev); @@ -3978,12 +4559,37 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; - dev->num_ports = MLX5_CAP_GEN(mdev, num_ports); - dev->ib_dev.phys_port_cnt = dev->num_ports; + dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.num_comp_vectors = dev->mdev->priv.eq_table.num_comp_vectors; dev->ib_dev.dev.parent = &mdev->pdev->dev; + mutex_init(&dev->flow_db.lock); + mutex_init(&dev->cap_mask_mutex); + INIT_LIST_HEAD(&dev->qp_list); + spin_lock_init(&dev->reset_flow_resource_lock); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + err = init_srcu_struct(&dev->mr_srcu); + if (err) + goto err_free_port; +#endif + + return 0; +err_mp: + mlx5_ib_cleanup_multiport_master(dev); + +err_free_port: + kfree(dev->port); + + return -ENOMEM; +} + +static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + int err; + dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; dev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | @@ -4022,8 +4628,6 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.query_device = mlx5_ib_query_device; dev->ib_dev.query_port = mlx5_ib_query_port; dev->ib_dev.get_link_layer = mlx5_ib_port_link_layer; - if (ll == IB_LINK_LAYER_ETHERNET) - dev->ib_dev.get_netdev = mlx5_ib_get_netdev; dev->ib_dev.query_gid = mlx5_ib_query_gid; dev->ib_dev.add_gid = mlx5_ib_add_gid; dev->ib_dev.del_gid = mlx5_ib_del_gid; @@ -4080,8 +4684,6 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext; - mlx5_ib_internal_fill_odp_caps(dev); - dev->umr_fence = mlx5_get_umr_fence(MLX5_CAP_GEN(mdev, umr_fence)); if (MLX5_CAP_GEN(mdev, imaicl)) { @@ -4092,11 +4694,6 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); } - if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) { - dev->ib_dev.get_hw_stats = mlx5_ib_get_hw_stats; - dev->ib_dev.alloc_hw_stats = mlx5_ib_alloc_hw_stats; - } - if (MLX5_CAP_GEN(mdev, xrc)) { dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; @@ -4111,8 +4708,39 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); - if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == - IB_LINK_LAYER_ETHERNET) { + err = init_node_data(dev); + if (err) + return err; + + if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && + (MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) || + MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) + mutex_init(&dev->lb_mutex); + + return 0; +} + +static int mlx5_ib_stage_roce_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + enum rdma_link_layer ll; + int port_type_cap; + u8 port_num; + int err; + int i; + + port_num = mlx5_core_native_port_num(dev->mdev) - 1; + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + if (ll == IB_LINK_LAYER_ETHERNET) { + for (i = 0; i < dev->num_ports; i++) { + dev->roce[i].dev = dev; + dev->roce[i].native_port_num = i + 1; + dev->roce[i].last_port_state = IB_PORT_DOWN; + } + + dev->ib_dev.get_netdev = mlx5_ib_get_netdev; dev->ib_dev.create_wq = mlx5_ib_create_wq; dev->ib_dev.modify_wq = mlx5_ib_modify_wq; dev->ib_dev.destroy_wq = mlx5_ib_destroy_wq; @@ -4124,143 +4752,329 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL); + err = mlx5_enable_eth(dev, port_num); + if (err) + return err; } - err = init_node_data(dev); - if (err) - goto err_free_port; - mutex_init(&dev->flow_db.lock); - mutex_init(&dev->cap_mask_mutex); - INIT_LIST_HEAD(&dev->qp_list); - spin_lock_init(&dev->reset_flow_resource_lock); + return 0; +} + +static void mlx5_ib_stage_roce_cleanup(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + enum rdma_link_layer ll; + int port_type_cap; + u8 port_num; + + port_num = mlx5_core_native_port_num(dev->mdev) - 1; + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); if (ll == IB_LINK_LAYER_ETHERNET) { - err = mlx5_enable_eth(dev); - if (err) - goto err_free_port; - dev->roce.last_port_state = IB_PORT_DOWN; + mlx5_disable_eth(dev); + mlx5_remove_netdev_notifier(dev, port_num); } +} - err = create_dev_resources(&dev->devr); - if (err) - goto err_disable_eth; +static int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev) +{ + return create_dev_resources(&dev->devr); +} - err = mlx5_ib_odp_init_one(dev); - if (err) - goto err_rsrc; +static void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev) +{ + destroy_dev_resources(&dev->devr); +} +static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev) +{ + mlx5_ib_internal_fill_odp_caps(dev); + + return mlx5_ib_odp_init_one(dev); +} + +static int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev) +{ if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) { - err = mlx5_ib_alloc_counters(dev); - if (err) - goto err_odp; + dev->ib_dev.get_hw_stats = mlx5_ib_get_hw_stats; + dev->ib_dev.alloc_hw_stats = mlx5_ib_alloc_hw_stats; + + return mlx5_ib_alloc_counters(dev); } - err = mlx5_ib_init_cong_debugfs(dev); - if (err) - goto err_cnt; + return 0; +} + +static void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) + mlx5_ib_dealloc_counters(dev); +} +static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev) +{ + return mlx5_ib_init_cong_debugfs(dev, + mlx5_core_native_port_num(dev->mdev) - 1); +} + +static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_ib_cleanup_cong_debugfs(dev, + mlx5_core_native_port_num(dev->mdev) - 1); +} + +static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev) +{ dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev); - if (IS_ERR(dev->mdev->priv.uar)) - goto err_cong; + if (!dev->mdev->priv.uar) + return -ENOMEM; + return 0; +} + +static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar); +} + +static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev) +{ + int err; err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false); if (err) - goto err_uar_page; + return err; err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true); if (err) - goto err_bfreg; + mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); - err = ib_register_device(&dev->ib_dev, NULL); - if (err) - goto err_fp_bfreg; + return err; +} - err = create_umr_res(dev); - if (err) - goto err_dev; +static void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); + mlx5_free_bfreg(dev->mdev, &dev->bfreg); +} + +static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) +{ + return ib_register_device(&dev->ib_dev, NULL); +} + +static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev) +{ + ib_unregister_device(&dev->ib_dev); +} +static int mlx5_ib_stage_umr_res_init(struct mlx5_ib_dev *dev) +{ + return create_umr_res(dev); +} + +static void mlx5_ib_stage_umr_res_cleanup(struct mlx5_ib_dev *dev) +{ + destroy_umrc_res(dev); +} + +static int mlx5_ib_stage_delay_drop_init(struct mlx5_ib_dev *dev) +{ init_delay_drop(dev); + return 0; +} + +static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev) +{ + cancel_delay_drop(dev); +} + +static int mlx5_ib_stage_class_attr_init(struct mlx5_ib_dev *dev) +{ + int err; + int i; + for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { err = device_create_file(&dev->ib_dev.dev, mlx5_class_attributes[i]); if (err) - goto err_delay_drop; + return err; } - if ((MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && - (MLX5_CAP_GEN(mdev, disable_local_lb_uc) || - MLX5_CAP_GEN(mdev, disable_local_lb_mc))) - mutex_init(&dev->lb_mutex); + return 0; +} + +static void __mlx5_ib_remove(struct mlx5_ib_dev *dev, + const struct mlx5_ib_profile *profile, + int stage) +{ + /* Number of stages to cleanup */ + while (stage) { + stage--; + if (profile->stage[stage].cleanup) + profile->stage[stage].cleanup(dev); + } + + ib_dealloc_device((struct ib_device *)dev); +} + +static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev, u8 port_num); + +static void *__mlx5_ib_add(struct mlx5_core_dev *mdev, + const struct mlx5_ib_profile *profile) +{ + struct mlx5_ib_dev *dev; + int err; + int i; + + printk_once(KERN_INFO "%s", mlx5_version); + dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); + if (!dev) + return NULL; + + dev->mdev = mdev; + dev->num_ports = max(MLX5_CAP_GEN(mdev, num_ports), + MLX5_CAP_GEN(mdev, num_vhca_ports)); + + for (i = 0; i < MLX5_IB_STAGE_MAX; i++) { + if (profile->stage[i].init) { + err = profile->stage[i].init(dev); + if (err) + goto err_out; + } + } + + dev->profile = profile; dev->ib_active = true; return dev; -err_delay_drop: - cancel_delay_drop(dev); - destroy_umrc_res(dev); +err_out: + __mlx5_ib_remove(dev, profile, i); -err_dev: - ib_unregister_device(&dev->ib_dev); + return NULL; +} -err_fp_bfreg: - mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); +static const struct mlx5_ib_profile pf_profile = { + STAGE_CREATE(MLX5_IB_STAGE_INIT, + mlx5_ib_stage_init_init, + mlx5_ib_stage_init_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CAPS, + mlx5_ib_stage_caps_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_ROCE, + mlx5_ib_stage_roce_init, + mlx5_ib_stage_roce_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, + mlx5_ib_stage_dev_res_init, + mlx5_ib_stage_dev_res_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_ODP, + mlx5_ib_stage_odp_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, + mlx5_ib_stage_counters_init, + mlx5_ib_stage_counters_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS, + mlx5_ib_stage_cong_debugfs_init, + mlx5_ib_stage_cong_debugfs_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_UAR, + mlx5_ib_stage_uar_init, + mlx5_ib_stage_uar_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_BFREG, + mlx5_ib_stage_bfrag_init, + mlx5_ib_stage_bfrag_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_IB_REG, + mlx5_ib_stage_ib_reg_init, + mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_UMR_RESOURCES, + mlx5_ib_stage_umr_res_init, + mlx5_ib_stage_umr_res_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP, + mlx5_ib_stage_delay_drop_init, + mlx5_ib_stage_delay_drop_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR, + mlx5_ib_stage_class_attr_init, + NULL), +}; -err_bfreg: - mlx5_free_bfreg(dev->mdev, &dev->bfreg); +static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev, u8 port_num) +{ + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_dev *dev; + bool bound = false; + int err; -err_uar_page: - mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar); + mpi = kzalloc(sizeof(*mpi), GFP_KERNEL); + if (!mpi) + return NULL; -err_cong: - mlx5_ib_cleanup_cong_debugfs(dev); -err_cnt: - if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) - mlx5_ib_dealloc_counters(dev); + mpi->mdev = mdev; -err_odp: - mlx5_ib_odp_remove_one(dev); + err = mlx5_query_nic_vport_system_image_guid(mdev, + &mpi->sys_image_guid); + if (err) { + kfree(mpi); + return NULL; + } -err_rsrc: - destroy_dev_resources(&dev->devr); + mutex_lock(&mlx5_ib_multiport_mutex); + list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) { + if (dev->sys_image_guid == mpi->sys_image_guid) + bound = mlx5_ib_bind_slave_port(dev, mpi); -err_disable_eth: - if (ll == IB_LINK_LAYER_ETHERNET) { - mlx5_disable_eth(dev); - mlx5_remove_netdev_notifier(dev); + if (bound) { + rdma_roce_rescan_device(&dev->ib_dev); + break; + } } -err_free_port: - kfree(dev->port); + if (!bound) { + list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list); + dev_dbg(&mdev->pdev->dev, "no suitable IB device found to bind to, added to unaffiliated list.\n"); + } else { + mlx5_ib_dbg(dev, "bound port %u\n", port_num + 1); + } + mutex_unlock(&mlx5_ib_multiport_mutex); -err_dealloc: - ib_dealloc_device((struct ib_device *)dev); + return mpi; +} - return NULL; +static void *mlx5_ib_add(struct mlx5_core_dev *mdev) +{ + enum rdma_link_layer ll; + int port_type_cap; + + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET) { + u8 port_num = mlx5_core_native_port_num(mdev) - 1; + + return mlx5_ib_add_slave_port(mdev, port_num); + } + + return __mlx5_ib_add(mdev, &pf_profile); } static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) { - struct mlx5_ib_dev *dev = context; - enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1); + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_dev *dev; - cancel_delay_drop(dev); - mlx5_remove_netdev_notifier(dev); - ib_unregister_device(&dev->ib_dev); - mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); - mlx5_free_bfreg(dev->mdev, &dev->bfreg); - mlx5_put_uars_page(dev->mdev, mdev->priv.uar); - mlx5_ib_cleanup_cong_debugfs(dev); - if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) - mlx5_ib_dealloc_counters(dev); - destroy_umrc_res(dev); - mlx5_ib_odp_remove_one(dev); - destroy_dev_resources(&dev->devr); - if (ll == IB_LINK_LAYER_ETHERNET) - mlx5_disable_eth(dev); - kfree(dev->port); - ib_dealloc_device(&dev->ib_dev); + if (mlx5_core_is_mp_slave(mdev)) { + mpi = context; + mutex_lock(&mlx5_ib_multiport_mutex); + if (mpi->ibdev) + mlx5_ib_unbind_slave_port(mpi->ibdev, mpi); + list_del(&mpi->list); + mutex_unlock(&mlx5_ib_multiport_mutex); + return; + } + + dev = context; + __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX); } static struct mlx5_interface mlx5_ib_interface = { @@ -4277,6 +5091,10 @@ static int __init mlx5_ib_init(void) { int err; + mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0); + if (!mlx5_ib_event_wq) + return -ENOMEM; + mlx5_ib_odp_init(); err = mlx5_register_interface(&mlx5_ib_interface); @@ -4287,6 +5105,7 @@ static int __init mlx5_ib_init(void) static void __exit mlx5_ib_cleanup(void) { mlx5_unregister_interface(&mlx5_ib_interface); + destroy_workqueue(mlx5_ib_event_wq); } module_init(mlx5_ib_init); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 2c5f3533bbc9..139385129973 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -70,15 +70,6 @@ enum { MLX5_IB_MMAP_CMD_MASK = 0xff, }; -enum mlx5_ib_mmap_cmd { - MLX5_IB_MMAP_REGULAR_PAGE = 0, - MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, - MLX5_IB_MMAP_WC_PAGE = 2, - MLX5_IB_MMAP_NC_PAGE = 3, - /* 5 is chosen in order to be compatible with old versions of libmlx5 */ - MLX5_IB_MMAP_CORE_CLOCK = 5, -}; - enum { MLX5_RES_SCAT_DATA32_CQE = 0x1, MLX5_RES_SCAT_DATA64_CQE = 0x2, @@ -112,6 +103,11 @@ enum { MLX5_TM_MAX_SGE = 1, }; +enum { + MLX5_IB_INVALID_UAR_INDEX = BIT(31), + MLX5_IB_INVALID_BFREG = BIT(31), +}; + struct mlx5_ib_vma_private_data { struct list_head list; struct vm_area_struct *vma; @@ -200,6 +196,8 @@ struct mlx5_ib_flow_db { * creates the actual hardware QP. */ #define MLX5_IB_QPT_HW_GSI IB_QPT_RESERVED2 +#define MLX5_IB_QPT_DCI IB_QPT_RESERVED3 +#define MLX5_IB_QPT_DCT IB_QPT_RESERVED4 #define MLX5_IB_WR_UMR IB_WR_RESERVED1 #define MLX5_IB_UMR_OCTOWORD 16 @@ -360,12 +358,18 @@ struct mlx5_bf { struct mlx5_sq_bfreg *bfreg; }; +struct mlx5_ib_dct { + struct mlx5_core_dct mdct; + u32 *in; +}; + struct mlx5_ib_qp { struct ib_qp ibqp; union { struct mlx5_ib_qp_trans trans_qp; struct mlx5_ib_raw_packet_qp raw_packet_qp; struct mlx5_ib_rss_qp rss_qp; + struct mlx5_ib_dct dct; }; struct mlx5_buf buf; @@ -404,6 +408,8 @@ struct mlx5_ib_qp { u32 rate_limit; u32 underlay_qpn; bool tunnel_offload_en; + /* storage for qp sub type when core qp type is IB_QPT_DRIVER */ + enum ib_qp_type qp_sub_type; }; struct mlx5_ib_cq_buf { @@ -636,10 +642,21 @@ struct mlx5_ib_counters { u32 num_q_counters; u32 num_cong_counters; u16 set_id; + bool set_id_valid; +}; + +struct mlx5_ib_multiport_info; + +struct mlx5_ib_multiport { + struct mlx5_ib_multiport_info *mpi; + /* To be held when accessing the multiport info */ + spinlock_t mpi_lock; }; struct mlx5_ib_port { struct mlx5_ib_counters cnts; + struct mlx5_ib_multiport mp; + struct mlx5_ib_dbg_cc_params *dbg_cc_params; }; struct mlx5_roce { @@ -651,12 +668,15 @@ struct mlx5_roce { struct notifier_block nb; atomic_t next_port; enum ib_port_state last_port_state; + struct mlx5_ib_dev *dev; + u8 native_port_num; }; struct mlx5_ib_dbg_param { int offset; struct mlx5_ib_dev *dev; struct dentry *dentry; + u8 port_num; }; enum mlx5_ib_dbg_cc_types { @@ -709,10 +729,50 @@ struct mlx5_ib_delay_drop { struct mlx5_ib_dbg_delay_drop *dbg; }; +enum mlx5_ib_stages { + MLX5_IB_STAGE_INIT, + MLX5_IB_STAGE_CAPS, + MLX5_IB_STAGE_ROCE, + MLX5_IB_STAGE_DEVICE_RESOURCES, + MLX5_IB_STAGE_ODP, + MLX5_IB_STAGE_COUNTERS, + MLX5_IB_STAGE_CONG_DEBUGFS, + MLX5_IB_STAGE_UAR, + MLX5_IB_STAGE_BFREG, + MLX5_IB_STAGE_IB_REG, + MLX5_IB_STAGE_UMR_RESOURCES, + MLX5_IB_STAGE_DELAY_DROP, + MLX5_IB_STAGE_CLASS_ATTR, + MLX5_IB_STAGE_MAX, +}; + +struct mlx5_ib_stage { + int (*init)(struct mlx5_ib_dev *dev); + void (*cleanup)(struct mlx5_ib_dev *dev); +}; + +#define STAGE_CREATE(_stage, _init, _cleanup) \ + .stage[_stage] = {.init = _init, .cleanup = _cleanup} + +struct mlx5_ib_profile { + struct mlx5_ib_stage stage[MLX5_IB_STAGE_MAX]; +}; + +struct mlx5_ib_multiport_info { + struct list_head list; + struct mlx5_ib_dev *ibdev; + struct mlx5_core_dev *mdev; + struct completion unref_comp; + u64 sys_image_guid; + u32 mdev_refcnt; + bool is_master; + bool unaffiliate; +}; + struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; - struct mlx5_roce roce; + struct mlx5_roce roce[MLX5_MAX_PORTS]; int num_ports; /* serialize update of capability mask */ @@ -746,12 +806,14 @@ struct mlx5_ib_dev { struct mlx5_sq_bfreg bfreg; struct mlx5_sq_bfreg fp_bfreg; struct mlx5_ib_delay_drop delay_drop; - struct mlx5_ib_dbg_cc_params *dbg_cc_params; + const struct mlx5_ib_profile *profile; /* protect the user_td */ struct mutex lb_mutex; u32 user_td; u8 umr_fence; + struct list_head ib_dev_list; + u64 sys_image_guid; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -956,13 +1018,14 @@ struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, struct ib_rwq_ind_table_init_attr *init_attr, struct ib_udata *udata); int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); +bool mlx5_ib_dc_atomic_is_supported(struct mlx5_ib_dev *dev); + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev); void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, struct mlx5_pagefault *pfault); int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); -void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev); int __init mlx5_ib_odp_init(void); void mlx5_ib_odp_cleanup(void); void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, @@ -977,7 +1040,6 @@ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) } static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; } -static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {} static inline int mlx5_ib_odp_init(void) { return 0; } static inline void mlx5_ib_odp_cleanup(void) {} static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {} @@ -1001,8 +1063,8 @@ __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num, int index, enum ib_gid_type *gid_type); -void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev); -int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev); +void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num); +int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num); /* GSI QP helper functions */ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, @@ -1021,6 +1083,15 @@ void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi); int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc); +void mlx5_ib_free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, + int bfregn); +struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi); +struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *dev, + u8 ib_port_num, + u8 *native_port_num); +void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *dev, + u8 port_num); + static inline void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; @@ -1052,8 +1123,8 @@ static inline u32 check_cq_create_flags(u32 flags) * It returns non-zero value for unsupported CQ * create flags, otherwise it returns zero. */ - return (flags & ~(IB_CQ_FLAGS_IGNORE_OVERRUN | - IB_CQ_FLAGS_TIMESTAMP_COMPLETION)); + return (flags & ~(IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN | + IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION)); } static inline int verify_assign_uidx(u8 cqe_version, u32 cmd_uidx, @@ -1113,10 +1184,10 @@ static inline int get_uars_per_sys_page(struct mlx5_ib_dev *dev, bool lib_suppor MLX5_UARS_IN_PAGE : 1; } -static inline int get_num_uars(struct mlx5_ib_dev *dev, - struct mlx5_bfreg_info *bfregi) +static inline int get_num_static_uars(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) { - return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_sys_pages; + return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_static_sys_pages; } #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index d109fe8290a7..556e015678de 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1206,6 +1206,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int err; bool use_umr = true; + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) + return ERR_PTR(-EINVAL); + mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", start, virt_addr, length, access_flags); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index e2197bdda89c..f1a87a690a4c 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -1207,10 +1207,6 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) { int ret; - ret = init_srcu_struct(&dev->mr_srcu); - if (ret) - return ret; - if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) { ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey); if (ret) { @@ -1222,11 +1218,6 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) return 0; } -void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *dev) -{ - cleanup_srcu_struct(&dev->mr_srcu); -} - int mlx5_ib_odp_init(void) { mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) - diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index cffe5966aef9..39d24bf694a8 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -493,7 +493,7 @@ enum { static int max_bfregs(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi) { - return get_num_uars(dev, bfregi) * MLX5_NON_FP_BFREGS_PER_UAR; + return get_num_static_uars(dev, bfregi) * MLX5_NON_FP_BFREGS_PER_UAR; } static int num_med_bfreg(struct mlx5_ib_dev *dev, @@ -581,7 +581,7 @@ static int alloc_bfreg(struct mlx5_ib_dev *dev, return bfregn; } -static void free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, int bfregn) +void mlx5_ib_free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, int bfregn) { mutex_lock(&bfregi->lock); bfregi->count[bfregn]--; @@ -613,6 +613,7 @@ static int to_mlx5_st(enum ib_qp_type type) case IB_QPT_XRC_TGT: return MLX5_QP_ST_XRC; case IB_QPT_SMI: return MLX5_QP_ST_QP0; case MLX5_IB_QPT_HW_GSI: return MLX5_QP_ST_QP1; + case MLX5_IB_QPT_DCI: return MLX5_QP_ST_DCI; case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; case IB_QPT_RAW_PACKET: case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; @@ -627,7 +628,8 @@ static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq); static int bfregn_to_uar_index(struct mlx5_ib_dev *dev, - struct mlx5_bfreg_info *bfregi, int bfregn) + struct mlx5_bfreg_info *bfregi, int bfregn, + bool dyn_bfreg) { int bfregs_per_sys_page; int index_of_sys_page; @@ -637,8 +639,16 @@ static int bfregn_to_uar_index(struct mlx5_ib_dev *dev, MLX5_NON_FP_BFREGS_PER_UAR; index_of_sys_page = bfregn / bfregs_per_sys_page; - offset = bfregn % bfregs_per_sys_page / MLX5_NON_FP_BFREGS_PER_UAR; + if (dyn_bfreg) { + index_of_sys_page += bfregi->num_static_sys_pages; + if (bfregn > bfregi->num_dyn_bfregs || + bfregi->sys_pages[index_of_sys_page] == MLX5_IB_INVALID_UAR_INDEX) { + mlx5_ib_dbg(dev, "Invalid dynamic uar index\n"); + return -EINVAL; + } + } + offset = bfregn % bfregs_per_sys_page / MLX5_NON_FP_BFREGS_PER_UAR; return bfregi->sys_pages[index_of_sys_page] + offset; } @@ -764,7 +774,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_create_qp ucmd; struct mlx5_ib_ubuffer *ubuffer = &base->ubuffer; int page_shift = 0; - int uar_index; + int uar_index = 0; int npages; u32 offset = 0; int bfregn; @@ -780,12 +790,20 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, } context = to_mucontext(pd->uobject->context); - /* - * TBD: should come from the verbs when we have the API - */ - if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + if (ucmd.flags & MLX5_QP_FLAG_BFREG_INDEX) { + uar_index = bfregn_to_uar_index(dev, &context->bfregi, + ucmd.bfreg_index, true); + if (uar_index < 0) + return uar_index; + + bfregn = MLX5_IB_INVALID_BFREG; + } else if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) { + /* + * TBD: should come from the verbs when we have the API + */ /* In CROSS_CHANNEL CQ and QP must use the same UAR */ bfregn = MLX5_CROSS_CHANNEL_BFREG; + } else { bfregn = alloc_bfreg(dev, &context->bfregi, MLX5_IB_LATENCY_CLASS_HIGH); if (bfregn < 0) { @@ -804,8 +822,10 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, } } - uar_index = bfregn_to_uar_index(dev, &context->bfregi, bfregn); mlx5_ib_dbg(dev, "bfregn 0x%x, uar_index 0x%x\n", bfregn, uar_index); + if (bfregn != MLX5_IB_INVALID_BFREG) + uar_index = bfregn_to_uar_index(dev, &context->bfregi, bfregn, + false); qp->rq.offset = 0; qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); @@ -845,7 +865,10 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, MLX5_SET(qpc, qpc, page_offset, offset); MLX5_SET(qpc, qpc, uar_page, uar_index); - resp->bfreg_index = adjust_bfregn(dev, &context->bfregi, bfregn); + if (bfregn != MLX5_IB_INVALID_BFREG) + resp->bfreg_index = adjust_bfregn(dev, &context->bfregi, bfregn); + else + resp->bfreg_index = MLX5_IB_INVALID_BFREG; qp->bfregn = bfregn; err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db); @@ -874,7 +897,8 @@ err_umem: ib_umem_release(ubuffer->umem); err_bfreg: - free_bfreg(dev, &context->bfregi, bfregn); + if (bfregn != MLX5_IB_INVALID_BFREG) + mlx5_ib_free_bfreg(dev, &context->bfregi, bfregn); return err; } @@ -887,7 +911,13 @@ static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd, mlx5_ib_db_unmap_user(context, &qp->db); if (base->ubuffer.umem) ib_umem_release(base->ubuffer.umem); - free_bfreg(dev, &context->bfregi, qp->bfregn); + + /* + * Free only the BFREGs which are handled by the kernel. + * BFREGs of UARs allocated dynamically are handled by user. + */ + if (qp->bfregn != MLX5_IB_INVALID_BFREG) + mlx5_ib_free_bfreg(dev, &context->bfregi, qp->bfregn); } static int create_kernel_qp(struct mlx5_ib_dev *dev, @@ -1015,6 +1045,7 @@ static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) { if (attr->srq || (attr->qp_type == IB_QPT_XRC_TGT) || + (attr->qp_type == MLX5_IB_QPT_DCI) || (attr->qp_type == IB_QPT_XRC_INI)) return MLX5_SRQ_RQ; else if (!qp->has_rq) @@ -2086,20 +2117,108 @@ static const char *ib_qp_type_str(enum ib_qp_type type) return "IB_QPT_RAW_PACKET"; case MLX5_IB_QPT_REG_UMR: return "MLX5_IB_QPT_REG_UMR"; + case IB_QPT_DRIVER: + return "IB_QPT_DRIVER"; case IB_QPT_MAX: default: return "Invalid QP type"; } } +static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct mlx5_ib_create_qp *ucmd) +{ + struct mlx5_ib_dev *dev; + struct mlx5_ib_qp *qp; + int err = 0; + u32 uidx = MLX5_IB_DEFAULT_UIDX; + void *dctc; + + if (!attr->srq || !attr->recv_cq) + return ERR_PTR(-EINVAL); + + dev = to_mdev(pd->device); + + err = get_qp_user_index(to_mucontext(pd->uobject->context), + ucmd, sizeof(*ucmd), &uidx); + if (err) + return ERR_PTR(err); + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->dct.in = kzalloc(MLX5_ST_SZ_BYTES(create_dct_in), GFP_KERNEL); + if (!qp->dct.in) { + err = -ENOMEM; + goto err_free; + } + + dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry); + qp->qp_sub_type = MLX5_IB_QPT_DCT; + MLX5_SET(dctc, dctc, pd, to_mpd(pd)->pdn); + MLX5_SET(dctc, dctc, srqn_xrqn, to_msrq(attr->srq)->msrq.srqn); + MLX5_SET(dctc, dctc, cqn, to_mcq(attr->recv_cq)->mcq.cqn); + MLX5_SET64(dctc, dctc, dc_access_key, ucmd->access_key); + MLX5_SET(dctc, dctc, user_index, uidx); + + qp->state = IB_QPS_RESET; + + return &qp->ibqp; +err_free: + kfree(qp); + return ERR_PTR(err); +} + +static int set_mlx_qp_type(struct mlx5_ib_dev *dev, + struct ib_qp_init_attr *init_attr, + struct mlx5_ib_create_qp *ucmd, + struct ib_udata *udata) +{ + enum { MLX_QP_FLAGS = MLX5_QP_FLAG_TYPE_DCT | MLX5_QP_FLAG_TYPE_DCI }; + int err; + + if (!udata) + return -EINVAL; + + if (udata->inlen < sizeof(*ucmd)) { + mlx5_ib_dbg(dev, "create_qp user command is smaller than expected\n"); + return -EINVAL; + } + err = ib_copy_from_udata(ucmd, udata, sizeof(*ucmd)); + if (err) + return err; + + if ((ucmd->flags & MLX_QP_FLAGS) == MLX5_QP_FLAG_TYPE_DCI) { + init_attr->qp_type = MLX5_IB_QPT_DCI; + } else { + if ((ucmd->flags & MLX_QP_FLAGS) == MLX5_QP_FLAG_TYPE_DCT) { + init_attr->qp_type = MLX5_IB_QPT_DCT; + } else { + mlx5_ib_dbg(dev, "Invalid QP flags\n"); + return -EINVAL; + } + } + + if (!MLX5_CAP_GEN(dev->mdev, dct)) { + mlx5_ib_dbg(dev, "DC transport is not supported\n"); + return -EOPNOTSUPP; + } + + return 0; +} + struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, - struct ib_qp_init_attr *init_attr, + struct ib_qp_init_attr *verbs_init_attr, struct ib_udata *udata) { struct mlx5_ib_dev *dev; struct mlx5_ib_qp *qp; u16 xrcdn = 0; int err; + struct ib_qp_init_attr mlx_init_attr; + struct ib_qp_init_attr *init_attr = verbs_init_attr; if (pd) { dev = to_mdev(pd->device); @@ -2124,6 +2243,26 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); } + if (init_attr->qp_type == IB_QPT_DRIVER) { + struct mlx5_ib_create_qp ucmd; + + init_attr = &mlx_init_attr; + memcpy(init_attr, verbs_init_attr, sizeof(*verbs_init_attr)); + err = set_mlx_qp_type(dev, init_attr, &ucmd, udata); + if (err) + return ERR_PTR(err); + + if (init_attr->qp_type == MLX5_IB_QPT_DCI) { + if (init_attr->cap.max_recv_wr || + init_attr->cap.max_recv_sge) { + mlx5_ib_dbg(dev, "DCI QP requires zero size receive queue\n"); + return ERR_PTR(-EINVAL); + } + } else { + return mlx5_ib_create_dct(pd, init_attr, &ucmd); + } + } + switch (init_attr->qp_type) { case IB_QPT_XRC_TGT: case IB_QPT_XRC_INI: @@ -2145,6 +2284,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, case IB_QPT_SMI: case MLX5_IB_QPT_HW_GSI: case MLX5_IB_QPT_REG_UMR: + case MLX5_IB_QPT_DCI: qp = kzalloc(sizeof(*qp), GFP_KERNEL); if (!qp) return ERR_PTR(-ENOMEM); @@ -2185,9 +2325,31 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, return ERR_PTR(-EINVAL); } + if (verbs_init_attr->qp_type == IB_QPT_DRIVER) + qp->qp_sub_type = init_attr->qp_type; + return &qp->ibqp; } +static int mlx5_ib_destroy_dct(struct mlx5_ib_qp *mqp) +{ + struct mlx5_ib_dev *dev = to_mdev(mqp->ibqp.device); + + if (mqp->state == IB_QPS_RTR) { + int err; + + err = mlx5_core_destroy_dct(dev->mdev, &mqp->dct.mdct); + if (err) { + mlx5_ib_warn(dev, "failed to destroy DCT %d\n", err); + return err; + } + } + + kfree(mqp->dct.in); + kfree(mqp); + return 0; +} + int mlx5_ib_destroy_qp(struct ib_qp *qp) { struct mlx5_ib_dev *dev = to_mdev(qp->device); @@ -2196,6 +2358,9 @@ int mlx5_ib_destroy_qp(struct ib_qp *qp) if (unlikely(qp->qp_type == IB_QPT_GSI)) return mlx5_ib_gsi_destroy_qp(qp); + if (mqp->qp_sub_type == MLX5_IB_QPT_DCT) + return mlx5_ib_destroy_dct(mqp); + destroy_qp_common(dev, mqp); kfree(mqp); @@ -2763,7 +2928,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (!context) return -ENOMEM; - err = to_mlx5_st(ibqp->qp_type); + err = to_mlx5_st(ibqp->qp_type == IB_QPT_DRIVER ? + qp->qp_sub_type : ibqp->qp_type); if (err < 0) { mlx5_ib_dbg(dev, "unsupported qp type %d\n", ibqp->qp_type); goto out; @@ -2796,8 +2962,9 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, (ibqp->qp_type == IB_QPT_XRC_INI) || (ibqp->qp_type == IB_QPT_XRC_TGT)) { if (mlx5_lag_is_active(dev->mdev)) { + u8 p = mlx5_core_native_port_num(dev->mdev); tx_affinity = (unsigned int)atomic_add_return(1, - &dev->roce.next_port) % + &dev->roce[p].next_port) % MLX5_MAX_PORTS + 1; context->flags |= cpu_to_be32(tx_affinity << 24); } @@ -2922,7 +3089,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, mlx5_cur = to_mlx5_state(cur_state); mlx5_new = to_mlx5_state(new_state); - mlx5_st = to_mlx5_st(ibqp->qp_type); + mlx5_st = to_mlx5_st(ibqp->qp_type == IB_QPT_DRIVER ? + qp->qp_sub_type : ibqp->qp_type); if (mlx5_st < 0) goto out; @@ -2994,6 +3162,139 @@ out: return err; } +static inline bool is_valid_mask(int mask, int req, int opt) +{ + if ((mask & req) != req) + return false; + + if (mask & ~(req | opt)) + return false; + + return true; +} + +/* check valid transition for driver QP types + * for now the only QP type that this function supports is DCI + */ +static bool modify_dci_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state new_state, + enum ib_qp_attr_mask attr_mask) +{ + int req = IB_QP_STATE; + int opt = 0; + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + req |= IB_QP_PKEY_INDEX | IB_QP_PORT; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) { + opt = IB_QP_PKEY_INDEX | IB_QP_PORT; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + req |= IB_QP_PATH_MTU; + opt = IB_QP_PKEY_INDEX; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) { + req |= IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC | IB_QP_SQ_PSN; + opt = IB_QP_MIN_RNR_TIMER; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_RTS && new_state == IB_QPS_RTS) { + opt = IB_QP_MIN_RNR_TIMER; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state != IB_QPS_RESET && new_state == IB_QPS_ERR) { + return is_valid_mask(attr_mask, req, opt); + } + return false; +} + +/* mlx5_ib_modify_dct: modify a DCT QP + * valid transitions are: + * RESET to INIT: must set access_flags, pkey_index and port + * INIT to RTR : must set min_rnr_timer, tclass, flow_label, + * mtu, gid_index and hop_limit + * Other transitions and attributes are illegal + */ +static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + enum ib_qp_state cur_state, new_state; + int err = 0; + int required = IB_QP_STATE; + void *dctc; + + if (!(attr_mask & IB_QP_STATE)) + return -EINVAL; + + cur_state = qp->state; + new_state = attr->qp_state; + + dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry); + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + required |= IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; + if (!is_valid_mask(attr_mask, required, 0)) + return -EINVAL; + + if (attr->port_num == 0 || + attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports)) { + mlx5_ib_dbg(dev, "invalid port number %d. number of ports is %d\n", + attr->port_num, dev->num_ports); + return -EINVAL; + } + if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) + MLX5_SET(dctc, dctc, rre, 1); + if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) + MLX5_SET(dctc, dctc, rwe, 1); + if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) { + if (!mlx5_ib_dc_atomic_is_supported(dev)) + return -EOPNOTSUPP; + MLX5_SET(dctc, dctc, rae, 1); + MLX5_SET(dctc, dctc, atomic_mode, MLX5_ATOMIC_MODE_DCT_CX); + } + MLX5_SET(dctc, dctc, pkey_index, attr->pkey_index); + MLX5_SET(dctc, dctc, port, attr->port_num); + MLX5_SET(dctc, dctc, counter_set_id, dev->port[attr->port_num - 1].cnts.set_id); + + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + struct mlx5_ib_modify_qp_resp resp = {}; + u32 min_resp_len = offsetof(typeof(resp), dctn) + + sizeof(resp.dctn); + + if (udata->outlen < min_resp_len) + return -EINVAL; + resp.response_length = min_resp_len; + + required |= IB_QP_MIN_RNR_TIMER | IB_QP_AV | IB_QP_PATH_MTU; + if (!is_valid_mask(attr_mask, required, 0)) + return -EINVAL; + MLX5_SET(dctc, dctc, min_rnr_nak, attr->min_rnr_timer); + MLX5_SET(dctc, dctc, tclass, attr->ah_attr.grh.traffic_class); + MLX5_SET(dctc, dctc, flow_label, attr->ah_attr.grh.flow_label); + MLX5_SET(dctc, dctc, mtu, attr->path_mtu); + MLX5_SET(dctc, dctc, my_addr_index, attr->ah_attr.grh.sgid_index); + MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit); + + err = mlx5_core_create_dct(dev->mdev, &qp->dct.mdct, qp->dct.in, + MLX5_ST_SZ_BYTES(create_dct_in)); + if (err) + return err; + resp.dctn = qp->dct.mdct.mqp.qpn; + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) { + mlx5_core_destroy_dct(dev->mdev, &qp->dct.mdct); + return err; + } + } else { + mlx5_ib_warn(dev, "Modify DCT: Invalid transition from %d to %d\n", cur_state, new_state); + return -EINVAL; + } + if (err) + qp->state = IB_QPS_ERR; + else + qp->state = new_state; + return err; +} + int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { @@ -3011,8 +3312,14 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (unlikely(ibqp->qp_type == IB_QPT_GSI)) return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask); - qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? - IB_QPT_GSI : ibqp->qp_type; + if (ibqp->qp_type == IB_QPT_DRIVER) + qp_type = qp->qp_sub_type; + else + qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? + IB_QPT_GSI : ibqp->qp_type; + + if (qp_type == MLX5_IB_QPT_DCT) + return mlx5_ib_modify_dct(ibqp, attr, attr_mask, udata); mutex_lock(&qp->mutex); @@ -3031,15 +3338,21 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; } } else if (qp_type != MLX5_IB_QPT_REG_UMR && - !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) { + qp_type != MLX5_IB_QPT_DCI && + !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) { mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", cur_state, new_state, ibqp->qp_type, attr_mask); goto out; + } else if (qp_type == MLX5_IB_QPT_DCI && + !modify_dci_qp_is_ok(cur_state, new_state, attr_mask)) { + mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", + cur_state, new_state, qp_type, attr_mask); + goto out; } if ((attr_mask & IB_QP_PORT) && (attr->port_num == 0 || - attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports))) { + attr->port_num > dev->num_ports)) { mlx5_ib_dbg(dev, "invalid port number %d. number of ports is %d\n", attr->port_num, dev->num_ports); goto out; @@ -4358,11 +4671,10 @@ static void to_rdma_ah_attr(struct mlx5_ib_dev *ibdev, struct rdma_ah_attr *ah_attr, struct mlx5_qp_path *path) { - struct mlx5_core_dev *dev = ibdev->mdev; memset(ah_attr, 0, sizeof(*ah_attr)); - if (!path->port || path->port > MLX5_CAP_GEN(dev, num_ports)) + if (!path->port || path->port > ibdev->num_ports) return; ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, path->port); @@ -4577,6 +4889,71 @@ out: return err; } +static int mlx5_ib_dct_query_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *mqp, + struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_core_dct *dct = &mqp->dct.mdct; + u32 *out; + u32 access_flags = 0; + int outlen = MLX5_ST_SZ_BYTES(query_dct_out); + void *dctc; + int err; + int supported_mask = IB_QP_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_PORT | + IB_QP_MIN_RNR_TIMER | + IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_PKEY_INDEX; + + if (qp_attr_mask & ~supported_mask) + return -EINVAL; + if (mqp->state != IB_QPS_RTR) + return -EINVAL; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_core_dct_query(dev->mdev, dct, out, outlen); + if (err) + goto out; + + dctc = MLX5_ADDR_OF(query_dct_out, out, dct_context_entry); + + if (qp_attr_mask & IB_QP_STATE) + qp_attr->qp_state = IB_QPS_RTR; + + if (qp_attr_mask & IB_QP_ACCESS_FLAGS) { + if (MLX5_GET(dctc, dctc, rre)) + access_flags |= IB_ACCESS_REMOTE_READ; + if (MLX5_GET(dctc, dctc, rwe)) + access_flags |= IB_ACCESS_REMOTE_WRITE; + if (MLX5_GET(dctc, dctc, rae)) + access_flags |= IB_ACCESS_REMOTE_ATOMIC; + qp_attr->qp_access_flags = access_flags; + } + + if (qp_attr_mask & IB_QP_PORT) + qp_attr->port_num = MLX5_GET(dctc, dctc, port); + if (qp_attr_mask & IB_QP_MIN_RNR_TIMER) + qp_attr->min_rnr_timer = MLX5_GET(dctc, dctc, min_rnr_nak); + if (qp_attr_mask & IB_QP_AV) { + qp_attr->ah_attr.grh.traffic_class = MLX5_GET(dctc, dctc, tclass); + qp_attr->ah_attr.grh.flow_label = MLX5_GET(dctc, dctc, flow_label); + qp_attr->ah_attr.grh.sgid_index = MLX5_GET(dctc, dctc, my_addr_index); + qp_attr->ah_attr.grh.hop_limit = MLX5_GET(dctc, dctc, hop_limit); + } + if (qp_attr_mask & IB_QP_PATH_MTU) + qp_attr->path_mtu = MLX5_GET(dctc, dctc, mtu); + if (qp_attr_mask & IB_QP_PKEY_INDEX) + qp_attr->pkey_index = MLX5_GET(dctc, dctc, pkey_index); +out: + kfree(out); + return err; +} + int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { @@ -4596,6 +4973,10 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, memset(qp_init_attr, 0, sizeof(*qp_init_attr)); memset(qp_attr, 0, sizeof(*qp_attr)); + if (unlikely(qp->qp_sub_type == MLX5_IB_QPT_DCT)) + return mlx5_ib_dct_query_qp(dev, qp, qp_attr, + qp_attr_mask, qp_init_attr); + mutex_lock(&qp->mutex); if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || @@ -4685,13 +5066,10 @@ int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd) int err; err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn); - if (err) { + if (err) mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn); - return err; - } kfree(xrcd); - return 0; } diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index c6fe89d79248..2fe503e86c1d 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -472,7 +472,7 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, goto out; } - ret = get_user_pages(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages, NULL); + ret = get_user_pages_fast(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages); if (ret < 0) goto out; @@ -623,13 +623,12 @@ int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type, page = dev->db_tab->page + end; alloc: - page->db_rec = dma_alloc_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE, - &page->mapping, GFP_KERNEL); + page->db_rec = dma_zalloc_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE, + &page->mapping, GFP_KERNEL); if (!page->db_rec) { ret = -ENOMEM; goto out; } - memset(page->db_rec, 0, MTHCA_ICM_PAGE_SIZE); ret = mthca_MAP_ICM_page(dev, page->mapping, mthca_uarc_virt(dev, &dev->driver_uar, i)); diff --git a/drivers/infiniband/hw/mthca/mthca_user.h b/drivers/infiniband/hw/mthca/mthca_user.h deleted file mode 100644 index 5fe56e810739..000000000000 --- a/drivers/infiniband/hw/mthca/mthca_user.h +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef MTHCA_USER_H -#define MTHCA_USER_H - -#include <linux/types.h> - -/* - * Increment this value if any changes that break userspace ABI - * compatibility are made. - */ -#define MTHCA_UVERBS_ABI_VERSION 1 - -/* - * Make sure that all structs defined in this file remain laid out so - * that they pack the same way on 32-bit and 64-bit architectures (to - * avoid incompatibility between 32-bit userspace and 64-bit kernels). - * In particular do not use pointer types -- pass pointers in __u64 - * instead. - */ - -struct mthca_alloc_ucontext_resp { - __u32 qp_tab_size; - __u32 uarc_size; -}; - -struct mthca_alloc_pd_resp { - __u32 pdn; - __u32 reserved; -}; - -struct mthca_reg_mr { -/* - * Mark the memory region with a DMA attribute that causes - * in-flight DMA to be flushed when the region is written to: - */ -#define MTHCA_MR_DMASYNC 0x1 - __u32 mr_attrs; - __u32 reserved; -}; - -struct mthca_create_cq { - __u32 lkey; - __u32 pdn; - __u64 arm_db_page; - __u64 set_db_page; - __u32 arm_db_index; - __u32 set_db_index; -}; - -struct mthca_create_cq_resp { - __u32 cqn; - __u32 reserved; -}; - -struct mthca_resize_cq { - __u32 lkey; - __u32 reserved; -}; - -struct mthca_create_srq { - __u32 lkey; - __u32 db_index; - __u64 db_page; -}; - -struct mthca_create_srq_resp { - __u32 srqn; - __u32 reserved; -}; - -struct mthca_create_qp { - __u32 lkey; - __u32 reserved; - __u64 sq_db_page; - __u64 rq_db_page; - __u32 sq_db_index; - __u32 rq_db_index; -}; - -#endif /* MTHCA_USER_H */ diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index c56ca2a74df5..6cdfbf8c5674 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1365,7 +1365,7 @@ static int mini_cm_del_listen(struct nes_cm_core *cm_core, static inline int mini_cm_accelerated(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) { - cm_node->accelerated = 1; + cm_node->accelerated = true; if (cm_node->accept_pend) { BUG_ON(!cm_node->listener); diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index d827d03e3941..b9cc02b4e8d5 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -279,7 +279,6 @@ struct nes_cm_tcp_context { u8 rcv_wscale; struct nes_cm_tsa_context tsa_cntxt; - struct timeval sent_ts; }; @@ -341,7 +340,7 @@ struct nes_cm_node { u16 mpa_frame_size; struct iw_cm_id *cm_id; struct list_head list; - int accelerated; + bool accelerated; struct nes_cm_listener *listener; enum nes_cm_conn_type conn_type; struct nes_vnic *nesvnic; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 0ba695a88b62..9904918589a4 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -380,11 +380,10 @@ static int ocrdma_alloc_q(struct ocrdma_dev *dev, q->len = len; q->entry_size = entry_size; q->size = len * entry_size; - q->va = dma_alloc_coherent(&dev->nic_info.pdev->dev, q->size, - &q->dma, GFP_KERNEL); + q->va = dma_zalloc_coherent(&dev->nic_info.pdev->dev, q->size, + &q->dma, GFP_KERNEL); if (!q->va) return -ENOMEM; - memset(q->va, 0, q->size); return 0; } @@ -1819,12 +1818,11 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq, return -ENOMEM; ocrdma_init_mch(&cmd->cmd.req, OCRDMA_CMD_CREATE_CQ, OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); - cq->va = dma_alloc_coherent(&pdev->dev, cq->len, &cq->pa, GFP_KERNEL); + cq->va = dma_zalloc_coherent(&pdev->dev, cq->len, &cq->pa, GFP_KERNEL); if (!cq->va) { status = -ENOMEM; goto mem_err; } - memset(cq->va, 0, cq->len); page_size = cq->len / hw_pages; cmd->cmd.pgsz_pgcnt = (page_size / OCRDMA_MIN_Q_PAGE_SIZE) << OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT; @@ -2212,10 +2210,9 @@ static int ocrdma_set_create_qp_sq_cmd(struct ocrdma_create_qp_req *cmd, qp->sq.max_cnt = max_wqe_allocated; len = (hw_pages * hw_page_size); - qp->sq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL); + qp->sq.va = dma_zalloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL); if (!qp->sq.va) return -EINVAL; - memset(qp->sq.va, 0, len); qp->sq.len = len; qp->sq.pa = pa; qp->sq.entry_size = dev->attr.wqe_size; @@ -2263,10 +2260,9 @@ static int ocrdma_set_create_qp_rq_cmd(struct ocrdma_create_qp_req *cmd, qp->rq.max_cnt = max_rqe_allocated; len = (hw_pages * hw_page_size); - qp->rq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL); + qp->rq.va = dma_zalloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL); if (!qp->rq.va) return -ENOMEM; - memset(qp->rq.va, 0, len); qp->rq.pa = pa; qp->rq.len = len; qp->rq.entry_size = dev->attr.rqe_size; @@ -2320,11 +2316,10 @@ static int ocrdma_set_create_qp_ird_cmd(struct ocrdma_create_qp_req *cmd, if (dev->attr.ird == 0) return 0; - qp->ird_q_va = dma_alloc_coherent(&pdev->dev, ird_q_len, - &pa, GFP_KERNEL); + qp->ird_q_va = dma_zalloc_coherent(&pdev->dev, ird_q_len, &pa, + GFP_KERNEL); if (!qp->ird_q_va) return -ENOMEM; - memset(qp->ird_q_va, 0, ird_q_len); ocrdma_build_q_pages(&cmd->ird_addr[0], dev->attr.num_ird_pages, pa, ird_page_size); for (; i < ird_q_len / dev->attr.rqe_size; i++) { diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c index e528d7acb7f6..24d20a4aa262 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c @@ -73,15 +73,13 @@ bool ocrdma_alloc_stats_resources(struct ocrdma_dev *dev) mem->size = max_t(u32, sizeof(struct ocrdma_rdma_stats_req), sizeof(struct ocrdma_rdma_stats_resp)); - mem->va = dma_alloc_coherent(&dev->nic_info.pdev->dev, mem->size, - &mem->pa, GFP_KERNEL); + mem->va = dma_zalloc_coherent(&dev->nic_info.pdev->dev, mem->size, + &mem->pa, GFP_KERNEL); if (!mem->va) { pr_err("%s: stats mbox allocation failed\n", __func__); return false; } - memset(mem->va, 0, mem->size); - /* Alloc debugfs mem */ mem->debugfs_mem = kzalloc(OCRDMA_MAX_DBGFS_MEM, GFP_KERNEL); if (!mem->debugfs_mem) @@ -834,7 +832,7 @@ void ocrdma_add_port_stats(struct ocrdma_dev *dev) dev->reset_stats.type = OCRDMA_RESET_STATS; dev->reset_stats.dev = dev; - if (!debugfs_create_file("reset_stats", S_IRUSR, dev->dir, + if (!debugfs_create_file("reset_stats", 0200, dev->dir, &dev->reset_stats, &ocrdma_dbg_ops)) goto err; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 7866fd8051f6..8009bdad4e5b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -461,7 +461,7 @@ retry: static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx, struct ocrdma_pd *pd) { - return (uctx->cntxt_pd == pd ? true : false); + return (uctx->cntxt_pd == pd); } static int _ocrdma_dealloc_pd(struct ocrdma_dev *dev, @@ -550,13 +550,12 @@ struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev, INIT_LIST_HEAD(&ctx->mm_head); mutex_init(&ctx->mm_list_lock); - ctx->ah_tbl.va = dma_alloc_coherent(&pdev->dev, map_len, - &ctx->ah_tbl.pa, GFP_KERNEL); + ctx->ah_tbl.va = dma_zalloc_coherent(&pdev->dev, map_len, + &ctx->ah_tbl.pa, GFP_KERNEL); if (!ctx->ah_tbl.va) { kfree(ctx); return ERR_PTR(-ENOMEM); } - memset(ctx->ah_tbl.va, 0, map_len); ctx->ah_tbl.len = map_len; memset(&resp, 0, sizeof(resp)); @@ -885,13 +884,12 @@ static int ocrdma_build_pbl_tbl(struct ocrdma_dev *dev, struct ocrdma_hw_mr *mr) return -ENOMEM; for (i = 0; i < mr->num_pbls; i++) { - va = dma_alloc_coherent(&pdev->dev, dma_len, &pa, GFP_KERNEL); + va = dma_zalloc_coherent(&pdev->dev, dma_len, &pa, GFP_KERNEL); if (!va) { ocrdma_free_mr_pbl_tbl(dev, mr); status = -ENOMEM; break; } - memset(va, 0, dma_len); mr->pbl_table[i].va = va; mr->pbl_table[i].pa = pa; } diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index b26aa88dab48..53f00dbf313f 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -604,12 +604,11 @@ static struct qedr_pbl *qedr_alloc_pbl_tbl(struct qedr_dev *dev, return ERR_PTR(-ENOMEM); for (i = 0; i < pbl_info->num_pbls; i++) { - va = dma_alloc_coherent(&pdev->dev, pbl_info->pbl_size, - &pa, flags); + va = dma_zalloc_coherent(&pdev->dev, pbl_info->pbl_size, + &pa, flags); if (!va) goto err; - memset(va, 0, pbl_info->pbl_size); pbl_table[i].va = va; pbl_table[i].pa = pa; } @@ -3040,7 +3039,7 @@ static int __qedr_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, swqe->wqe_size = 2; swqe2 = qed_chain_produce(&qp->sq.pbl); - swqe->inv_key_or_imm_data = cpu_to_le32(wr->ex.imm_data); + swqe->inv_key_or_imm_data = cpu_to_le32(be32_to_cpu(wr->ex.imm_data)); length = qedr_prepare_sq_send_data(dev, qp, swqe, swqe2, wr, bad_wr); swqe->length = cpu_to_le32(length); @@ -3471,9 +3470,9 @@ static int qedr_poll_cq_req(struct qedr_dev *dev, break; case RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR: if (qp->state != QED_ROCE_QP_STATE_ERR) - DP_ERR(dev, - "Error: POLL CQ with RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR. CQ icid=0x%x, QP icid=0x%x\n", - cq->icid, qp->icid); + DP_DEBUG(dev, QEDR_MSG_CQ, + "Error: POLL CQ with RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR. CQ icid=0x%x, QP icid=0x%x\n", + cq->icid, qp->icid); cnt = process_req(dev, qp, cq, num_entries, wc, req->sq_cons, IB_WC_WR_FLUSH_ERR, 1); break; @@ -3591,7 +3590,7 @@ static inline int qedr_set_ok_cqe_resp_wc(struct rdma_cqe_responder *resp, wc->byte_len = le32_to_cpu(resp->length); if (resp->flags & QEDR_RESP_IMM) { - wc->ex.imm_data = le32_to_cpu(resp->imm_data_or_inv_r_Key); + wc->ex.imm_data = cpu_to_be32(le32_to_cpu(resp->imm_data_or_inv_r_Key)); wc->wc_flags |= IB_WC_WITH_IMM; if (resp->flags & QEDR_RESP_RDMA) diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 092ed8103842..0235f76bbc72 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -1428,8 +1428,6 @@ u64 qib_sps_ints(void); */ dma_addr_t qib_map_page(struct pci_dev *, struct page *, unsigned long, size_t, int); -const char *qib_get_unit_name(int unit); -const char *qib_get_card_name(struct rvt_dev_info *rdi); struct pci_dev *qib_get_pci_dev(struct rvt_dev_info *rdi); /* @@ -1488,15 +1486,15 @@ extern struct mutex qib_mutex; #define qib_dev_err(dd, fmt, ...) \ dev_err(&(dd)->pcidev->dev, "%s: " fmt, \ - qib_get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) #define qib_dev_warn(dd, fmt, ...) \ dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \ - qib_get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) #define qib_dev_porterr(dd, port, fmt, ...) \ dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \ - qib_get_unit_name((dd)->unit), (dd)->unit, (port), \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), (dd)->unit, (port), \ ##__VA_ARGS__) #define qib_devinfo(pcidev, fmt, ...) \ diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c index 33d3335385e8..3117cc5f2a9a 100644 --- a/drivers/infiniband/hw/qib/qib_driver.c +++ b/drivers/infiniband/hw/qib/qib_driver.c @@ -81,22 +81,6 @@ MODULE_DESCRIPTION("Intel IB driver"); struct qlogic_ib_stats qib_stats; -const char *qib_get_unit_name(int unit) -{ - static char iname[16]; - - snprintf(iname, sizeof(iname), "infinipath%u", unit); - return iname; -} - -const char *qib_get_card_name(struct rvt_dev_info *rdi) -{ - struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi); - struct qib_devdata *dd = container_of(ibdev, - struct qib_devdata, verbs_dev); - return qib_get_unit_name(dd->unit); -} - struct pci_dev *qib_get_pci_dev(struct rvt_dev_info *rdi) { struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi); diff --git a/drivers/infiniband/hw/qib/qib_eeprom.c b/drivers/infiniband/hw/qib/qib_eeprom.c index 33a2e74c8495..5838b3bf34b9 100644 --- a/drivers/infiniband/hw/qib/qib_eeprom.c +++ b/drivers/infiniband/hw/qib/qib_eeprom.c @@ -163,8 +163,7 @@ void qib_get_eeprom_info(struct qib_devdata *dd) if (bguid[6] == 0xff) { if (bguid[5] == 0xff) { qib_dev_err(dd, - "Can't set %s GUID from base, wraps to OUI!\n", - qib_get_unit_name(t)); + "Can't set GUID from base, wraps to OUI!\n"); dd->base_guid = 0; goto bail; } diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index b67df63bd64b..f7593b5e2b76 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -568,20 +568,16 @@ done: static int qib_set_part_key(struct qib_ctxtdata *rcd, u16 key) { struct qib_pportdata *ppd = rcd->ppd; - int i, any = 0, pidx = -1; + int i, pidx = -1; + bool any = false; u16 lkey = key & 0x7FFF; - int ret; - if (lkey == (QIB_DEFAULT_P_KEY & 0x7FFF)) { + if (lkey == (QIB_DEFAULT_P_KEY & 0x7FFF)) /* nothing to do; this key always valid */ - ret = 0; - goto bail; - } + return 0; - if (!lkey) { - ret = -EINVAL; - goto bail; - } + if (!lkey) + return -EINVAL; /* * Set the full membership bit, because it has to be @@ -594,18 +590,14 @@ static int qib_set_part_key(struct qib_ctxtdata *rcd, u16 key) for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) { if (!rcd->pkeys[i] && pidx == -1) pidx = i; - if (rcd->pkeys[i] == key) { - ret = -EEXIST; - goto bail; - } + if (rcd->pkeys[i] == key) + return -EEXIST; } - if (pidx == -1) { - ret = -EBUSY; - goto bail; - } - for (any = i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (pidx == -1) + return -EBUSY; + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { if (!ppd->pkeys[i]) { - any++; + any = true; continue; } if (ppd->pkeys[i] == key) { @@ -613,44 +605,34 @@ static int qib_set_part_key(struct qib_ctxtdata *rcd, u16 key) if (atomic_inc_return(pkrefs) > 1) { rcd->pkeys[pidx] = key; - ret = 0; - goto bail; - } else { - /* - * lost race, decrement count, catch below - */ - atomic_dec(pkrefs); - any++; + return 0; } + /* + * lost race, decrement count, catch below + */ + atomic_dec(pkrefs); + any = true; } - if ((ppd->pkeys[i] & 0x7FFF) == lkey) { + if ((ppd->pkeys[i] & 0x7FFF) == lkey) /* * It makes no sense to have both the limited and * full membership PKEY set at the same time since * the unlimited one will disable the limited one. */ - ret = -EEXIST; - goto bail; - } - } - if (!any) { - ret = -EBUSY; - goto bail; + return -EEXIST; } - for (any = i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (!any) + return -EBUSY; + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { if (!ppd->pkeys[i] && atomic_inc_return(&ppd->pkeyrefs[i]) == 1) { rcd->pkeys[pidx] = key; ppd->pkeys[i] = key; (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0); - ret = 0; - goto bail; + return 0; } } - ret = -EBUSY; - -bail: - return ret; + return -EBUSY; } /** diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 85dfbba427f6..3990f386aa32 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -1119,6 +1119,8 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) "Could not allocate unit ID: error %d\n", -ret); goto bail; } + rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s%d", "qib", dd->unit); + dd->int_counter = alloc_percpu(u64); if (!dd->int_counter) { ret = -ENOMEM; diff --git a/drivers/infiniband/hw/qib/qib_keys.c b/drivers/infiniband/hw/qib/qib_keys.c deleted file mode 100644 index 8fdf79f8d4e4..000000000000 --- a/drivers/infiniband/hw/qib/qib_keys.c +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Copyright (c) 2006, 2007, 2009 QLogic Corporation. All rights reserved. - * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "qib.h" - -/** - * qib_alloc_lkey - allocate an lkey - * @mr: memory region that this lkey protects - * @dma_region: 0->normal key, 1->restricted DMA key - * - * Returns 0 if successful, otherwise returns -errno. - * - * Increments mr reference count as required. - * - * Sets the lkey field mr for non-dma regions. - * - */ - -int qib_alloc_lkey(struct rvt_mregion *mr, int dma_region) -{ - unsigned long flags; - u32 r; - u32 n; - int ret = 0; - struct qib_ibdev *dev = to_idev(mr->pd->device); - struct rvt_lkey_table *rkt = &dev->lk_table; - - spin_lock_irqsave(&rkt->lock, flags); - - /* special case for dma_mr lkey == 0 */ - if (dma_region) { - struct rvt_mregion *tmr; - - tmr = rcu_access_pointer(dev->dma_mr); - if (!tmr) { - qib_get_mr(mr); - rcu_assign_pointer(dev->dma_mr, mr); - mr->lkey_published = 1; - } - goto success; - } - - /* Find the next available LKEY */ - r = rkt->next; - n = r; - for (;;) { - if (rkt->table[r] == NULL) - break; - r = (r + 1) & (rkt->max - 1); - if (r == n) - goto bail; - } - rkt->next = (r + 1) & (rkt->max - 1); - /* - * Make sure lkey is never zero which is reserved to indicate an - * unrestricted LKEY. - */ - rkt->gen++; - /* - * bits are capped in qib_verbs.c to insure enough bits - * for generation number - */ - mr->lkey = (r << (32 - ib_rvt_lkey_table_size)) | - ((((1 << (24 - ib_rvt_lkey_table_size)) - 1) & rkt->gen) - << 8); - if (mr->lkey == 0) { - mr->lkey |= 1 << 8; - rkt->gen++; - } - qib_get_mr(mr); - rcu_assign_pointer(rkt->table[r], mr); - mr->lkey_published = 1; -success: - spin_unlock_irqrestore(&rkt->lock, flags); -out: - return ret; -bail: - spin_unlock_irqrestore(&rkt->lock, flags); - ret = -ENOMEM; - goto out; -} - -/** - * qib_free_lkey - free an lkey - * @mr: mr to free from tables - */ -void qib_free_lkey(struct rvt_mregion *mr) -{ - unsigned long flags; - u32 lkey = mr->lkey; - u32 r; - struct qib_ibdev *dev = to_idev(mr->pd->device); - struct rvt_lkey_table *rkt = &dev->lk_table; - - spin_lock_irqsave(&rkt->lock, flags); - if (!mr->lkey_published) - goto out; - if (lkey == 0) - RCU_INIT_POINTER(dev->dma_mr, NULL); - else { - r = lkey >> (32 - ib_rvt_lkey_table_size); - RCU_INIT_POINTER(rkt->table[r], NULL); - } - qib_put_mr(mr); - mr->lkey_published = 0; -out: - spin_unlock_irqrestore(&rkt->lock, flags); -} - -/** - * qib_rkey_ok - check the IB virtual address, length, and RKEY - * @qp: qp for validation - * @sge: SGE state - * @len: length of data - * @vaddr: virtual address to place data - * @rkey: rkey to check - * @acc: access flags - * - * Return 1 if successful, otherwise 0. - * - * increments the reference count upon success - */ -int qib_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge, - u32 len, u64 vaddr, u32 rkey, int acc) -{ - struct rvt_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table; - struct rvt_mregion *mr; - unsigned n, m; - size_t off; - - /* We use RKEY == zero for kernel virtual addresses */ - rcu_read_lock(); - if (rkey == 0) { - struct rvt_pd *pd = ibpd_to_rvtpd(qp->ibqp.pd); - struct qib_ibdev *dev = to_idev(pd->ibpd.device); - - if (pd->user) - goto bail; - mr = rcu_dereference(dev->dma_mr); - if (!mr) - goto bail; - if (unlikely(!atomic_inc_not_zero(&mr->refcount))) - goto bail; - rcu_read_unlock(); - - sge->mr = mr; - sge->vaddr = (void *) vaddr; - sge->length = len; - sge->sge_length = len; - sge->m = 0; - sge->n = 0; - goto ok; - } - - mr = rcu_dereference( - rkt->table[(rkey >> (32 - ib_rvt_lkey_table_size))]); - if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd)) - goto bail; - - off = vaddr - mr->iova; - if (unlikely(vaddr < mr->iova || off + len > mr->length || - (mr->access_flags & acc) == 0)) - goto bail; - if (unlikely(!atomic_inc_not_zero(&mr->refcount))) - goto bail; - rcu_read_unlock(); - - off += mr->offset; - if (mr->page_shift) { - /* - page sizes are uniform power of 2 so no loop is necessary - entries_spanned_by_off is the number of times the loop below - would have executed. - */ - size_t entries_spanned_by_off; - - entries_spanned_by_off = off >> mr->page_shift; - off -= (entries_spanned_by_off << mr->page_shift); - m = entries_spanned_by_off / RVT_SEGSZ; - n = entries_spanned_by_off % RVT_SEGSZ; - } else { - m = 0; - n = 0; - while (off >= mr->map[m]->segs[n].length) { - off -= mr->map[m]->segs[n].length; - n++; - if (n >= RVT_SEGSZ) { - m++; - n = 0; - } - } - } - sge->mr = mr; - sge->vaddr = mr->map[m]->segs[n].vaddr + off; - sge->length = mr->map[m]->segs[n].length - off; - sge->sge_length = len; - sge->m = m; - sge->n = n; -ok: - return 1; -bail: - rcu_read_unlock(); - return 0; -} - diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 1a785c37ad0a..cfddff45413f 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -432,13 +432,13 @@ no_flow_control: qp->s_state = OP(COMPARE_SWAP); put_ib_ateth_swap(wqe->atomic_wr.swap, &ohdr->u.atomic_eth); - put_ib_ateth_swap(wqe->atomic_wr.compare_add, - &ohdr->u.atomic_eth); + put_ib_ateth_compare(wqe->atomic_wr.compare_add, + &ohdr->u.atomic_eth); } else { qp->s_state = OP(FETCH_ADD); put_ib_ateth_swap(wqe->atomic_wr.compare_add, &ohdr->u.atomic_eth); - put_ib_ateth_swap(0, &ohdr->u.atomic_eth); + put_ib_ateth_compare(0, &ohdr->u.atomic_eth); } put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr, &ohdr->u.atomic_eth); diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index c55000501582..fabee760407e 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1571,7 +1571,6 @@ int qib_register_ib_device(struct qib_devdata *dd) if (!ib_qib_sys_image_guid) ib_qib_sys_image_guid = ppd->guid; - strlcpy(ibdev->name, "qib%d", IB_DEVICE_NAME_MAX); ibdev->owner = THIS_MODULE; ibdev->node_guid = ppd->guid; ibdev->phys_port_cnt = dd->num_pports; @@ -1586,7 +1585,6 @@ int qib_register_ib_device(struct qib_devdata *dd) * Fill in rvt info object. */ dd->verbs_dev.rdi.driver_f.port_callback = qib_create_port_files; - dd->verbs_dev.rdi.driver_f.get_card_name = qib_get_card_name; dd->verbs_dev.rdi.driver_f.get_pci_dev = qib_get_pci_dev; dd->verbs_dev.rdi.driver_f.check_ah = qib_check_ah; dd->verbs_dev.rdi.driver_f.check_send_wqe = qib_check_send_wqe; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c index 685ef2293cb8..4210ca14014d 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c @@ -45,7 +45,6 @@ #include "usnic_ib_verbs.h" #include "usnic_ib_sysfs.h" #include "usnic_log.h" -#include "usnic_ib_sysfs.h" static ssize_t usnic_ib_show_board(struct device *device, struct device_attribute *attr, diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index aa2456a4f9bd..a688a5669168 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -47,7 +47,6 @@ #include "usnic_log.h" #include "usnic_uiom.h" #include "usnic_transport.h" -#include "usnic_ib_verbs.h" #define USNIC_DEFAULT_TRANSPORT USNIC_TRANSPORT_ROCE_CUSTOM diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h index 4f7bd3b6a315..44cb1cfba417 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h @@ -93,7 +93,7 @@ struct pvrdma_cq { struct pvrdma_page_dir pdir; u32 cq_handle; bool is_kernel; - atomic_t refcnt; + refcount_t refcnt; struct completion free; }; @@ -196,7 +196,7 @@ struct pvrdma_qp { u8 state; bool is_kernel; struct mutex mutex; /* QP state mutex. */ - atomic_t refcnt; + refcount_t refcnt; struct completion free; }; diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index e529622cefad..faa9478c14a6 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -132,8 +132,9 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, } cq->ibcq.cqe = entries; + cq->is_kernel = !context; - if (context) { + if (!cq->is_kernel) { if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { ret = -EFAULT; goto err_cq; @@ -148,8 +149,6 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, npages = ib_umem_page_count(cq->umem); } else { - cq->is_kernel = true; - /* One extra page for shared ring state */ npages = 1 + (entries * sizeof(struct pvrdma_cqe) + PAGE_SIZE - 1) / PAGE_SIZE; @@ -178,7 +177,7 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, else pvrdma_page_dir_insert_umem(&cq->pdir, cq->umem, 0); - atomic_set(&cq->refcnt, 1); + refcount_set(&cq->refcnt, 1); init_completion(&cq->free); spin_lock_init(&cq->cq_lock); @@ -202,7 +201,7 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, dev->cq_tbl[cq->cq_handle % dev->dsr->caps.max_cq] = cq; spin_unlock_irqrestore(&dev->cq_tbl_lock, flags); - if (context) { + if (!cq->is_kernel) { cq->uar = &(to_vucontext(context)->uar); /* Copy udata back. */ @@ -219,7 +218,7 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, err_page_dir: pvrdma_page_dir_cleanup(dev, &cq->pdir); err_umem: - if (context) + if (!cq->is_kernel) ib_umem_release(cq->umem); err_cq: atomic_dec(&dev->num_cqs); @@ -230,7 +229,7 @@ err_cq: static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq) { - if (atomic_dec_and_test(&cq->refcnt)) + if (refcount_dec_and_test(&cq->refcnt)) complete(&cq->free); wait_for_completion(&cq->free); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index e92681878c93..d650a9fcde24 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -243,13 +243,13 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) mutex_init(&dev->port_mutex); spin_lock_init(&dev->desc_lock); - dev->cq_tbl = kcalloc(dev->dsr->caps.max_cq, sizeof(void *), + dev->cq_tbl = kcalloc(dev->dsr->caps.max_cq, sizeof(struct pvrdma_cq *), GFP_KERNEL); if (!dev->cq_tbl) return ret; spin_lock_init(&dev->cq_tbl_lock); - dev->qp_tbl = kcalloc(dev->dsr->caps.max_qp, sizeof(void *), + dev->qp_tbl = kcalloc(dev->dsr->caps.max_qp, sizeof(struct pvrdma_qp *), GFP_KERNEL); if (!dev->qp_tbl) goto err_cq_free; @@ -333,7 +333,7 @@ static void pvrdma_qp_event(struct pvrdma_dev *dev, u32 qpn, int type) spin_lock_irqsave(&dev->qp_tbl_lock, flags); qp = dev->qp_tbl[qpn % dev->dsr->caps.max_qp]; if (qp) - atomic_inc(&qp->refcnt); + refcount_inc(&qp->refcnt); spin_unlock_irqrestore(&dev->qp_tbl_lock, flags); if (qp && qp->ibqp.event_handler) { @@ -346,7 +346,7 @@ static void pvrdma_qp_event(struct pvrdma_dev *dev, u32 qpn, int type) ibqp->event_handler(&e, ibqp->qp_context); } if (qp) { - if (atomic_dec_and_test(&qp->refcnt)) + if (refcount_dec_and_test(&qp->refcnt)) complete(&qp->free); } } @@ -359,7 +359,7 @@ static void pvrdma_cq_event(struct pvrdma_dev *dev, u32 cqn, int type) spin_lock_irqsave(&dev->cq_tbl_lock, flags); cq = dev->cq_tbl[cqn % dev->dsr->caps.max_cq]; if (cq) - atomic_inc(&cq->refcnt); + refcount_inc(&cq->refcnt); spin_unlock_irqrestore(&dev->cq_tbl_lock, flags); if (cq && cq->ibcq.event_handler) { @@ -372,7 +372,7 @@ static void pvrdma_cq_event(struct pvrdma_dev *dev, u32 cqn, int type) ibcq->event_handler(&e, ibcq->cq_context); } if (cq) { - if (atomic_dec_and_test(&cq->refcnt)) + if (refcount_dec_and_test(&cq->refcnt)) complete(&cq->free); } } @@ -531,13 +531,13 @@ static irqreturn_t pvrdma_intrx_handler(int irq, void *dev_id) spin_lock_irqsave(&dev->cq_tbl_lock, flags); cq = dev->cq_tbl[cqne->info % dev->dsr->caps.max_cq]; if (cq) - atomic_inc(&cq->refcnt); + refcount_inc(&cq->refcnt); spin_unlock_irqrestore(&dev->cq_tbl_lock, flags); if (cq && cq->ibcq.comp_handler) cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); if (cq) { - if (atomic_dec_and_test(&cq->refcnt)) + if (refcount_dec_and_test(&cq->refcnt)) complete(&cq->free); } pvrdma_idx_ring_inc(&ring->cons_head, ring_slots); @@ -882,8 +882,8 @@ static int pvrdma_pci_probe(struct pci_dev *pdev, dev_info(&pdev->dev, "device version %d, driver version %d\n", dev->dsr_version, PVRDMA_VERSION); - dev->dsr = dma_alloc_coherent(&pdev->dev, sizeof(*dev->dsr), - &dev->dsrbase, GFP_KERNEL); + dev->dsr = dma_zalloc_coherent(&pdev->dev, sizeof(*dev->dsr), + &dev->dsrbase, GFP_KERNEL); if (!dev->dsr) { dev_err(&pdev->dev, "failed to allocate shared region\n"); ret = -ENOMEM; @@ -891,7 +891,6 @@ static int pvrdma_pci_probe(struct pci_dev *pdev, } /* Setup the shared region */ - memset(dev->dsr, 0, sizeof(*dev->dsr)); dev->dsr->driver_version = PVRDMA_VERSION; dev->dsr->gos_info.gos_bits = sizeof(void *) == 4 ? PVRDMA_GOS_BITS_32 : diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c index 8519f3212e52..fa96fa4fb829 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c @@ -119,10 +119,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, union pvrdma_cmd_resp rsp; struct pvrdma_cmd_create_mr *cmd = &req.create_mr; struct pvrdma_cmd_create_mr_resp *resp = &rsp.create_mr_resp; - int nchunks; int ret; - int entry; - struct scatterlist *sg; if (length == 0 || length > dev->dsr->caps.max_mr_size) { dev_warn(&dev->pdev->dev, "invalid mem region length\n"); @@ -137,13 +134,9 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return ERR_CAST(umem); } - nchunks = 0; - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) - nchunks += sg_dma_len(sg) >> PAGE_SHIFT; - - if (nchunks < 0 || nchunks > PVRDMA_PAGE_DIR_MAX_PAGES) { + if (umem->npages < 0 || umem->npages > PVRDMA_PAGE_DIR_MAX_PAGES) { dev_warn(&dev->pdev->dev, "overflow %d pages in mem region\n", - nchunks); + umem->npages); ret = -EINVAL; goto err_umem; } @@ -158,7 +151,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mr->mmr.size = length; mr->umem = umem; - ret = pvrdma_page_dir_init(dev, &mr->pdir, nchunks, false); + ret = pvrdma_page_dir_init(dev, &mr->pdir, umem->npages, false); if (ret) { dev_warn(&dev->pdev->dev, "could not allocate page directory\n"); @@ -175,7 +168,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, cmd->length = length; cmd->pd_handle = to_vpd(pd)->pd_handle; cmd->access_flags = access_flags; - cmd->nchunks = nchunks; + cmd->nchunks = umem->npages; cmd->pdir_dma = mr->pdir.dir_dma; ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_MR_RESP); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c index 4059308e1454..7bf518bdbf21 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c @@ -245,12 +245,13 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd, spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); mutex_init(&qp->mutex); - atomic_set(&qp->refcnt, 1); + refcount_set(&qp->refcnt, 1); init_completion(&qp->free); qp->state = IB_QPS_RESET; + qp->is_kernel = !(pd->uobject && udata); - if (pd->uobject && udata) { + if (!qp->is_kernel) { dev_dbg(&dev->pdev->dev, "create queuepair from user space\n"); @@ -291,8 +292,6 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd, qp->npages_recv = 0; qp->npages = qp->npages_send + qp->npages_recv; } else { - qp->is_kernel = true; - ret = pvrdma_set_sq_size(to_vdev(pd->device), &init_attr->cap, qp); if (ret) @@ -394,7 +393,7 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd, err_pdir: pvrdma_page_dir_cleanup(dev, &qp->pdir); err_umem: - if (pd->uobject && udata) { + if (!qp->is_kernel) { if (qp->rumem) ib_umem_release(qp->rumem); if (qp->sumem) @@ -428,7 +427,7 @@ static void pvrdma_free_qp(struct pvrdma_qp *qp) pvrdma_unlock_cqs(scq, rcq, &scq_flags, &rcq_flags); - if (atomic_dec_and_test(&qp->refcnt)) + if (refcount_dec_and_test(&qp->refcnt)) complete(&qp->free); wait_for_completion(&qp->free); diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 97d71e49c092..fb52b669bfce 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -56,7 +56,7 @@ * rvt_cq_enter - add a new entry to the completion queue * @cq: completion queue * @entry: work completion entry to add - * @sig: true if @entry is solicited + * @solicited: true if @entry is solicited * * This may be called with qp->s_lock held. */ @@ -101,8 +101,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) wc->uqueue[head].opcode = entry->opcode; wc->uqueue[head].vendor_err = entry->vendor_err; wc->uqueue[head].byte_len = entry->byte_len; - wc->uqueue[head].ex.imm_data = - (__u32 __force)entry->ex.imm_data; + wc->uqueue[head].ex.imm_data = entry->ex.imm_data; wc->uqueue[head].qp_num = entry->qp->qp_num; wc->uqueue[head].src_qp = entry->src_qp; wc->uqueue[head].wc_flags = entry->wc_flags; @@ -198,7 +197,7 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, return ERR_PTR(-EINVAL); /* Allocate the completion queue structure. */ - cq = kzalloc(sizeof(*cq), GFP_KERNEL); + cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, rdi->dparms.node); if (!cq) return ERR_PTR(-ENOMEM); @@ -214,7 +213,9 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, sz += sizeof(struct ib_uverbs_wc) * (entries + 1); else sz += sizeof(struct ib_wc) * (entries + 1); - wc = vmalloc_user(sz); + wc = udata ? + vmalloc_user(sz) : + vzalloc_node(sz, rdi->dparms.node); if (!wc) { ret = ERR_PTR(-ENOMEM); goto bail_cq; @@ -369,7 +370,9 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) sz += sizeof(struct ib_uverbs_wc) * (cqe + 1); else sz += sizeof(struct ib_wc) * (cqe + 1); - wc = vmalloc_user(sz); + wc = udata ? + vmalloc_user(sz) : + vzalloc_node(sz, rdi->dparms.node); if (!wc) return -ENOMEM; diff --git a/drivers/infiniband/sw/rdmavt/mcast.c b/drivers/infiniband/sw/rdmavt/mcast.c index b3a38c5e4cad..dd11c6fcd060 100644 --- a/drivers/infiniband/sw/rdmavt/mcast.c +++ b/drivers/infiniband/sw/rdmavt/mcast.c @@ -272,7 +272,7 @@ bail: /** * rvt_attach_mcast - attach a qp to a multicast group * @ibqp: Infiniband qp - * @igd: multicast guid + * @gid: multicast guid * @lid: multicast lid * * Return: 0 on success @@ -335,7 +335,7 @@ bail_mcast: /** * rvt_detach_mcast - remove a qp from a multicast group * @ibqp: Infiniband qp - * @igd: multicast guid + * @gid: multicast guid * @lid: multicast lid * * Return: 0 on success diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 42713511b53b..1b2e5362a3ff 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -768,7 +768,7 @@ bail: /** * rvt_map_phys_fmr - set up a fast memory region - * @ibmfr: the fast memory region to set up + * @ibfmr: the fast memory region to set up * @page_list: the list of pages to associate with the fast memory region * @list_len: the number of pages to associate with the fast memory region * @iova: the virtual address of the start of the fast memory region diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index eae84c216e2f..c82e6bb3d77c 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -269,7 +269,7 @@ no_qp_table: /** * free_all_qps - check for QPs still in use - * @qpt: the QP table to empty + * @rdi: rvt device info structure * * There should not be any QPs still in use. * Free memory for table. @@ -335,9 +335,9 @@ static inline unsigned mk_qpn(struct rvt_qpn_table *qpt, /** * alloc_qpn - Allocate the next available qpn or zero/one for QP type * IB_QPT_SMI/IB_QPT_GSI - *@rdi: rvt device info structure - *@qpt: queue pair number table pointer - *@port_num: IB port number, 1 based, comes from core + * @rdi: rvt device info structure + * @qpt: queue pair number table pointer + * @port_num: IB port number, 1 based, comes from core * * Return: The queue pair number */ @@ -1650,9 +1650,9 @@ static inline int rvt_qp_valid_operation( /** * rvt_qp_is_avail - determine queue capacity - * @qp - the qp - * @rdi - the rdmavt device - * @reserved_op - is reserved operation + * @qp: the qp + * @rdi: the rdmavt device + * @reserved_op: is reserved operation * * This assumes the s_hlock is held but the s_last * qp variable is uncontrolled. @@ -2074,6 +2074,7 @@ void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth) lockdep_assert_held(&qp->s_lock); qp->s_flags |= RVT_S_WAIT_RNR; to = rvt_aeth_to_usec(aeth); + trace_rvt_rnrnak_add(qp, to); hrtimer_start(&qp->s_rnr_timer, ns_to_ktime(1000 * to), HRTIMER_MODE_REL); } @@ -2103,17 +2104,14 @@ EXPORT_SYMBOL(rvt_stop_rc_timers); * stop an rnr timer and return if the timer * had been pending. */ -static int rvt_stop_rnr_timer(struct rvt_qp *qp) +static void rvt_stop_rnr_timer(struct rvt_qp *qp) { - int rval = 0; - lockdep_assert_held(&qp->s_lock); /* Remove QP from rnr timer */ if (qp->s_flags & RVT_S_WAIT_RNR) { qp->s_flags &= ~RVT_S_WAIT_RNR; - rval = hrtimer_try_to_cancel(&qp->s_rnr_timer); + trace_rvt_rnrnak_stop(qp, 0); } - return rval; } /** @@ -2166,6 +2164,7 @@ enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t) spin_lock_irqsave(&qp->s_lock, flags); rvt_stop_rnr_timer(qp); + trace_rvt_rnrnak_timeout(qp, 0); rdi->driver_f.schedule_send(qp); spin_unlock_irqrestore(&qp->s_lock, flags); return HRTIMER_NORESTART; @@ -2174,8 +2173,8 @@ EXPORT_SYMBOL(rvt_rc_rnr_retry); /** * rvt_qp_iter_init - initial for QP iteration - * @rdi - rvt devinfo - * @v - u64 value + * @rdi: rvt devinfo + * @v: u64 value * * This returns an iterator suitable for iterating QPs * in the system. diff --git a/drivers/infiniband/sw/rdmavt/srq.c b/drivers/infiniband/sw/rdmavt/srq.c index f7c48e9023de..3707952b4364 100644 --- a/drivers/infiniband/sw/rdmavt/srq.c +++ b/drivers/infiniband/sw/rdmavt/srq.c @@ -90,7 +90,7 @@ struct ib_srq *rvt_create_srq(struct ib_pd *ibpd, srq_init_attr->attr.max_wr > dev->dparms.props.max_srq_wr) return ERR_PTR(-EINVAL); - srq = kmalloc(sizeof(*srq), GFP_KERNEL); + srq = kzalloc_node(sizeof(*srq), GFP_KERNEL, dev->dparms.node); if (!srq) return ERR_PTR(-ENOMEM); @@ -101,7 +101,10 @@ struct ib_srq *rvt_create_srq(struct ib_pd *ibpd, srq->rq.max_sge = srq_init_attr->attr.max_sge; sz = sizeof(struct ib_sge) * srq->rq.max_sge + sizeof(struct rvt_rwqe); - srq->rq.wq = vmalloc_user(sizeof(struct rvt_rwq) + srq->rq.size * sz); + srq->rq.wq = udata ? + vmalloc_user(sizeof(struct rvt_rwq) + srq->rq.size * sz) : + vzalloc_node(sizeof(struct rvt_rwq) + srq->rq.size * sz, + dev->dparms.node); if (!srq->rq.wq) { ret = ERR_PTR(-ENOMEM); goto bail_srq; @@ -129,16 +132,12 @@ struct ib_srq *rvt_create_srq(struct ib_pd *ibpd, ret = ERR_PTR(err); goto bail_ip; } - } else { - srq->ip = NULL; } /* * ib_create_srq() will initialize srq->ibsrq. */ spin_lock_init(&srq->rq.lock); - srq->rq.wq->head = 0; - srq->rq.wq->tail = 0; srq->limit = srq_init_attr->attr.srq_limit; spin_lock(&dev->n_srqs_lock); @@ -200,7 +199,10 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, sz = sizeof(struct rvt_rwqe) + srq->rq.max_sge * sizeof(struct ib_sge); size = attr->max_wr + 1; - wq = vmalloc_user(sizeof(struct rvt_rwq) + size * sz); + wq = udata ? + vmalloc_user(sizeof(struct rvt_rwq) + size * sz) : + vzalloc_node(sizeof(struct rvt_rwq) + size * sz, + dev->dparms.node); if (!wq) return -ENOMEM; diff --git a/drivers/infiniband/sw/rdmavt/trace.h b/drivers/infiniband/sw/rdmavt/trace.h index bb4b1e710f22..36ddbd291ee0 100644 --- a/drivers/infiniband/sw/rdmavt/trace.h +++ b/drivers/infiniband/sw/rdmavt/trace.h @@ -45,8 +45,8 @@ * */ -#define RDI_DEV_ENTRY(rdi) __string(dev, rdi->driver_f.get_card_name(rdi)) -#define RDI_DEV_ASSIGN(rdi) __assign_str(dev, rdi->driver_f.get_card_name(rdi)) +#define RDI_DEV_ENTRY(rdi) __string(dev, rvt_get_ibdev_name(rdi)) +#define RDI_DEV_ASSIGN(rdi) __assign_str(dev, rvt_get_ibdev_name(rdi)) #include "trace_rvt.h" #include "trace_qp.h" diff --git a/drivers/infiniband/sw/rdmavt/trace_qp.h b/drivers/infiniband/sw/rdmavt/trace_qp.h index 4c77a3119bda..efc9d814b032 100644 --- a/drivers/infiniband/sw/rdmavt/trace_qp.h +++ b/drivers/infiniband/sw/rdmavt/trace_qp.h @@ -85,6 +85,48 @@ DEFINE_EVENT(rvt_qphash_template, rvt_qpremove, TP_PROTO(struct rvt_qp *qp, u32 bucket), TP_ARGS(qp, bucket)); +DECLARE_EVENT_CLASS( + rvt_rnrnak_template, + TP_PROTO(struct rvt_qp *qp, u32 to), + TP_ARGS(qp, to), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device)) + __field(u32, qpn) + __field(void *, hrtimer) + __field(u32, s_flags) + __field(u32, to) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->hrtimer = &qp->s_rnr_timer; + __entry->s_flags = qp->s_flags; + __entry->to = to; + ), + TP_printk( + "[%s] qpn 0x%x hrtimer 0x%p s_flags 0x%x timeout %u us", + __get_str(dev), + __entry->qpn, + __entry->hrtimer, + __entry->s_flags, + __entry->to + ) +); + +DEFINE_EVENT( + rvt_rnrnak_template, rvt_rnrnak_add, + TP_PROTO(struct rvt_qp *qp, u32 to), + TP_ARGS(qp, to)); + +DEFINE_EVENT( + rvt_rnrnak_template, rvt_rnrnak_timeout, + TP_PROTO(struct rvt_qp *qp, u32 to), + TP_ARGS(qp, to)); + +DEFINE_EVENT( + rvt_rnrnak_template, rvt_rnrnak_stop, + TP_PROTO(struct rvt_qp *qp, u32 to), + TP_ARGS(qp, to)); #endif /* __RVT_TRACE_QP_H */ diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 64bdd442078a..a4553b2b3696 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -224,7 +224,8 @@ static int rvt_modify_port(struct ib_device *ibdev, u8 port_num, * rvt_query_pkey - Return a pkey from the table at a given index * @ibdev: Verbs IB dev * @port_num: Port number, 1 based from ib core - * @intex: Index into pkey table + * @index: Index into pkey table + * @pkey: returned pkey from the port pkey table * * Return: 0 on failure pkey otherwise */ @@ -255,7 +256,7 @@ static int rvt_query_pkey(struct ib_device *ibdev, u8 port_num, u16 index, * rvt_query_gid - Return a gid from the table * @ibdev: Verbs IB dev * @port_num: Port number, 1 based from ib core - * @index: = Index in table + * @guid_index: Index in table * @gid: Gid to return * * Return: 0 on success @@ -297,8 +298,8 @@ static inline struct rvt_ucontext *to_iucontext(struct ib_ucontext /** * rvt_alloc_ucontext - Allocate a user context - * @ibdev: Vers IB dev - * @data: User data allocated + * @ibdev: Verbs IB dev + * @udata: User data allocated */ static struct ib_ucontext *rvt_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) @@ -413,7 +414,6 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb) * required for rdmavt to function. */ if ((!rdi->driver_f.port_callback) || - (!rdi->driver_f.get_card_name) || (!rdi->driver_f.get_pci_dev)) return -EINVAL; break; diff --git a/drivers/infiniband/sw/rdmavt/vt.h b/drivers/infiniband/sw/rdmavt/vt.h index f363505312be..8823b2e7aac6 100644 --- a/drivers/infiniband/sw/rdmavt/vt.h +++ b/drivers/infiniband/sw/rdmavt/vt.h @@ -63,19 +63,19 @@ #define rvt_pr_info(rdi, fmt, ...) \ __rvt_pr_info(rdi->driver_f.get_pci_dev(rdi), \ - rdi->driver_f.get_card_name(rdi), \ + rvt_get_ibdev_name(rdi), \ fmt, \ ##__VA_ARGS__) #define rvt_pr_warn(rdi, fmt, ...) \ __rvt_pr_warn(rdi->driver_f.get_pci_dev(rdi), \ - rdi->driver_f.get_card_name(rdi), \ + rvt_get_ibdev_name(rdi), \ fmt, \ ##__VA_ARGS__) #define rvt_pr_err(rdi, fmt, ...) \ __rvt_pr_err(rdi->driver_f.get_pci_dev(rdi), \ - rdi->driver_f.get_card_name(rdi), \ + rvt_get_ibdev_name(rdi), \ fmt, \ ##__VA_ARGS__) diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig index 320bffc980d8..bad4a576d7cf 100644 --- a/drivers/infiniband/sw/rxe/Kconfig +++ b/drivers/infiniband/sw/rxe/Kconfig @@ -1,8 +1,8 @@ config RDMA_RXE tristate "Software RDMA over Ethernet (RoCE) driver" depends on INET && PCI && INFINIBAND - depends on NET_UDP_TUNNEL - depends on CRYPTO_CRC32 + select NET_UDP_TUNNEL + select CRYPTO_CRC32 select DMA_VIRT_OPS ---help--- This driver implements the InfiniBand RDMA transport over diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 8c3d30b3092d..b7debb6f2eac 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -77,12 +77,6 @@ void rxe_release(struct kref *kref) ib_dealloc_device(&rxe->ib_dev); } -void rxe_dev_put(struct rxe_dev *rxe) -{ - kref_put(&rxe->ref_cnt, rxe_release); -} -EXPORT_SYMBOL_GPL(rxe_dev_put); - /* initialize rxe device parameters */ static int rxe_init_device_param(struct rxe_dev *rxe) { diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index 6447d736d5a4..7d232611303f 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -57,6 +57,7 @@ #include "rxe_hdr.h" #include "rxe_param.h" #include "rxe_verbs.h" +#include "rxe_loc.h" #define RXE_UVERBS_ABI_VERSION (1) @@ -95,7 +96,10 @@ void rxe_remove_all(void); int rxe_rcv(struct sk_buff *skb); -void rxe_dev_put(struct rxe_dev *rxe); +static inline void rxe_dev_put(struct rxe_dev *rxe) +{ + kref_put(&rxe->ref_cnt, rxe_release); +} struct rxe_dev *net_to_rxe(struct net_device *ndev); struct rxe_dev *get_rxe_by_name(const char *name); diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index d7472a442a2c..96c3a6c5c4b5 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -237,7 +237,6 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, void rxe_release(struct kref *kref); -void rxe_drain_req_pkts(struct rxe_qp *qp, bool notify); int rxe_completer(void *arg); int rxe_requester(void *arg); int rxe_responder(void *arg); diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 59dee10bebcb..159246b03867 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -82,7 +82,7 @@ struct rxe_dev *get_rxe_by_name(const char *name) } -struct rxe_recv_sockets recv_sockets; +static struct rxe_recv_sockets recv_sockets; struct device *rxe_dma_device(struct rxe_dev *rxe) { @@ -452,31 +452,26 @@ static void rxe_skb_tx_dtor(struct sk_buff *skb) int rxe_send(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, struct sk_buff *skb) { - struct sk_buff *nskb; struct rxe_av *av; int err; av = rxe_get_av(pkt); - nskb = skb_clone(skb, GFP_ATOMIC); - if (!nskb) - return -ENOMEM; - - nskb->destructor = rxe_skb_tx_dtor; - nskb->sk = pkt->qp->sk->sk; + skb->destructor = rxe_skb_tx_dtor; + skb->sk = pkt->qp->sk->sk; rxe_add_ref(pkt->qp); atomic_inc(&pkt->qp->skb_out); if (av->network_type == RDMA_NETWORK_IPV4) { - err = ip_local_out(dev_net(skb_dst(skb)->dev), nskb->sk, nskb); + err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); } else if (av->network_type == RDMA_NETWORK_IPV6) { - err = ip6_local_out(dev_net(skb_dst(skb)->dev), nskb->sk, nskb); + err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); } else { pr_err("Unknown layer 3 protocol: %d\n", av->network_type); atomic_dec(&pkt->qp->skb_out); rxe_drop_ref(pkt->qp); - kfree_skb(nskb); + kfree_skb(skb); return -EINVAL; } @@ -485,7 +480,6 @@ int rxe_send(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, struct sk_buff *skb) return -EAGAIN; } - kfree_skb(skb); return 0; } diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h index 1c06b3bfe1b6..728d8c71b36a 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.h +++ b/drivers/infiniband/sw/rxe/rxe_net.h @@ -43,7 +43,6 @@ struct rxe_recv_sockets { struct socket *sk6; }; -extern struct rxe_recv_sockets recv_sockets; extern struct notifier_block rxe_net_notifier; void rxe_release_udp_tunnel(struct socket *sk); diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 4469592b839d..137d6c0c49d4 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -824,9 +824,9 @@ void rxe_qp_destroy(struct rxe_qp *qp) } /* called when the last reference to the qp is dropped */ -void rxe_qp_cleanup(struct rxe_pool_entry *arg) +static void rxe_qp_do_cleanup(struct work_struct *work) { - struct rxe_qp *qp = container_of(arg, typeof(*qp), pelem); + struct rxe_qp *qp = container_of(work, typeof(*qp), cleanup_work.work); rxe_drop_all_mcast_groups(qp); @@ -859,3 +859,11 @@ void rxe_qp_cleanup(struct rxe_pool_entry *arg) kernel_sock_shutdown(qp->sk, SHUT_RDWR); sock_release(qp->sk); } + +/* called when the last reference to the qp is dropped */ +void rxe_qp_cleanup(struct rxe_pool_entry *arg) +{ + struct rxe_qp *qp = container_of(arg, typeof(*qp), pelem); + + execute_in_process_context(rxe_qp_do_cleanup, &qp->cleanup_work); +} diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index fb8c83e055e1..4c3f899241d4 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -336,7 +336,6 @@ static int rxe_match_dgid(struct rxe_dev *rxe, struct sk_buff *skb) { union ib_gid dgid; union ib_gid *pdgid; - u16 index; if (skb->protocol == htons(ETH_P_IP)) { ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, @@ -348,7 +347,7 @@ static int rxe_match_dgid(struct rxe_dev *rxe, struct sk_buff *skb) return ib_find_cached_gid_by_port(&rxe->ib_dev, pdgid, IB_GID_TYPE_ROCE_UDP_ENCAP, - 1, rxe->ndev, &index); + 1, rxe->ndev, NULL); } /* rxe_rcv is called from the interface driver */ diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 26a7f923045b..7bdaf71b8221 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -594,15 +594,8 @@ int rxe_requester(void *arg) rxe_add_ref(qp); next_wqe: - if (unlikely(!qp->valid)) { - rxe_drain_req_pkts(qp, true); + if (unlikely(!qp->valid || qp->req.state == QP_STATE_ERROR)) goto exit; - } - - if (unlikely(qp->req.state == QP_STATE_ERROR)) { - rxe_drain_req_pkts(qp, true); - goto exit; - } if (unlikely(qp->req.state == QP_STATE_RESET)) { qp->req.wqe_index = consumer_index(qp->sq.queue); diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 4240866a5331..d37bb9b97569 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -863,8 +863,7 @@ static enum resp_states do_complete(struct rxe_qp *qp, if (pkt->mask & RXE_IMMDT_MASK) { uwc->wc_flags |= IB_WC_WITH_IMM; - uwc->ex.imm_data = - (__u32 __force)immdt_imm(pkt); + uwc->ex.imm_data = immdt_imm(pkt); } if (pkt->mask & RXE_IETH_MASK) { @@ -1210,7 +1209,7 @@ static enum resp_states do_class_d1e_error(struct rxe_qp *qp) } } -void rxe_drain_req_pkts(struct rxe_qp *qp, bool notify) +static void rxe_drain_req_pkts(struct rxe_qp *qp, bool notify) { struct sk_buff *skb; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index d03002b9d84d..7210a784abb4 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -814,6 +814,8 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, struct ib_send_wr *wr, (queue_count(qp->sq.queue) > 1); rxe_run_task(&qp->req.task, must_sched); + if (unlikely(qp->req.state == QP_STATE_ERROR)) + rxe_run_task(&qp->comp.task, 1); return err; } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 0c2dbe45c729..1019f5e7dbdd 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -35,6 +35,7 @@ #define RXE_VERBS_H #include <linux/interrupt.h> +#include <linux/workqueue.h> #include <rdma/rdma_user_rxe.h> #include "rxe_pool.h" #include "rxe_task.h" @@ -281,6 +282,8 @@ struct rxe_qp { struct timer_list rnr_nak_timer; spinlock_t state_lock; /* guard requester and completer */ + + struct execute_work cleanup_work; }; enum rxe_mem_state { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 71ea9e26666c..962fbcb57dc7 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -766,12 +766,14 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ skb_orphan(skb); skb_dst_drop(skb); - if (netif_queue_stopped(dev)) - if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP | - IB_CQ_REPORT_MISSED_EVENTS)) { + if (netif_queue_stopped(dev)) { + rc = ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS); + if (unlikely(rc < 0)) ipoib_warn(priv, "IPoIB/CM:request notify on send CQ failed\n"); + else if (rc) napi_schedule(&priv->send_napi); - } + } rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), tx_req); if (unlikely(rc)) { @@ -876,7 +878,7 @@ int ipoib_cm_dev_open(struct net_device *dev) priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev); if (IS_ERR(priv->cm.id)) { - printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name); + pr_warn("%s: failed to create CM ID\n", priv->ca->name); ret = PTR_ERR(priv->cm.id); goto err_cm; } @@ -884,8 +886,8 @@ int ipoib_cm_dev_open(struct net_device *dev) ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num), 0); if (ret) { - printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name, - IPOIB_CM_IETF_ID | priv->qp->qp_num); + pr_warn("%s: failed to listen on ID 0x%llx\n", priv->ca->name, + IPOIB_CM_IETF_ID | priv->qp->qp_num); goto err_listen; } @@ -1562,7 +1564,7 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr); if (IS_ERR(priv->cm.srq)) { if (PTR_ERR(priv->cm.srq) != -ENOSYS) - printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n", + pr_warn("%s: failed to allocate SRQ, error %ld\n", priv->ca->name, PTR_ERR(priv->cm.srq)); priv->cm.srq = NULL; return; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index e6151a29c412..10384ea50bed 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -644,7 +644,7 @@ int ipoib_send(struct net_device *dev, struct sk_buff *skb, if (netif_queue_stopped(dev)) if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP | - IB_CQ_REPORT_MISSED_EVENTS)) + IB_CQ_REPORT_MISSED_EVENTS) < 0) ipoib_warn(priv, "request notify on send CQ failed\n"); rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), @@ -1085,8 +1085,7 @@ static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv) netif_addr_unlock_bh(priv->dev); - err = ib_find_gid(priv->ca, &search_gid, IB_GID_TYPE_IB, - priv->dev, &port, &index); + err = ib_find_gid(priv->ca, &search_gid, priv->dev, &port, &index); netif_addr_lock_bh(priv->dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 8880351df179..5930c7d9a8fb 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -768,13 +768,30 @@ static void path_rec_completion(int status, if (!status) { struct rdma_ah_attr av; - if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av)) + if (!ib_init_ah_attr_from_path(priv->ca, priv->port, + pathrec, &av)) ah = ipoib_create_ah(dev, priv->pd, &av); } spin_lock_irqsave(&priv->lock, flags); if (!IS_ERR_OR_NULL(ah)) { + /* + * pathrec.dgid is used as the database key from the LLADDR, + * it must remain unchanged even if the SA returns a different + * GID to use in the AH. + */ + if (memcmp(pathrec->dgid.raw, path->pathrec.dgid.raw, + sizeof(union ib_gid))) { + ipoib_dbg( + priv, + "%s got PathRec for gid %pI6 while asked for %pI6\n", + dev->name, pathrec->dgid.raw, + path->pathrec.dgid.raw); + memcpy(pathrec->dgid.raw, path->pathrec.dgid.raw, + sizeof(union ib_gid)); + } + path->pathrec = *pathrec; old_ah = path->ah; @@ -840,6 +857,23 @@ static void path_rec_completion(int status, } } +static void init_path_rec(struct ipoib_dev_priv *priv, struct ipoib_path *path, + void *gid) +{ + path->dev = priv->dev; + + if (rdma_cap_opa_ah(priv->ca, priv->port)) + path->pathrec.rec_type = SA_PATH_REC_TYPE_OPA; + else + path->pathrec.rec_type = SA_PATH_REC_TYPE_IB; + + memcpy(path->pathrec.dgid.raw, gid, sizeof(union ib_gid)); + path->pathrec.sgid = priv->local_gid; + path->pathrec.pkey = cpu_to_be16(priv->pkey); + path->pathrec.numb_path = 1; + path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; +} + static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) { struct ipoib_dev_priv *priv = ipoib_priv(dev); @@ -852,21 +886,11 @@ static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) if (!path) return NULL; - path->dev = dev; - skb_queue_head_init(&path->queue); INIT_LIST_HEAD(&path->neigh_list); - if (rdma_cap_opa_ah(priv->ca, priv->port)) - path->pathrec.rec_type = SA_PATH_REC_TYPE_OPA; - else - path->pathrec.rec_type = SA_PATH_REC_TYPE_IB; - memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid)); - path->pathrec.sgid = priv->local_gid; - path->pathrec.pkey = cpu_to_be16(priv->pkey); - path->pathrec.numb_path = 1; - path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; + init_path_rec(priv, path, gid); return path; } @@ -1005,6 +1029,10 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, spin_lock_irqsave(&priv->lock, flags); + /* no broadcast means that all paths are (going to be) not valid */ + if (!priv->broadcast) + goto drop_and_unlock; + path = __path_find(dev, phdr->hwaddr + 4); if (!path || !path->valid) { int new_path = 0; @@ -1014,6 +1042,10 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, new_path = 1; } if (path) { + if (!new_path) + /* make sure there is no changes in the existing path record */ + init_path_rec(priv, path, phdr->hwaddr + 4); + if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { push_pseudo_header(skb, phdr->hwaddr); __skb_queue_tail(&path->queue, skb); @@ -1030,8 +1062,7 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, } else __path_add(dev, path); } else { - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); + goto drop_and_unlock; } spin_unlock_irqrestore(&priv->lock, flags); @@ -1051,11 +1082,16 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, push_pseudo_header(skb, phdr->hwaddr); __skb_queue_tail(&path->queue, skb); } else { - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); + goto drop_and_unlock; } spin_unlock_irqrestore(&priv->lock, flags); + return; + +drop_and_unlock: + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + spin_unlock_irqrestore(&priv->lock, flags); } static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) @@ -1674,8 +1710,8 @@ static int ipoib_dev_init_default(struct net_device *dev) priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); if (!priv->tx_ring) { - printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", - priv->ca->name, ipoib_sendq_size); + pr_warn("%s: failed to allocate TX ring (%d entries)\n", + priv->ca->name, ipoib_sendq_size); goto out_rx_ring_cleanup; } @@ -2207,16 +2243,17 @@ static struct net_device *ipoib_add_port(const char *format, int result = -ENOMEM; priv = ipoib_intf_alloc(hca, port, format); - if (!priv) + if (!priv) { + pr_warn("%s, %d: ipoib_intf_alloc failed\n", hca->name, port); goto alloc_mem_failed; + } SET_NETDEV_DEV(priv->dev, hca->dev.parent); priv->dev->dev_id = port - 1; result = ib_query_port(hca, port, &attr); if (result) { - printk(KERN_WARNING "%s: ib_query_port %d failed\n", - hca->name, port); + pr_warn("%s: ib_query_port %d failed\n", hca->name, port); goto device_init_failed; } @@ -2231,8 +2268,8 @@ static struct net_device *ipoib_add_port(const char *format, result = ib_query_pkey(hca, port, 0, &priv->pkey); if (result) { - printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n", - hca->name, port, result); + pr_warn("%s: ib_query_pkey port %d failed (ret = %d)\n", + hca->name, port, result); goto device_init_failed; } @@ -2249,8 +2286,8 @@ static struct net_device *ipoib_add_port(const char *format, result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL); if (result) { - printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", - hca->name, port, result); + pr_warn("%s: ib_query_gid port %d failed (ret = %d)\n", + hca->name, port, result); goto device_init_failed; } @@ -2260,8 +2297,8 @@ static struct net_device *ipoib_add_port(const char *format, result = ipoib_dev_init(priv->dev, hca, port); if (result) { - printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n", - hca->name, port, result); + pr_warn("%s: failed to initialize port %d (ret = %d)\n", + hca->name, port, result); goto device_init_failed; } @@ -2271,8 +2308,8 @@ static struct net_device *ipoib_add_port(const char *format, result = register_netdev(priv->dev); if (result) { - printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n", - hca->name, port, result); + pr_warn("%s: couldn't register ipoib port %d; error %d\n", + hca->name, port, result); goto register_failed; } @@ -2337,8 +2374,7 @@ static void ipoib_add_one(struct ib_device *device) } if (!count) { - pr_err("Failed to init port, removing it\n"); - ipoib_remove_one(device, dev_list); + kfree(dev_list); return; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index a1ed25422b72..984a88096f39 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -178,7 +178,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_rx_completion, NULL, priv, &cq_attr); if (IS_ERR(priv->recv_cq)) { - printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name); + pr_warn("%s: failed to create receive CQ\n", ca->name); goto out_cm_dev_cleanup; } @@ -187,7 +187,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) priv->send_cq = ib_create_cq(priv->ca, ipoib_ib_tx_completion, NULL, priv, &cq_attr); if (IS_ERR(priv->send_cq)) { - printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name); + pr_warn("%s: failed to create send CQ\n", ca->name); goto out_free_recv_cq; } @@ -208,7 +208,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) priv->qp = ib_create_qp(priv->pd, &init_attr); if (IS_ERR(priv->qp)) { - printk(KERN_WARNING "%s: failed to create QP\n", ca->name); + pr_warn("%s: failed to create QP\n", ca->name); goto out_free_send_cq; } diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 2a07692007bd..df49c4eb67f7 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -142,8 +142,7 @@ iser_prepare_write_cmd(struct iscsi_task *task, hdr->write_va = cpu_to_be64(mem_reg->sge.addr + unsol_sz); } - iser_dbg("Cmd itt:%d, WRITE tags, RKEY:%#.4X " - "VA:%#llX + unsol:%d\n", + iser_dbg("Cmd itt:%d, WRITE tags, RKEY:%#.4X VA:%#llX + unsol:%d\n", task->itt, mem_reg->rkey, (unsigned long long)mem_reg->sge.addr, unsol_sz); } @@ -436,7 +435,7 @@ int iser_send_data_out(struct iscsi_conn *conn, { struct iser_conn *iser_conn = conn->dd_data; struct iscsi_iser_task *iser_task = task->dd_data; - struct iser_tx_desc *tx_desc = NULL; + struct iser_tx_desc *tx_desc; struct iser_mem_reg *mem_reg; unsigned long buf_offset; unsigned long data_seg_len; @@ -452,10 +451,8 @@ int iser_send_data_out(struct iscsi_conn *conn, __func__,(int)itt,(int)data_seg_len,(int)buf_offset); tx_desc = kmem_cache_zalloc(ig.desc_cache, GFP_ATOMIC); - if (tx_desc == NULL) { - iser_err("Failed to alloc desc for post dataout\n"); + if (!tx_desc) return -ENOMEM; - } tx_desc->type = ISCSI_TX_DATAOUT; tx_desc->cqe.done = iser_dataout_comp; @@ -475,8 +472,7 @@ int iser_send_data_out(struct iscsi_conn *conn, tx_desc->num_sge = 2; if (buf_offset + data_seg_len > iser_task->data[ISER_DIR_OUT].data_len) { - iser_err("Offset:%ld & DSL:%ld in Data-Out " - "inconsistent with total len:%ld, itt:%d\n", + iser_err("Offset:%ld & DSL:%ld in Data-Out inconsistent with total len:%ld, itt:%d\n", buf_offset, data_seg_len, iser_task->data[ISER_DIR_OUT].data_len, itt); err = -EINVAL; @@ -614,8 +610,8 @@ iser_check_remote_inv(struct iser_conn *iser_conn, iser_conn, rkey); if (unlikely(!iser_conn->snd_w_inv)) { - iser_err("conn %p: unexpected remote invalidation, " - "terminating connection\n", iser_conn); + iser_err("conn %p: unexpected remote invalidation, terminating connection\n", + iser_conn); return -EPROTO; } diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 1b02283ce20e..fff40b097947 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -2124,6 +2124,9 @@ isert_rdma_rw_ctx_post(struct isert_cmd *cmd, struct isert_conn *conn, u32 rkey, offset; int ret; + if (cmd->ctx_init_done) + goto rdma_ctx_post; + if (dir == DMA_FROM_DEVICE) { addr = cmd->write_va; rkey = cmd->write_stag; @@ -2151,11 +2154,15 @@ isert_rdma_rw_ctx_post(struct isert_cmd *cmd, struct isert_conn *conn, se_cmd->t_data_sg, se_cmd->t_data_nents, offset, addr, rkey, dir); } + if (ret < 0) { isert_err("Cmd: %p failed to prepare RDMA res\n", cmd); return ret; } + cmd->ctx_init_done = true; + +rdma_ctx_post: ret = rdma_rw_ctx_post(&cmd->rw, conn->qp, port_num, cqe, chain_wr); if (ret < 0) isert_err("Cmd: %p failed to post RDMA res\n", cmd); diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h index d6fd248320ae..3b296bac4f60 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.h +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -126,6 +126,7 @@ struct isert_cmd { struct rdma_rw_ctx rw; struct work_struct comp_work; struct scatterlist sg; + bool ctx_init_done; }; static inline struct isert_cmd *tx_desc_to_cmd(struct iser_tx_desc *desc) diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c index 4b615c1451e7..15711dcc6f58 100644 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c @@ -710,7 +710,7 @@ vema_get_port(struct opa_vnic_ctrl_port *cport, u8 port_num) /** * opa_vnic_vema_send_trap -- This function sends a trap to the EM - * @cport: pointer to vnic control port + * @adapter: pointer to vnic adapter * @data: pointer to trap data filled by calling function * @lid: issuers lid (encap_slid from vesw_port_info) * diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 972d4b3c5223..b48843833d69 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -41,6 +41,7 @@ #include <linux/random.h> #include <linux/jiffies.h> #include <linux/lockdep.h> +#include <linux/inet.h> #include <rdma/ib_cache.h> #include <linux/atomic.h> @@ -144,7 +145,9 @@ static void srp_remove_one(struct ib_device *device, void *client_data); static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc); static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc, const char *opname); -static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); +static int srp_ib_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); +static int srp_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); static struct scsi_transport_template *ib_srp_transport_template; static struct workqueue_struct *srp_remove_wq; @@ -265,8 +268,8 @@ static void srp_qp_event(struct ib_event *event, void *context) ib_event_msg(event->event), event->event); } -static int srp_init_qp(struct srp_target_port *target, - struct ib_qp *qp) +static int srp_init_ib_qp(struct srp_target_port *target, + struct ib_qp *qp) { struct ib_qp_attr *attr; int ret; @@ -277,7 +280,7 @@ static int srp_init_qp(struct srp_target_port *target, ret = ib_find_cached_pkey(target->srp_host->srp_dev->dev, target->srp_host->port, - be16_to_cpu(target->pkey), + be16_to_cpu(target->ib_cm.pkey), &attr->pkey_index); if (ret) goto out; @@ -298,32 +301,110 @@ out: return ret; } -static int srp_new_cm_id(struct srp_rdma_ch *ch) +static int srp_new_ib_cm_id(struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; struct ib_cm_id *new_cm_id; new_cm_id = ib_create_cm_id(target->srp_host->srp_dev->dev, - srp_cm_handler, ch); + srp_ib_cm_handler, ch); if (IS_ERR(new_cm_id)) return PTR_ERR(new_cm_id); - if (ch->cm_id) - ib_destroy_cm_id(ch->cm_id); - ch->cm_id = new_cm_id; + if (ch->ib_cm.cm_id) + ib_destroy_cm_id(ch->ib_cm.cm_id); + ch->ib_cm.cm_id = new_cm_id; if (rdma_cap_opa_ah(target->srp_host->srp_dev->dev, target->srp_host->port)) - ch->path.rec_type = SA_PATH_REC_TYPE_OPA; + ch->ib_cm.path.rec_type = SA_PATH_REC_TYPE_OPA; else - ch->path.rec_type = SA_PATH_REC_TYPE_IB; - ch->path.sgid = target->sgid; - ch->path.dgid = target->orig_dgid; - ch->path.pkey = target->pkey; - ch->path.service_id = target->service_id; + ch->ib_cm.path.rec_type = SA_PATH_REC_TYPE_IB; + ch->ib_cm.path.sgid = target->sgid; + ch->ib_cm.path.dgid = target->ib_cm.orig_dgid; + ch->ib_cm.path.pkey = target->ib_cm.pkey; + ch->ib_cm.path.service_id = target->ib_cm.service_id; return 0; } +static const char *inet_ntop(const void *sa, char *dst, unsigned int size) +{ + switch (((struct sockaddr *)sa)->sa_family) { + case AF_INET: + snprintf(dst, size, "%pI4", + &((struct sockaddr_in *)sa)->sin_addr); + break; + case AF_INET6: + snprintf(dst, size, "%pI6", + &((struct sockaddr_in6 *)sa)->sin6_addr); + break; + default: + snprintf(dst, size, "???"); + break; + } + return dst; +} + +static int srp_new_rdma_cm_id(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + struct rdma_cm_id *new_cm_id; + char src_addr[64], dst_addr[64]; + int ret; + + new_cm_id = rdma_create_id(target->net, srp_rdma_cm_handler, ch, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(new_cm_id)) { + ret = PTR_ERR(new_cm_id); + new_cm_id = NULL; + goto out; + } + + init_completion(&ch->done); + ret = rdma_resolve_addr(new_cm_id, target->rdma_cm.src_specified ? + (struct sockaddr *)&target->rdma_cm.src : NULL, + (struct sockaddr *)&target->rdma_cm.dst, + SRP_PATH_REC_TIMEOUT_MS); + if (ret) { + pr_err("No route available from %s to %s (%d)\n", + target->rdma_cm.src_specified ? + inet_ntop(&target->rdma_cm.src, src_addr, + sizeof(src_addr)) : "(any)", + inet_ntop(&target->rdma_cm.dst, dst_addr, + sizeof(dst_addr)), + ret); + goto out; + } + ret = wait_for_completion_interruptible(&ch->done); + if (ret < 0) + goto out; + + ret = ch->status; + if (ret) { + pr_err("Resolving address %s failed (%d)\n", + inet_ntop(&target->rdma_cm.dst, dst_addr, + sizeof(dst_addr)), + ret); + goto out; + } + + swap(ch->rdma_cm.cm_id, new_cm_id); + +out: + if (new_cm_id) + rdma_destroy_id(new_cm_id); + + return ret; +} + +static int srp_new_cm_id(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + + return target->using_rdma_cm ? srp_new_rdma_cm_id(ch) : + srp_new_ib_cm_id(ch); +} + static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target) { struct srp_device *dev = target->srp_host->srp_dev; @@ -521,16 +602,25 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) init_attr->send_cq = send_cq; init_attr->recv_cq = recv_cq; - qp = ib_create_qp(dev->pd, init_attr); - if (IS_ERR(qp)) { - ret = PTR_ERR(qp); + if (target->using_rdma_cm) { + ret = rdma_create_qp(ch->rdma_cm.cm_id, dev->pd, init_attr); + qp = ch->rdma_cm.cm_id->qp; + } else { + qp = ib_create_qp(dev->pd, init_attr); + if (!IS_ERR(qp)) { + ret = srp_init_ib_qp(target, qp); + if (ret) + ib_destroy_qp(qp); + } else { + ret = PTR_ERR(qp); + } + } + if (ret) { + pr_err("QP creation failed for dev %s: %d\n", + dev_name(&dev->dev->dev), ret); goto err_send_cq; } - ret = srp_init_qp(target, qp); - if (ret) - goto err_qp; - if (dev->use_fast_reg) { fr_pool = srp_alloc_fr_pool(target); if (IS_ERR(fr_pool)) { @@ -574,7 +664,10 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) return 0; err_qp: - ib_destroy_qp(qp); + if (target->using_rdma_cm) + rdma_destroy_qp(ch->rdma_cm.cm_id); + else + ib_destroy_qp(qp); err_send_cq: ib_free_cq(send_cq); @@ -600,9 +693,16 @@ static void srp_free_ch_ib(struct srp_target_port *target, if (!ch->target) return; - if (ch->cm_id) { - ib_destroy_cm_id(ch->cm_id); - ch->cm_id = NULL; + if (target->using_rdma_cm) { + if (ch->rdma_cm.cm_id) { + rdma_destroy_id(ch->rdma_cm.cm_id); + ch->rdma_cm.cm_id = NULL; + } + } else { + if (ch->ib_cm.cm_id) { + ib_destroy_cm_id(ch->ib_cm.cm_id); + ch->ib_cm.cm_id = NULL; + } } /* If srp_new_cm_id() succeeded but srp_create_ch_ib() not, return. */ @@ -658,16 +758,16 @@ static void srp_path_rec_completion(int status, shost_printk(KERN_ERR, target->scsi_host, PFX "Got failed path rec status %d\n", status); else - ch->path = *pathrec; + ch->ib_cm.path = *pathrec; complete(&ch->done); } -static int srp_lookup_path(struct srp_rdma_ch *ch) +static int srp_ib_lookup_path(struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; int ret = -ENODEV; - ch->path.numb_path = 1; + ch->ib_cm.path.numb_path = 1; init_completion(&ch->done); @@ -678,10 +778,10 @@ static int srp_lookup_path(struct srp_rdma_ch *ch) if (!scsi_host_get(target->scsi_host)) goto out; - ch->path_query_id = ib_sa_path_rec_get(&srp_sa_client, + ch->ib_cm.path_query_id = ib_sa_path_rec_get(&srp_sa_client, target->srp_host->srp_dev->dev, target->srp_host->port, - &ch->path, + &ch->ib_cm.path, IB_SA_PATH_REC_SERVICE_ID | IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | @@ -690,8 +790,8 @@ static int srp_lookup_path(struct srp_rdma_ch *ch) SRP_PATH_REC_TIMEOUT_MS, GFP_KERNEL, srp_path_rec_completion, - ch, &ch->path_query); - ret = ch->path_query_id; + ch, &ch->ib_cm.path_query); + ret = ch->ib_cm.path_query_id; if (ret < 0) goto put; @@ -702,7 +802,10 @@ static int srp_lookup_path(struct srp_rdma_ch *ch) ret = ch->status; if (ret < 0) shost_printk(KERN_WARNING, target->scsi_host, - PFX "Path record query failed\n"); + PFX "Path record query failed: sgid %pI6, dgid %pI6, pkey %#04x, service_id %#16llx\n", + ch->ib_cm.path.sgid.raw, ch->ib_cm.path.dgid.raw, + be16_to_cpu(target->ib_cm.pkey), + be64_to_cpu(target->ib_cm.service_id)); put: scsi_host_put(target->scsi_host); @@ -711,6 +814,34 @@ out: return ret; } +static int srp_rdma_lookup_path(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + int ret; + + init_completion(&ch->done); + + ret = rdma_resolve_route(ch->rdma_cm.cm_id, SRP_PATH_REC_TIMEOUT_MS); + if (ret) + return ret; + + wait_for_completion_interruptible(&ch->done); + + if (ch->status != 0) + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Path resolution failed\n"); + + return ch->status; +} + +static int srp_lookup_path(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + + return target->using_rdma_cm ? srp_rdma_lookup_path(ch) : + srp_ib_lookup_path(ch); +} + static u8 srp_get_subnet_timeout(struct srp_host *host) { struct ib_port_attr attr; @@ -732,48 +863,76 @@ static int srp_send_req(struct srp_rdma_ch *ch, bool multich) { struct srp_target_port *target = ch->target; struct { - struct ib_cm_req_param param; - struct srp_login_req priv; + struct rdma_conn_param rdma_param; + struct srp_login_req_rdma rdma_req; + struct ib_cm_req_param ib_param; + struct srp_login_req ib_req; } *req = NULL; + char *ipi, *tpi; int status; - u8 subnet_timeout; - - subnet_timeout = srp_get_subnet_timeout(target->srp_host); req = kzalloc(sizeof *req, GFP_KERNEL); if (!req) return -ENOMEM; - req->param.primary_path = &ch->path; - req->param.alternate_path = NULL; - req->param.service_id = target->service_id; - req->param.qp_num = ch->qp->qp_num; - req->param.qp_type = ch->qp->qp_type; - req->param.private_data = &req->priv; - req->param.private_data_len = sizeof req->priv; - req->param.flow_control = 1; - - get_random_bytes(&req->param.starting_psn, 4); - req->param.starting_psn &= 0xffffff; + req->ib_param.flow_control = 1; + req->ib_param.retry_count = target->tl_retry_count; /* * Pick some arbitrary defaults here; we could make these * module parameters if anyone cared about setting them. */ - req->param.responder_resources = 4; - req->param.remote_cm_response_timeout = subnet_timeout + 2; - req->param.local_cm_response_timeout = subnet_timeout + 2; - req->param.retry_count = target->tl_retry_count; - req->param.rnr_retry_count = 7; - req->param.max_cm_retries = 15; - - req->priv.opcode = SRP_LOGIN_REQ; - req->priv.tag = 0; - req->priv.req_it_iu_len = cpu_to_be32(target->max_iu_len); - req->priv.req_buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | + req->ib_param.responder_resources = 4; + req->ib_param.rnr_retry_count = 7; + req->ib_param.max_cm_retries = 15; + + req->ib_req.opcode = SRP_LOGIN_REQ; + req->ib_req.tag = 0; + req->ib_req.req_it_iu_len = cpu_to_be32(target->max_iu_len); + req->ib_req.req_buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | SRP_BUF_FORMAT_INDIRECT); - req->priv.req_flags = (multich ? SRP_MULTICHAN_MULTI : - SRP_MULTICHAN_SINGLE); + req->ib_req.req_flags = (multich ? SRP_MULTICHAN_MULTI : + SRP_MULTICHAN_SINGLE); + + if (target->using_rdma_cm) { + req->rdma_param.flow_control = req->ib_param.flow_control; + req->rdma_param.responder_resources = + req->ib_param.responder_resources; + req->rdma_param.initiator_depth = req->ib_param.initiator_depth; + req->rdma_param.retry_count = req->ib_param.retry_count; + req->rdma_param.rnr_retry_count = req->ib_param.rnr_retry_count; + req->rdma_param.private_data = &req->rdma_req; + req->rdma_param.private_data_len = sizeof(req->rdma_req); + + req->rdma_req.opcode = req->ib_req.opcode; + req->rdma_req.tag = req->ib_req.tag; + req->rdma_req.req_it_iu_len = req->ib_req.req_it_iu_len; + req->rdma_req.req_buf_fmt = req->ib_req.req_buf_fmt; + req->rdma_req.req_flags = req->ib_req.req_flags; + + ipi = req->rdma_req.initiator_port_id; + tpi = req->rdma_req.target_port_id; + } else { + u8 subnet_timeout; + + subnet_timeout = srp_get_subnet_timeout(target->srp_host); + + req->ib_param.primary_path = &ch->ib_cm.path; + req->ib_param.alternate_path = NULL; + req->ib_param.service_id = target->ib_cm.service_id; + get_random_bytes(&req->ib_param.starting_psn, 4); + req->ib_param.starting_psn &= 0xffffff; + req->ib_param.qp_num = ch->qp->qp_num; + req->ib_param.qp_type = ch->qp->qp_type; + req->ib_param.local_cm_response_timeout = subnet_timeout + 2; + req->ib_param.remote_cm_response_timeout = subnet_timeout + 2; + req->ib_param.private_data = &req->ib_req; + req->ib_param.private_data_len = sizeof(req->ib_req); + + ipi = req->ib_req.initiator_port_id; + tpi = req->ib_req.target_port_id; + } + /* * In the published SRP specification (draft rev. 16a), the * port identifier format is 8 bytes of ID extension followed @@ -784,19 +943,15 @@ static int srp_send_req(struct srp_rdma_ch *ch, bool multich) * recognized by the I/O Class they report. */ if (target->io_class == SRP_REV10_IB_IO_CLASS) { - memcpy(req->priv.initiator_port_id, - &target->sgid.global.interface_id, 8); - memcpy(req->priv.initiator_port_id + 8, - &target->initiator_ext, 8); - memcpy(req->priv.target_port_id, &target->ioc_guid, 8); - memcpy(req->priv.target_port_id + 8, &target->id_ext, 8); + memcpy(ipi, &target->sgid.global.interface_id, 8); + memcpy(ipi + 8, &target->initiator_ext, 8); + memcpy(tpi, &target->ioc_guid, 8); + memcpy(tpi + 8, &target->id_ext, 8); } else { - memcpy(req->priv.initiator_port_id, - &target->initiator_ext, 8); - memcpy(req->priv.initiator_port_id + 8, - &target->sgid.global.interface_id, 8); - memcpy(req->priv.target_port_id, &target->id_ext, 8); - memcpy(req->priv.target_port_id + 8, &target->ioc_guid, 8); + memcpy(ipi, &target->initiator_ext, 8); + memcpy(ipi + 8, &target->sgid.global.interface_id, 8); + memcpy(tpi, &target->id_ext, 8); + memcpy(tpi + 8, &target->ioc_guid, 8); } /* @@ -809,12 +964,14 @@ static int srp_send_req(struct srp_rdma_ch *ch, bool multich) PFX "Topspin/Cisco initiator port ID workaround " "activated for target GUID %016llx\n", be64_to_cpu(target->ioc_guid)); - memset(req->priv.initiator_port_id, 0, 8); - memcpy(req->priv.initiator_port_id + 8, - &target->srp_host->srp_dev->dev->node_guid, 8); + memset(ipi, 0, 8); + memcpy(ipi + 8, &target->srp_host->srp_dev->dev->node_guid, 8); } - status = ib_send_cm_req(ch->cm_id, &req->param); + if (target->using_rdma_cm) + status = rdma_connect(ch->rdma_cm.cm_id, &req->rdma_param); + else + status = ib_send_cm_req(ch->ib_cm.cm_id, &req->ib_param); kfree(req); @@ -841,14 +998,23 @@ static bool srp_queue_remove_work(struct srp_target_port *target) static void srp_disconnect_target(struct srp_target_port *target) { struct srp_rdma_ch *ch; - int i; + int i, ret; /* XXX should send SRP_I_LOGOUT request */ for (i = 0; i < target->ch_count; i++) { ch = &target->ch[i]; ch->connected = false; - if (ch->cm_id && ib_send_cm_dreq(ch->cm_id, NULL, 0)) { + ret = 0; + if (target->using_rdma_cm) { + if (ch->rdma_cm.cm_id) + rdma_disconnect(ch->rdma_cm.cm_id); + } else { + if (ch->ib_cm.cm_id) + ret = ib_send_cm_dreq(ch->ib_cm.cm_id, + NULL, 0); + } + if (ret < 0) { shost_printk(KERN_DEBUG, target->scsi_host, PFX "Sending CM DREQ failed\n"); } @@ -962,6 +1128,7 @@ static void srp_remove_target(struct srp_target_port *target) scsi_remove_host(target->scsi_host); srp_stop_rport_timers(target->rport); srp_disconnect_target(target); + kobj_ns_drop(KOBJ_NS_TYPE_NET, target->net); for (i = 0; i < target->ch_count; i++) { ch = &target->ch[i]; srp_free_ch_ib(target, ch); @@ -2349,7 +2516,7 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id, struct srp_target_port *target = ch->target; struct ib_qp_attr *qp_attr = NULL; int attr_mask = 0; - int ret; + int ret = 0; int i; if (lrsp->opcode == SRP_LOGIN_RSP) { @@ -2379,40 +2546,42 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id, goto error; } - ret = -ENOMEM; - qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); - if (!qp_attr) - goto error; - - qp_attr->qp_state = IB_QPS_RTR; - ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); - if (ret) - goto error_free; - - ret = ib_modify_qp(ch->qp, qp_attr, attr_mask); - if (ret) - goto error_free; - for (i = 0; i < target->queue_size; i++) { struct srp_iu *iu = ch->rx_ring[i]; ret = srp_post_recv(ch, iu); if (ret) - goto error_free; + goto error; } - qp_attr->qp_state = IB_QPS_RTS; - ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); - if (ret) - goto error_free; + if (!target->using_rdma_cm) { + ret = -ENOMEM; + qp_attr = kmalloc(sizeof(*qp_attr), GFP_KERNEL); + if (!qp_attr) + goto error; - target->rq_tmo_jiffies = srp_compute_rq_tmo(qp_attr, attr_mask); + qp_attr->qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); + if (ret) + goto error_free; - ret = ib_modify_qp(ch->qp, qp_attr, attr_mask); - if (ret) - goto error_free; + ret = ib_modify_qp(ch->qp, qp_attr, attr_mask); + if (ret) + goto error_free; - ret = ib_send_cm_rtu(cm_id, NULL, 0); + qp_attr->qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); + if (ret) + goto error_free; + + target->rq_tmo_jiffies = srp_compute_rq_tmo(qp_attr, attr_mask); + + ret = ib_modify_qp(ch->qp, qp_attr, attr_mask); + if (ret) + goto error_free; + + ret = ib_send_cm_rtu(cm_id, NULL, 0); + } error_free: kfree(qp_attr); @@ -2421,41 +2590,43 @@ error: ch->status = ret; } -static void srp_cm_rej_handler(struct ib_cm_id *cm_id, - struct ib_cm_event *event, - struct srp_rdma_ch *ch) +static void srp_ib_cm_rej_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event, + struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; struct Scsi_Host *shost = target->scsi_host; struct ib_class_port_info *cpi; int opcode; + u16 dlid; switch (event->param.rej_rcvd.reason) { case IB_CM_REJ_PORT_CM_REDIRECT: cpi = event->param.rej_rcvd.ari; - sa_path_set_dlid(&ch->path, ntohs(cpi->redirect_lid)); - ch->path.pkey = cpi->redirect_pkey; + dlid = be16_to_cpu(cpi->redirect_lid); + sa_path_set_dlid(&ch->ib_cm.path, dlid); + ch->ib_cm.path.pkey = cpi->redirect_pkey; cm_id->remote_cm_qpn = be32_to_cpu(cpi->redirect_qp) & 0x00ffffff; - memcpy(ch->path.dgid.raw, cpi->redirect_gid, 16); + memcpy(ch->ib_cm.path.dgid.raw, cpi->redirect_gid, 16); - ch->status = sa_path_get_dlid(&ch->path) ? - SRP_DLID_REDIRECT : SRP_PORT_REDIRECT; + ch->status = dlid ? SRP_DLID_REDIRECT : SRP_PORT_REDIRECT; break; case IB_CM_REJ_PORT_REDIRECT: if (srp_target_is_topspin(target)) { + union ib_gid *dgid = &ch->ib_cm.path.dgid; + /* * Topspin/Cisco SRP gateways incorrectly send * reject reason code 25 when they mean 24 * (port redirect). */ - memcpy(ch->path.dgid.raw, - event->param.rej_rcvd.ari, 16); + memcpy(dgid->raw, event->param.rej_rcvd.ari, 16); shost_printk(KERN_DEBUG, shost, PFX "Topspin/Cisco redirect to target port GID %016llx%016llx\n", - be64_to_cpu(ch->path.dgid.global.subnet_prefix), - be64_to_cpu(ch->path.dgid.global.interface_id)); + be64_to_cpu(dgid->global.subnet_prefix), + be64_to_cpu(dgid->global.interface_id)); ch->status = SRP_PORT_REDIRECT; } else { @@ -2484,7 +2655,8 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id, shost_printk(KERN_WARNING, shost, PFX "SRP LOGIN from %pI6 to %pI6 REJECTED, reason 0x%08x\n", target->sgid.raw, - target->orig_dgid.raw, reason); + target->ib_cm.orig_dgid.raw, + reason); } else shost_printk(KERN_WARNING, shost, " REJ reason: IB_CM_REJ_CONSUMER_DEFINED," @@ -2504,7 +2676,7 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id, } } -static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +static int srp_ib_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { struct srp_rdma_ch *ch = cm_id->context; struct srp_target_port *target = ch->target; @@ -2527,7 +2699,7 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n"); comp = 1; - srp_cm_rej_handler(cm_id, event, ch); + srp_ib_cm_rej_handler(cm_id, event, ch); break; case IB_CM_DREQ_RECEIVED: @@ -2565,6 +2737,135 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) return 0; } +static void srp_rdma_cm_rej_handler(struct srp_rdma_ch *ch, + struct rdma_cm_event *event) +{ + struct srp_target_port *target = ch->target; + struct Scsi_Host *shost = target->scsi_host; + int opcode; + + switch (event->status) { + case IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID: + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n"); + ch->status = -ECONNRESET; + break; + + case IB_CM_REJ_CONSUMER_DEFINED: + opcode = *(u8 *) event->param.conn.private_data; + if (opcode == SRP_LOGIN_REJ) { + struct srp_login_rej *rej = + (struct srp_login_rej *) + event->param.conn.private_data; + u32 reason = be32_to_cpu(rej->reason); + + if (reason == SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE) + shost_printk(KERN_WARNING, shost, + PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n"); + else + shost_printk(KERN_WARNING, shost, + PFX "SRP LOGIN REJECTED, reason 0x%08x\n", reason); + } else { + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_CONSUMER_DEFINED, opcode 0x%02x\n", + opcode); + } + ch->status = -ECONNRESET; + break; + + case IB_CM_REJ_STALE_CONN: + shost_printk(KERN_WARNING, shost, + " REJ reason: stale connection\n"); + ch->status = SRP_STALE_CONN; + break; + + default: + shost_printk(KERN_WARNING, shost, " REJ reason 0x%x\n", + event->status); + ch->status = -ECONNRESET; + break; + } +} + +static int srp_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct srp_rdma_ch *ch = cm_id->context; + struct srp_target_port *target = ch->target; + int comp = 0; + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + ch->status = 0; + comp = 1; + break; + + case RDMA_CM_EVENT_ADDR_ERROR: + ch->status = -ENXIO; + comp = 1; + break; + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + ch->status = 0; + comp = 1; + break; + + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + ch->status = -EHOSTUNREACH; + comp = 1; + break; + + case RDMA_CM_EVENT_CONNECT_ERROR: + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Sending CM REQ failed\n"); + comp = 1; + ch->status = -ECONNRESET; + break; + + case RDMA_CM_EVENT_ESTABLISHED: + comp = 1; + srp_cm_rep_handler(NULL, event->param.conn.private_data, ch); + break; + + case RDMA_CM_EVENT_REJECTED: + shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n"); + comp = 1; + + srp_rdma_cm_rej_handler(ch, event); + break; + + case RDMA_CM_EVENT_DISCONNECTED: + if (ch->connected) { + shost_printk(KERN_WARNING, target->scsi_host, + PFX "received DREQ\n"); + rdma_disconnect(ch->rdma_cm.cm_id); + comp = 1; + ch->status = 0; + queue_work(system_long_wq, &target->tl_err_work); + } + break; + + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + shost_printk(KERN_ERR, target->scsi_host, + PFX "connection closed\n"); + + comp = 1; + ch->status = 0; + break; + + default: + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled CM event %d\n", event->event); + break; + } + + if (comp) + complete(&ch->done); + + return 0; +} + /** * srp_change_queue_depth - setting device queue depth * @sdev: scsi device struct @@ -2717,6 +3018,16 @@ static int srp_reset_host(struct scsi_cmnd *scmnd) return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED; } +static int srp_target_alloc(struct scsi_target *starget) +{ + struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); + struct srp_target_port *target = host_to_target(shost); + + if (target->target_can_queue) + starget->can_queue = target->target_can_queue; + return 0; +} + static int srp_slave_alloc(struct scsi_device *sdev) { struct Scsi_Host *shost = sdev->host; @@ -2766,7 +3077,10 @@ static ssize_t show_service_id(struct device *dev, { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - return sprintf(buf, "0x%016llx\n", be64_to_cpu(target->service_id)); + if (target->using_rdma_cm) + return -ENOENT; + return sprintf(buf, "0x%016llx\n", + be64_to_cpu(target->ib_cm.service_id)); } static ssize_t show_pkey(struct device *dev, struct device_attribute *attr, @@ -2774,7 +3088,9 @@ static ssize_t show_pkey(struct device *dev, struct device_attribute *attr, { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - return sprintf(buf, "0x%04x\n", be16_to_cpu(target->pkey)); + if (target->using_rdma_cm) + return -ENOENT; + return sprintf(buf, "0x%04x\n", be16_to_cpu(target->ib_cm.pkey)); } static ssize_t show_sgid(struct device *dev, struct device_attribute *attr, @@ -2791,7 +3107,9 @@ static ssize_t show_dgid(struct device *dev, struct device_attribute *attr, struct srp_target_port *target = host_to_target(class_to_shost(dev)); struct srp_rdma_ch *ch = &target->ch[0]; - return sprintf(buf, "%pI6\n", ch->path.dgid.raw); + if (target->using_rdma_cm) + return -ENOENT; + return sprintf(buf, "%pI6\n", ch->ib_cm.path.dgid.raw); } static ssize_t show_orig_dgid(struct device *dev, @@ -2799,7 +3117,9 @@ static ssize_t show_orig_dgid(struct device *dev, { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - return sprintf(buf, "%pI6\n", target->orig_dgid.raw); + if (target->using_rdma_cm) + return -ENOENT; + return sprintf(buf, "%pI6\n", target->ib_cm.orig_dgid.raw); } static ssize_t show_req_lim(struct device *dev, @@ -2921,6 +3241,7 @@ static struct scsi_host_template srp_template = { .module = THIS_MODULE, .name = "InfiniBand SRP initiator", .proc_name = DRV_NAME, + .target_alloc = srp_target_alloc, .slave_alloc = srp_slave_alloc, .slave_configure = srp_slave_configure, .info = srp_target_info, @@ -3044,6 +3365,9 @@ static bool srp_conn_unique(struct srp_host *host, if (t != target && target->id_ext == t->id_ext && target->ioc_guid == t->ioc_guid && + (!target->using_rdma_cm || + memcmp(&target->rdma_cm.dst, &t->rdma_cm.dst, + sizeof(target->rdma_cm.dst)) == 0) && target->initiator_ext == t->initiator_ext) { ret = false; break; @@ -3060,6 +3384,9 @@ out: * * id_ext=<SRP ID ext>,ioc_guid=<SRP IOC GUID>,dgid=<dest GID>, * pkey=<P_Key>,service_id=<service ID> + * or + * id_ext=<SRP ID ext>,ioc_guid=<SRP IOC GUID>, + * [src=<IPv4 address>,]dest=<IPv4 address>:<port number> * * to the add_target sysfs attribute. */ @@ -3080,11 +3407,20 @@ enum { SRP_OPT_COMP_VECTOR = 1 << 12, SRP_OPT_TL_RETRY_COUNT = 1 << 13, SRP_OPT_QUEUE_SIZE = 1 << 14, - SRP_OPT_ALL = (SRP_OPT_ID_EXT | - SRP_OPT_IOC_GUID | - SRP_OPT_DGID | - SRP_OPT_PKEY | - SRP_OPT_SERVICE_ID), + SRP_OPT_IP_SRC = 1 << 15, + SRP_OPT_IP_DEST = 1 << 16, + SRP_OPT_TARGET_CAN_QUEUE= 1 << 17, +}; + +static unsigned int srp_opt_mandatory[] = { + SRP_OPT_ID_EXT | + SRP_OPT_IOC_GUID | + SRP_OPT_DGID | + SRP_OPT_PKEY | + SRP_OPT_SERVICE_ID, + SRP_OPT_ID_EXT | + SRP_OPT_IOC_GUID | + SRP_OPT_IP_DEST, }; static const match_table_t srp_opt_tokens = { @@ -3095,6 +3431,7 @@ static const match_table_t srp_opt_tokens = { { SRP_OPT_SERVICE_ID, "service_id=%s" }, { SRP_OPT_MAX_SECT, "max_sect=%d" }, { SRP_OPT_MAX_CMD_PER_LUN, "max_cmd_per_lun=%d" }, + { SRP_OPT_TARGET_CAN_QUEUE, "target_can_queue=%d" }, { SRP_OPT_IO_CLASS, "io_class=%x" }, { SRP_OPT_INITIATOR_EXT, "initiator_ext=%s" }, { SRP_OPT_CMD_SG_ENTRIES, "cmd_sg_entries=%u" }, @@ -3103,15 +3440,33 @@ static const match_table_t srp_opt_tokens = { { SRP_OPT_COMP_VECTOR, "comp_vector=%u" }, { SRP_OPT_TL_RETRY_COUNT, "tl_retry_count=%u" }, { SRP_OPT_QUEUE_SIZE, "queue_size=%d" }, + { SRP_OPT_IP_SRC, "src=%s" }, + { SRP_OPT_IP_DEST, "dest=%s" }, { SRP_OPT_ERR, NULL } }; -static int srp_parse_options(const char *buf, struct srp_target_port *target) +static int srp_parse_in(struct net *net, struct sockaddr_storage *sa, + const char *addr_port_str) +{ + char *addr = kstrdup(addr_port_str, GFP_KERNEL); + char *port_str = addr; + int ret; + + if (!addr) + return -ENOMEM; + strsep(&port_str, ":"); + ret = inet_pton_with_scope(net, AF_UNSPEC, addr, port_str, sa); + kfree(addr); + return ret; +} + +static int srp_parse_options(struct net *net, const char *buf, + struct srp_target_port *target) { char *options, *sep_opt; char *p; - char dgid[3]; substring_t args[MAX_OPT_ARGS]; + unsigned long long ull; int opt_mask = 0; int token; int ret = -EINVAL; @@ -3136,7 +3491,13 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) ret = -ENOMEM; goto out; } - target->id_ext = cpu_to_be64(simple_strtoull(p, NULL, 16)); + ret = kstrtoull(p, 16, &ull); + if (ret) { + pr_warn("invalid id_ext parameter '%s'\n", p); + kfree(p); + goto out; + } + target->id_ext = cpu_to_be64(ull); kfree(p); break; @@ -3146,7 +3507,13 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) ret = -ENOMEM; goto out; } - target->ioc_guid = cpu_to_be64(simple_strtoull(p, NULL, 16)); + ret = kstrtoull(p, 16, &ull); + if (ret) { + pr_warn("invalid ioc_guid parameter '%s'\n", p); + kfree(p); + goto out; + } + target->ioc_guid = cpu_to_be64(ull); kfree(p); break; @@ -3162,16 +3529,10 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) goto out; } - for (i = 0; i < 16; ++i) { - strlcpy(dgid, p + i * 2, sizeof(dgid)); - if (sscanf(dgid, "%hhx", - &target->orig_dgid.raw[i]) < 1) { - ret = -EINVAL; - kfree(p); - goto out; - } - } + ret = hex2bin(target->ib_cm.orig_dgid.raw, p, 16); kfree(p); + if (ret < 0) + goto out; break; case SRP_OPT_PKEY: @@ -3179,7 +3540,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) pr_warn("bad P_Key parameter '%s'\n", p); goto out; } - target->pkey = cpu_to_be16(token); + target->ib_cm.pkey = cpu_to_be16(token); break; case SRP_OPT_SERVICE_ID: @@ -3188,7 +3549,45 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) ret = -ENOMEM; goto out; } - target->service_id = cpu_to_be64(simple_strtoull(p, NULL, 16)); + ret = kstrtoull(p, 16, &ull); + if (ret) { + pr_warn("bad service_id parameter '%s'\n", p); + kfree(p); + goto out; + } + target->ib_cm.service_id = cpu_to_be64(ull); + kfree(p); + break; + + case SRP_OPT_IP_SRC: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + ret = srp_parse_in(net, &target->rdma_cm.src.ss, p); + if (ret < 0) { + pr_warn("bad source parameter '%s'\n", p); + kfree(p); + goto out; + } + target->rdma_cm.src_specified = true; + kfree(p); + break; + + case SRP_OPT_IP_DEST: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + ret = srp_parse_in(net, &target->rdma_cm.dst.ss, p); + if (ret < 0) { + pr_warn("bad dest parameter '%s'\n", p); + kfree(p); + goto out; + } + target->using_rdma_cm = true; kfree(p); break; @@ -3221,6 +3620,15 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) target->scsi_host->cmd_per_lun = token; break; + case SRP_OPT_TARGET_CAN_QUEUE: + if (match_int(args, &token) || token < 1) { + pr_warn("bad max target_can_queue parameter '%s'\n", + p); + goto out; + } + target->target_can_queue = token; + break; + case SRP_OPT_IO_CLASS: if (match_hex(args, &token)) { pr_warn("bad IO class parameter '%s'\n", p); @@ -3242,7 +3650,13 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) ret = -ENOMEM; goto out; } - target->initiator_ext = cpu_to_be64(simple_strtoull(p, NULL, 16)); + ret = kstrtoull(p, 16, &ull); + if (ret) { + pr_warn("bad initiator_ext value '%s'\n", p); + kfree(p); + goto out; + } + target->initiator_ext = cpu_to_be64(ull); kfree(p); break; @@ -3297,14 +3711,14 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) } } - if ((opt_mask & SRP_OPT_ALL) == SRP_OPT_ALL) - ret = 0; - else - for (i = 0; i < ARRAY_SIZE(srp_opt_tokens); ++i) - if ((srp_opt_tokens[i].token & SRP_OPT_ALL) && - !(srp_opt_tokens[i].token & opt_mask)) - pr_warn("target creation request is missing parameter '%s'\n", - srp_opt_tokens[i].pattern); + for (i = 0; i < ARRAY_SIZE(srp_opt_mandatory); i++) { + if ((opt_mask & srp_opt_mandatory[i]) == srp_opt_mandatory[i]) { + ret = 0; + break; + } + } + if (ret) + pr_warn("target creation request is missing one or more parameters\n"); if (target->scsi_host->cmd_per_lun > target->scsi_host->can_queue && (opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) @@ -3345,6 +3759,7 @@ static ssize_t srp_create_target(struct device *dev, target = host_to_target(target_host); + target->net = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); target->io_class = SRP_REV16A_IB_IO_CLASS; target->scsi_host = target_host; target->srp_host = host; @@ -3366,18 +3781,29 @@ static ssize_t srp_create_target(struct device *dev, if (ret < 0) goto put; - ret = srp_parse_options(buf, target); + ret = srp_parse_options(target->net, buf, target); if (ret) goto out; target->req_ring_size = target->queue_size - SRP_TSK_MGMT_SQ_SIZE; if (!srp_conn_unique(target->srp_host, target)) { - shost_printk(KERN_INFO, target->scsi_host, - PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n", - be64_to_cpu(target->id_ext), - be64_to_cpu(target->ioc_guid), - be64_to_cpu(target->initiator_ext)); + if (target->using_rdma_cm) { + char dst_addr[64]; + + shost_printk(KERN_INFO, target->scsi_host, + PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;dest=%s\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + inet_ntop(&target->rdma_cm.dst, dst_addr, + sizeof(dst_addr))); + } else { + shost_printk(KERN_INFO, target->scsi_host, + PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + be64_to_cpu(target->initiator_ext)); + } ret = -EEXIST; goto out; } @@ -3478,11 +3904,18 @@ static ssize_t srp_create_target(struct device *dev, ret = srp_connect_ch(ch, multich); if (ret) { + char dst[64]; + + if (target->using_rdma_cm) + inet_ntop(&target->rdma_cm.dst, dst, + sizeof(dst)); + else + snprintf(dst, sizeof(dst), "%pI6", + target->ib_cm.orig_dgid.raw); shost_printk(KERN_ERR, target->scsi_host, - PFX "Connection %d/%d to %pI6 failed\n", + PFX "Connection %d/%d to %s failed\n", ch_start + cpu_idx, - target->ch_count, - ch->target->orig_dgid.raw); + target->ch_count, dst); if (node_idx == 0 && cpu_idx == 0) { goto free_ch; } else { @@ -3507,13 +3940,25 @@ connected: goto err_disconnect; if (target->state != SRP_TARGET_REMOVED) { - shost_printk(KERN_DEBUG, target->scsi_host, PFX - "new target: id_ext %016llx ioc_guid %016llx pkey %04x service_id %016llx sgid %pI6 dgid %pI6\n", - be64_to_cpu(target->id_ext), - be64_to_cpu(target->ioc_guid), - be16_to_cpu(target->pkey), - be64_to_cpu(target->service_id), - target->sgid.raw, target->orig_dgid.raw); + if (target->using_rdma_cm) { + char dst[64]; + + inet_ntop(&target->rdma_cm.dst, dst, sizeof(dst)); + shost_printk(KERN_DEBUG, target->scsi_host, PFX + "new target: id_ext %016llx ioc_guid %016llx sgid %pI6 dest %s\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + target->sgid.raw, dst); + } else { + shost_printk(KERN_DEBUG, target->scsi_host, PFX + "new target: id_ext %016llx ioc_guid %016llx pkey %04x service_id %016llx sgid %pI6 dgid %pI6\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + be16_to_cpu(target->ib_cm.pkey), + be64_to_cpu(target->ib_cm.service_id), + target->sgid.raw, + target->ib_cm.orig_dgid.raw); + } } ret = count; @@ -3523,8 +3968,16 @@ out: put: scsi_host_put(target->scsi_host); - if (ret < 0) + if (ret < 0) { + /* + * If a call to srp_remove_target() has not been scheduled, + * drop the network namespace reference now that was obtained + * earlier in this function. + */ + if (target->state != SRP_TARGET_REMOVED) + kobj_ns_drop(KOBJ_NS_TYPE_NET, target->net); scsi_host_put(target->scsi_host); + } return ret; diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index a814f5ef16f9..a2706086b9c7 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -45,6 +45,7 @@ #include <rdma/ib_sa.h> #include <rdma/ib_cm.h> #include <rdma/ib_fmr_pool.h> +#include <rdma/rdma_cm.h> enum { SRP_PATH_REC_TIMEOUT_MS = 1000, @@ -153,11 +154,18 @@ struct srp_rdma_ch { struct completion done; int status; - struct sa_path_rec path; - struct ib_sa_query *path_query; - int path_query_id; + union { + struct ib_cm { + struct sa_path_rec path; + struct ib_sa_query *path_query; + int path_query_id; + struct ib_cm_id *cm_id; + } ib_cm; + struct rdma_cm { + struct rdma_cm_id *cm_id; + } rdma_cm; + }; - struct ib_cm_id *cm_id; struct srp_iu **tx_ring; struct srp_iu **rx_ring; struct srp_request *req_ring; @@ -182,6 +190,7 @@ struct srp_target_port { /* read only in the hot path */ u32 global_rkey; struct srp_rdma_ch *ch; + struct net *net; u32 ch_count; u32 lkey; enum srp_target_state state; @@ -194,7 +203,6 @@ struct srp_target_port { union ib_gid sgid; __be64 id_ext; __be64 ioc_guid; - __be64 service_id; __be64 initiator_ext; u16 io_class; struct srp_host *srp_host; @@ -203,6 +211,7 @@ struct srp_target_port { char target_name[32]; unsigned int scsi_id; unsigned int sg_tablesize; + unsigned int target_can_queue; int mr_pool_size; int mr_per_cmd; int queue_size; @@ -210,8 +219,28 @@ struct srp_target_port { int comp_vector; int tl_retry_count; - union ib_gid orig_dgid; - __be16 pkey; + bool using_rdma_cm; + + union { + struct { + __be64 service_id; + union ib_gid orig_dgid; + __be16 pkey; + } ib_cm; + struct { + union { + struct sockaddr_in ip4; + struct sockaddr_in6 ip6; + struct sockaddr_storage ss; + } src; + union { + struct sockaddr_in ip4; + struct sockaddr_in6 ip6; + struct sockaddr_storage ss; + } dst; + bool src_specified; + } rdma_cm; + }; u32 rq_tmo_jiffies; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index bfa576aa9f03..0373b7c40902 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -41,6 +41,7 @@ #include <linux/string.h> #include <linux/delay.h> #include <linux/atomic.h> +#include <rdma/ib_cache.h> #include <scsi/scsi_proto.h> #include <scsi/scsi_tcq.h> #include <target/target_core_base.h> @@ -120,7 +121,9 @@ static bool srpt_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state new) } /** - * srpt_event_handler() - Asynchronous IB event callback function. + * srpt_event_handler - asynchronous IB event callback function + * @handler: IB event handler registered by ib_register_event_handler(). + * @event: Description of the event that occurred. * * Callback function called by the InfiniBand core when an asynchronous IB * event occurs. This callback may occur in interrupt context. See also @@ -132,6 +135,7 @@ static void srpt_event_handler(struct ib_event_handler *handler, { struct srpt_device *sdev; struct srpt_port *sport; + u8 port_num; sdev = ib_get_client_data(event->device, &srpt_client); if (!sdev || sdev->device != event->device) @@ -142,10 +146,15 @@ static void srpt_event_handler(struct ib_event_handler *handler, switch (event->event) { case IB_EVENT_PORT_ERR: - if (event->element.port_num <= sdev->device->phys_port_cnt) { - sport = &sdev->port[event->element.port_num - 1]; + port_num = event->element.port_num - 1; + if (port_num < sdev->device->phys_port_cnt) { + sport = &sdev->port[port_num]; sport->lid = 0; sport->sm_lid = 0; + } else { + WARN(true, "event %d: port_num %d out of range 1..%d\n", + event->event, port_num + 1, + sdev->device->phys_port_cnt); } break; case IB_EVENT_PORT_ACTIVE: @@ -155,25 +164,31 @@ static void srpt_event_handler(struct ib_event_handler *handler, case IB_EVENT_CLIENT_REREGISTER: case IB_EVENT_GID_CHANGE: /* Refresh port data asynchronously. */ - if (event->element.port_num <= sdev->device->phys_port_cnt) { - sport = &sdev->port[event->element.port_num - 1]; + port_num = event->element.port_num - 1; + if (port_num < sdev->device->phys_port_cnt) { + sport = &sdev->port[port_num]; if (!sport->lid && !sport->sm_lid) schedule_work(&sport->work); + } else { + WARN(true, "event %d: port_num %d out of range 1..%d\n", + event->event, port_num + 1, + sdev->device->phys_port_cnt); } break; default: - pr_err("received unrecognized IB event %d\n", - event->event); + pr_err("received unrecognized IB event %d\n", event->event); break; } } /** - * srpt_srq_event() - SRQ event callback function. + * srpt_srq_event - SRQ event callback function + * @event: Description of the event that occurred. + * @ctx: Context pointer specified at SRQ creation time. */ static void srpt_srq_event(struct ib_event *event, void *ctx) { - pr_info("SRQ event %d\n", event->event); + pr_debug("SRQ event %d\n", event->event); } static const char *get_ch_state_name(enum rdma_ch_state s) @@ -194,16 +209,18 @@ static const char *get_ch_state_name(enum rdma_ch_state s) } /** - * srpt_qp_event() - QP event callback function. + * srpt_qp_event - QP event callback function + * @event: Description of the event that occurred. + * @ch: SRPT RDMA channel. */ static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch) { - pr_debug("QP event %d on cm_id=%p sess_name=%s state=%d\n", - event->event, ch->cm_id, ch->sess_name, ch->state); + pr_debug("QP event %d on ch=%p sess_name=%s state=%d\n", + event->event, ch, ch->sess_name, ch->state); switch (event->event) { case IB_EVENT_COMM_EST: - ib_cm_notify(ch->cm_id, event->event); + ib_cm_notify(ch->ib_cm.cm_id, event->event); break; case IB_EVENT_QP_LAST_WQE_REACHED: pr_debug("%s-%d, state %s: received Last WQE event.\n", @@ -217,8 +234,8 @@ static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch) } /** - * srpt_set_ioc() - Helper function for initializing an IOUnitInfo structure. - * + * srpt_set_ioc - initialize a IOUnitInfo structure + * @c_list: controller list. * @slot: one-based slot number. * @value: four-bit value. * @@ -241,7 +258,8 @@ static void srpt_set_ioc(u8 *c_list, u32 slot, u8 value) } /** - * srpt_get_class_port_info() - Copy ClassPortInfo to a management datagram. + * srpt_get_class_port_info - copy ClassPortInfo to a management datagram + * @mad: Datagram that will be sent as response to DM_ATTR_CLASS_PORT_INFO. * * See also section 16.3.3.1 ClassPortInfo in the InfiniBand Architecture * Specification. @@ -260,7 +278,8 @@ static void srpt_get_class_port_info(struct ib_dm_mad *mad) } /** - * srpt_get_iou() - Write IOUnitInfo to a management datagram. + * srpt_get_iou - write IOUnitInfo to a management datagram + * @mad: Datagram that will be sent as response to DM_ATTR_IOU_INFO. * * See also section 16.3.3.3 IOUnitInfo in the InfiniBand Architecture * Specification. See also section B.7, table B.6 in the SRP r16a document. @@ -284,7 +303,10 @@ static void srpt_get_iou(struct ib_dm_mad *mad) } /** - * srpt_get_ioc() - Write IOControllerprofile to a management datagram. + * srpt_get_ioc - write IOControllerprofile to a management datagram + * @sport: HCA port through which the MAD has been received. + * @slot: Slot number specified in DM_ATTR_IOC_PROFILE query. + * @mad: Datagram that will be sent as response to DM_ATTR_IOC_PROFILE. * * See also section 16.3.3.4 IOControllerProfile in the InfiniBand * Architecture Specification. See also section B.7, table B.7 in the SRP @@ -314,7 +336,7 @@ static void srpt_get_ioc(struct srpt_port *sport, u32 slot, if (sdev->use_srq) send_queue_depth = sdev->srq_size; else - send_queue_depth = min(SRPT_RQ_SIZE, + send_queue_depth = min(MAX_SRPT_RQ_SIZE, sdev->device->attrs.max_qp_wr); memset(iocp, 0, sizeof(*iocp)); @@ -342,7 +364,12 @@ static void srpt_get_ioc(struct srpt_port *sport, u32 slot, } /** - * srpt_get_svc_entries() - Write ServiceEntries to a management datagram. + * srpt_get_svc_entries - write ServiceEntries to a management datagram + * @ioc_guid: I/O controller GUID to use in reply. + * @slot: I/O controller number. + * @hi: End of the range of service entries to be specified in the reply. + * @lo: Start of the range of service entries to be specified in the reply.. + * @mad: Datagram that will be sent as response to DM_ATTR_SVC_ENTRIES. * * See also section 16.3.3.5 ServiceEntries in the InfiniBand Architecture * Specification. See also section B.7, table B.8 in the SRP r16a document. @@ -379,8 +406,8 @@ static void srpt_get_svc_entries(u64 ioc_guid, } /** - * srpt_mgmt_method_get() - Process a received management datagram. - * @sp: source port through which the MAD has been received. + * srpt_mgmt_method_get - process a received management datagram + * @sp: HCA port through which the MAD has been received. * @rq_mad: received MAD. * @rsp_mad: response MAD. */ @@ -419,7 +446,9 @@ static void srpt_mgmt_method_get(struct srpt_port *sp, struct ib_mad *rq_mad, } /** - * srpt_mad_send_handler() - Post MAD-send callback function. + * srpt_mad_send_handler - MAD send completion callback + * @mad_agent: Return value of ib_register_mad_agent(). + * @mad_wc: Work completion reporting that the MAD has been sent. */ static void srpt_mad_send_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_wc *mad_wc) @@ -429,7 +458,10 @@ static void srpt_mad_send_handler(struct ib_mad_agent *mad_agent, } /** - * srpt_mad_recv_handler() - MAD reception callback function. + * srpt_mad_recv_handler - MAD reception callback function + * @mad_agent: Return value of ib_register_mad_agent(). + * @send_buf: Not used. + * @mad_wc: Work completion reporting that a MAD has been received. */ static void srpt_mad_recv_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf, @@ -493,8 +525,18 @@ err: ib_free_recv_mad(mad_wc); } +static int srpt_format_guid(char *buf, unsigned int size, const __be64 *guid) +{ + const __be16 *g = (const __be16 *)guid; + + return snprintf(buf, size, "%04x:%04x:%04x:%04x", + be16_to_cpu(g[0]), be16_to_cpu(g[1]), + be16_to_cpu(g[2]), be16_to_cpu(g[3])); +} + /** - * srpt_refresh_port() - Configure a HCA port. + * srpt_refresh_port - configure a HCA port + * @sport: SRPT HCA port. * * Enable InfiniBand management datagram processing, update the cached sm_lid, * lid and gid values, and register a callback function for processing MADs @@ -507,7 +549,6 @@ static int srpt_refresh_port(struct srpt_port *sport) struct ib_mad_reg_req reg_req; struct ib_port_modify port_modify; struct ib_port_attr port_attr; - __be16 *guid; int ret; memset(&port_modify, 0, sizeof(port_modify)); @@ -531,11 +572,8 @@ static int srpt_refresh_port(struct srpt_port *sport) goto err_query_port; sport->port_guid_wwn.priv = sport; - guid = (__be16 *)&sport->gid.global.interface_id; - snprintf(sport->port_guid, sizeof(sport->port_guid), - "%04x:%04x:%04x:%04x", - be16_to_cpu(guid[0]), be16_to_cpu(guid[1]), - be16_to_cpu(guid[2]), be16_to_cpu(guid[3])); + srpt_format_guid(sport->port_guid, sizeof(sport->port_guid), + &sport->gid.global.interface_id); sport->port_gid_wwn.priv = sport; snprintf(sport->port_gid, sizeof(sport->port_gid), "0x%016llx%016llx", @@ -577,7 +615,8 @@ err_mod_port: } /** - * srpt_unregister_mad_agent() - Unregister MAD callback functions. + * srpt_unregister_mad_agent - unregister MAD callback functions + * @sdev: SRPT HCA pointer. * * Note: It is safe to call this function more than once for the same device. */ @@ -602,7 +641,11 @@ static void srpt_unregister_mad_agent(struct srpt_device *sdev) } /** - * srpt_alloc_ioctx() - Allocate an SRPT I/O context structure. + * srpt_alloc_ioctx - allocate a SRPT I/O context structure + * @sdev: SRPT HCA pointer. + * @ioctx_size: I/O context size. + * @dma_size: Size of I/O context DMA buffer. + * @dir: DMA data direction. */ static struct srpt_ioctx *srpt_alloc_ioctx(struct srpt_device *sdev, int ioctx_size, int dma_size, @@ -633,7 +676,11 @@ err: } /** - * srpt_free_ioctx() - Free an SRPT I/O context structure. + * srpt_free_ioctx - free a SRPT I/O context structure + * @sdev: SRPT HCA pointer. + * @ioctx: I/O context pointer. + * @dma_size: Size of I/O context DMA buffer. + * @dir: DMA data direction. */ static void srpt_free_ioctx(struct srpt_device *sdev, struct srpt_ioctx *ioctx, int dma_size, enum dma_data_direction dir) @@ -647,7 +694,7 @@ static void srpt_free_ioctx(struct srpt_device *sdev, struct srpt_ioctx *ioctx, } /** - * srpt_alloc_ioctx_ring() - Allocate a ring of SRPT I/O context structures. + * srpt_alloc_ioctx_ring - allocate a ring of SRPT I/O context structures * @sdev: Device to allocate the I/O context ring for. * @ring_size: Number of elements in the I/O context ring. * @ioctx_size: I/O context size. @@ -685,7 +732,12 @@ out: } /** - * srpt_free_ioctx_ring() - Free the ring of SRPT I/O context structures. + * srpt_free_ioctx_ring - free the ring of SRPT I/O context structures + * @ioctx_ring: I/O context ring to be freed. + * @sdev: SRPT HCA pointer. + * @ring_size: Number of ring elements. + * @dma_size: Size of I/O context DMA buffer. + * @dir: DMA data direction. */ static void srpt_free_ioctx_ring(struct srpt_ioctx **ioctx_ring, struct srpt_device *sdev, int ring_size, @@ -702,23 +754,9 @@ static void srpt_free_ioctx_ring(struct srpt_ioctx **ioctx_ring, } /** - * srpt_get_cmd_state() - Get the state of a SCSI command. - */ -static enum srpt_command_state srpt_get_cmd_state(struct srpt_send_ioctx *ioctx) -{ - enum srpt_command_state state; - unsigned long flags; - - BUG_ON(!ioctx); - - spin_lock_irqsave(&ioctx->spinlock, flags); - state = ioctx->state; - spin_unlock_irqrestore(&ioctx->spinlock, flags); - return state; -} - -/** - * srpt_set_cmd_state() - Set the state of a SCSI command. + * srpt_set_cmd_state - set the state of a SCSI command + * @ioctx: Send I/O context. + * @new: New I/O context state. * * Does not modify the state of aborted commands. Returns the previous command * state. @@ -727,21 +765,19 @@ static enum srpt_command_state srpt_set_cmd_state(struct srpt_send_ioctx *ioctx, enum srpt_command_state new) { enum srpt_command_state previous; - unsigned long flags; - BUG_ON(!ioctx); - - spin_lock_irqsave(&ioctx->spinlock, flags); previous = ioctx->state; if (previous != SRPT_STATE_DONE) ioctx->state = new; - spin_unlock_irqrestore(&ioctx->spinlock, flags); return previous; } /** - * srpt_test_and_set_cmd_state() - Test and set the state of a command. + * srpt_test_and_set_cmd_state - test and set the state of a command + * @ioctx: Send I/O context. + * @old: Current I/O context state. + * @new: New I/O context state. * * Returns true if and only if the previous command state was equal to 'old'. */ @@ -750,22 +786,23 @@ static bool srpt_test_and_set_cmd_state(struct srpt_send_ioctx *ioctx, enum srpt_command_state new) { enum srpt_command_state previous; - unsigned long flags; WARN_ON(!ioctx); WARN_ON(old == SRPT_STATE_DONE); WARN_ON(new == SRPT_STATE_NEW); - spin_lock_irqsave(&ioctx->spinlock, flags); previous = ioctx->state; if (previous == old) ioctx->state = new; - spin_unlock_irqrestore(&ioctx->spinlock, flags); + return previous == old; } /** - * srpt_post_recv() - Post an IB receive request. + * srpt_post_recv - post an IB receive request + * @sdev: SRPT HCA pointer. + * @ch: SRPT RDMA channel. + * @ioctx: Receive I/O context pointer. */ static int srpt_post_recv(struct srpt_device *sdev, struct srpt_rdma_ch *ch, struct srpt_recv_ioctx *ioctx) @@ -791,7 +828,8 @@ static int srpt_post_recv(struct srpt_device *sdev, struct srpt_rdma_ch *ch, } /** - * srpt_zerolength_write() - Perform a zero-length RDMA write. + * srpt_zerolength_write - perform a zero-length RDMA write + * @ch: SRPT RDMA channel. * * A quote from the InfiniBand specification: C9-88: For an HCA responder * using Reliable Connection service, for each zero-length RDMA READ or WRITE @@ -802,6 +840,9 @@ static int srpt_zerolength_write(struct srpt_rdma_ch *ch) { struct ib_send_wr wr, *bad_wr; + pr_debug("%s-%d: queued zerolength write\n", ch->sess_name, + ch->qp->qp_num); + memset(&wr, 0, sizeof(wr)); wr.opcode = IB_WR_RDMA_WRITE; wr.wr_cqe = &ch->zw_cqe; @@ -813,13 +854,17 @@ static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc) { struct srpt_rdma_ch *ch = cq->cq_context; + pr_debug("%s-%d wc->status %d\n", ch->sess_name, ch->qp->qp_num, + wc->status); + if (wc->status == IB_WC_SUCCESS) { srpt_process_wait_list(ch); } else { if (srpt_set_ch_state(ch, CH_DISCONNECTED)) schedule_work(&ch->release_work); else - WARN_ONCE(1, "%s-%d\n", ch->sess_name, ch->qp->qp_num); + pr_debug("%s-%d: already disconnected.\n", + ch->sess_name, ch->qp->qp_num); } } @@ -928,11 +973,13 @@ static inline void *srpt_get_desc_buf(struct srp_cmd *srp_cmd) } /** - * srpt_get_desc_tbl() - Parse the data descriptors of an SRP_CMD request. + * srpt_get_desc_tbl - parse the data descriptors of a SRP_CMD request * @ioctx: Pointer to the I/O context associated with the request. * @srp_cmd: Pointer to the SRP_CMD request data. * @dir: Pointer to the variable to which the transfer direction will be * written. + * @sg: [out] scatterlist allocated for the parsed SRP_CMD. + * @sg_cnt: [out] length of @sg. * @data_len: Pointer to the variable to which the total data length of all * descriptors in the SRP_CMD request will be written. * @@ -998,7 +1045,9 @@ static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx, } /** - * srpt_init_ch_qp() - Initialize queue pair attributes. + * srpt_init_ch_qp - initialize queue pair attributes + * @ch: SRPT RDMA channel. + * @qp: Queue pair pointer. * * Initialized the attributes of queue pair 'qp' by allowing local write, * remote read and remote write. Also transitions 'qp' to state IB_QPS_INIT. @@ -1015,7 +1064,12 @@ static int srpt_init_ch_qp(struct srpt_rdma_ch *ch, struct ib_qp *qp) attr->qp_state = IB_QPS_INIT; attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE; attr->port_num = ch->sport->port; - attr->pkey_index = 0; + + ret = ib_find_cached_pkey(ch->sport->sdev->device, ch->sport->port, + ch->pkey, &attr->pkey_index); + if (ret < 0) + pr_err("Translating pkey %#x failed (%d) - using index 0\n", + ch->pkey, ret); ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PORT | @@ -1026,7 +1080,7 @@ static int srpt_init_ch_qp(struct srpt_rdma_ch *ch, struct ib_qp *qp) } /** - * srpt_ch_qp_rtr() - Change the state of a channel to 'ready to receive' (RTR). + * srpt_ch_qp_rtr - change the state of a channel to 'ready to receive' (RTR) * @ch: channel of the queue pair. * @qp: queue pair to change the state of. * @@ -1043,7 +1097,7 @@ static int srpt_ch_qp_rtr(struct srpt_rdma_ch *ch, struct ib_qp *qp) int ret; qp_attr.qp_state = IB_QPS_RTR; - ret = ib_cm_init_qp_attr(ch->cm_id, &qp_attr, &attr_mask); + ret = ib_cm_init_qp_attr(ch->ib_cm.cm_id, &qp_attr, &attr_mask); if (ret) goto out; @@ -1056,7 +1110,7 @@ out: } /** - * srpt_ch_qp_rts() - Change the state of a channel to 'ready to send' (RTS). + * srpt_ch_qp_rts - change the state of a channel to 'ready to send' (RTS) * @ch: channel of the queue pair. * @qp: queue pair to change the state of. * @@ -1073,7 +1127,7 @@ static int srpt_ch_qp_rts(struct srpt_rdma_ch *ch, struct ib_qp *qp) int ret; qp_attr.qp_state = IB_QPS_RTS; - ret = ib_cm_init_qp_attr(ch->cm_id, &qp_attr, &attr_mask); + ret = ib_cm_init_qp_attr(ch->ib_cm.cm_id, &qp_attr, &attr_mask); if (ret) goto out; @@ -1086,7 +1140,8 @@ out: } /** - * srpt_ch_qp_err() - Set the channel queue pair state to 'error'. + * srpt_ch_qp_err - set the channel queue pair state to 'error' + * @ch: SRPT RDMA channel. */ static int srpt_ch_qp_err(struct srpt_rdma_ch *ch) { @@ -1097,7 +1152,8 @@ static int srpt_ch_qp_err(struct srpt_rdma_ch *ch) } /** - * srpt_get_send_ioctx() - Obtain an I/O context for sending to the initiator. + * srpt_get_send_ioctx - obtain an I/O context for sending to the initiator + * @ch: SRPT RDMA channel. */ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) { @@ -1119,11 +1175,9 @@ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) return ioctx; BUG_ON(ioctx->ch != ch); - spin_lock_init(&ioctx->spinlock); ioctx->state = SRPT_STATE_NEW; ioctx->n_rdma = 0; ioctx->n_rw_ctx = 0; - init_completion(&ioctx->tx_done); ioctx->queue_status_only = false; /* * transport_init_se_cmd() does not initialize all fields, so do it @@ -1136,14 +1190,12 @@ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) } /** - * srpt_abort_cmd() - Abort a SCSI command. + * srpt_abort_cmd - abort a SCSI command * @ioctx: I/O context associated with the SCSI command. - * @context: Preferred execution context. */ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) { enum srpt_command_state state; - unsigned long flags; BUG_ON(!ioctx); @@ -1152,7 +1204,6 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) * the ib_srpt driver, change the state to the next state. */ - spin_lock_irqsave(&ioctx->spinlock, flags); state = ioctx->state; switch (state) { case SRPT_STATE_NEED_DATA: @@ -1167,7 +1218,6 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) __func__, state); break; } - spin_unlock_irqrestore(&ioctx->spinlock, flags); pr_debug("Aborting cmd with state %d -> %d and tag %lld\n", state, ioctx->state, ioctx->cmd.tag); @@ -1206,6 +1256,10 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) } /** + * srpt_rdma_read_done - RDMA read completion callback + * @cq: Completion queue. + * @wc: Work completion. + * * XXX: what is now target_execute_cmd used to be asynchronous, and unmapping * the data that has been transferred via IB RDMA had to be postponed until the * check_stop_free() callback. None of this is necessary anymore and needs to @@ -1233,11 +1287,11 @@ static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc) target_execute_cmd(&ioctx->cmd); else pr_err("%s[%d]: wrong state = %d\n", __func__, - __LINE__, srpt_get_cmd_state(ioctx)); + __LINE__, ioctx->state); } /** - * srpt_build_cmd_rsp() - Build an SRP_RSP response. + * srpt_build_cmd_rsp - build a SRP_RSP response * @ch: RDMA channel through which the request has been received. * @ioctx: I/O context associated with the SRP_CMD request. The response will * be built in the buffer ioctx->buf points at and hence this function will @@ -1297,7 +1351,7 @@ static int srpt_build_cmd_rsp(struct srpt_rdma_ch *ch, } /** - * srpt_build_tskmgmt_rsp() - Build a task management response. + * srpt_build_tskmgmt_rsp - build a task management response * @ch: RDMA channel through which the request has been received. * @ioctx: I/O context in which the SRP_RSP response will be built. * @rsp_code: RSP_CODE that will be stored in the response. @@ -1345,7 +1399,10 @@ static int srpt_check_stop_free(struct se_cmd *cmd) } /** - * srpt_handle_cmd() - Process SRP_CMD. + * srpt_handle_cmd - process a SRP_CMD information unit + * @ch: SRPT RDMA channel. + * @recv_ioctx: Receive I/O context. + * @send_ioctx: Send I/O context. */ static void srpt_handle_cmd(struct srpt_rdma_ch *ch, struct srpt_recv_ioctx *recv_ioctx, @@ -1427,7 +1484,10 @@ static int srp_tmr_to_tcm(int fn) } /** - * srpt_handle_tsk_mgmt() - Process an SRP_TSK_MGMT information unit. + * srpt_handle_tsk_mgmt - process a SRP_TSK_MGMT information unit + * @ch: SRPT RDMA channel. + * @recv_ioctx: Receive I/O context. + * @send_ioctx: Send I/O context. * * Returns 0 if and only if the request will be processed by the target core. * @@ -1449,9 +1509,9 @@ static void srpt_handle_tsk_mgmt(struct srpt_rdma_ch *ch, srp_tsk = recv_ioctx->ioctx.buf; cmd = &send_ioctx->cmd; - pr_debug("recv tsk_mgmt fn %d for task_tag %lld and cmd tag %lld" - " cm_id %p sess %p\n", srp_tsk->tsk_mgmt_func, - srp_tsk->task_tag, srp_tsk->tag, ch->cm_id, ch->sess); + pr_debug("recv tsk_mgmt fn %d for task_tag %lld and cmd tag %lld ch %p sess %p\n", + srp_tsk->tsk_mgmt_func, srp_tsk->task_tag, srp_tsk->tag, ch, + ch->sess); srpt_set_cmd_state(send_ioctx, SRPT_STATE_MGMT); send_ioctx->cmd.tag = srp_tsk->tag; @@ -1470,41 +1530,42 @@ fail: } /** - * srpt_handle_new_iu() - Process a newly received information unit. + * srpt_handle_new_iu - process a newly received information unit * @ch: RDMA channel through which the information unit has been received. - * @ioctx: SRPT I/O context associated with the information unit. + * @recv_ioctx: Receive I/O context associated with the information unit. */ -static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, - struct srpt_recv_ioctx *recv_ioctx, - struct srpt_send_ioctx *send_ioctx) +static bool +srpt_handle_new_iu(struct srpt_rdma_ch *ch, struct srpt_recv_ioctx *recv_ioctx) { + struct srpt_send_ioctx *send_ioctx = NULL; struct srp_cmd *srp_cmd; + bool res = false; + u8 opcode; BUG_ON(!ch); BUG_ON(!recv_ioctx); + if (unlikely(ch->state == CH_CONNECTING)) + goto push; + ib_dma_sync_single_for_cpu(ch->sport->sdev->device, recv_ioctx->ioctx.dma, srp_max_req_size, DMA_FROM_DEVICE); - if (unlikely(ch->state == CH_CONNECTING)) - goto out_wait; - - if (unlikely(ch->state != CH_LIVE)) - return; - srp_cmd = recv_ioctx->ioctx.buf; - if (srp_cmd->opcode == SRP_CMD || srp_cmd->opcode == SRP_TSK_MGMT) { - if (!send_ioctx) { - if (!list_empty(&ch->cmd_wait_list)) - goto out_wait; - send_ioctx = srpt_get_send_ioctx(ch); - } + opcode = srp_cmd->opcode; + if (opcode == SRP_CMD || opcode == SRP_TSK_MGMT) { + send_ioctx = srpt_get_send_ioctx(ch); if (unlikely(!send_ioctx)) - goto out_wait; + goto push; } - switch (srp_cmd->opcode) { + if (!list_empty(&recv_ioctx->wait_list)) { + WARN_ON_ONCE(!ch->processing_wait_list); + list_del_init(&recv_ioctx->wait_list); + } + + switch (opcode) { case SRP_CMD: srpt_handle_cmd(ch, recv_ioctx, send_ioctx); break; @@ -1524,16 +1585,22 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, pr_err("Received SRP_RSP\n"); break; default: - pr_err("received IU with unknown opcode 0x%x\n", - srp_cmd->opcode); + pr_err("received IU with unknown opcode 0x%x\n", opcode); break; } srpt_post_recv(ch->sport->sdev, ch, recv_ioctx); - return; + res = true; + +out: + return res; -out_wait: - list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list); +push: + if (list_empty(&recv_ioctx->wait_list)) { + WARN_ON_ONCE(ch->processing_wait_list); + list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list); + } + goto out; } static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc) @@ -1548,10 +1615,10 @@ static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc) req_lim = atomic_dec_return(&ch->req_lim); if (unlikely(req_lim < 0)) pr_err("req_lim = %d < 0\n", req_lim); - srpt_handle_new_iu(ch, ioctx, NULL); + srpt_handle_new_iu(ch, ioctx); } else { - pr_info("receiving failed for ioctx %p with status %d\n", - ioctx, wc->status); + pr_info_ratelimited("receiving failed for ioctx %p with status %d\n", + ioctx, wc->status); } } @@ -1562,22 +1629,28 @@ static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc) */ static void srpt_process_wait_list(struct srpt_rdma_ch *ch) { - struct srpt_send_ioctx *ioctx; + struct srpt_recv_ioctx *recv_ioctx, *tmp; + + WARN_ON_ONCE(ch->state == CH_CONNECTING); - while (!list_empty(&ch->cmd_wait_list) && - ch->state >= CH_LIVE && - (ioctx = srpt_get_send_ioctx(ch)) != NULL) { - struct srpt_recv_ioctx *recv_ioctx; + if (list_empty(&ch->cmd_wait_list)) + return; - recv_ioctx = list_first_entry(&ch->cmd_wait_list, - struct srpt_recv_ioctx, - wait_list); - list_del(&recv_ioctx->wait_list); - srpt_handle_new_iu(ch, recv_ioctx, ioctx); + WARN_ON_ONCE(ch->processing_wait_list); + ch->processing_wait_list = true; + list_for_each_entry_safe(recv_ioctx, tmp, &ch->cmd_wait_list, + wait_list) { + if (!srpt_handle_new_iu(ch, recv_ioctx)) + break; } + ch->processing_wait_list = false; } /** + * srpt_send_done - send completion callback + * @cq: Completion queue. + * @wc: Work completion. + * * Note: Although this has not yet been observed during tests, at least in * theory it is possible that the srpt_get_send_ioctx() call invoked by * srpt_handle_new_iu() fails. This is possible because the req_lim_delta @@ -1619,7 +1692,8 @@ static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc) } /** - * srpt_create_ch_ib() - Create receive and send completion queues. + * srpt_create_ch_ib - create receive and send completion queues + * @ch: SRPT RDMA channel. */ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) { @@ -1627,7 +1701,7 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) struct srpt_port *sport = ch->sport; struct srpt_device *sdev = sport->sdev; const struct ib_device_attr *attrs = &sdev->device->attrs; - u32 srp_sq_size = sport->port_attrib.srp_sq_size; + int sq_size = sport->port_attrib.srp_sq_size; int i, ret; WARN_ON(ch->rq_size < 1); @@ -1638,12 +1712,12 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) goto out; retry: - ch->cq = ib_alloc_cq(sdev->device, ch, ch->rq_size + srp_sq_size, + ch->cq = ib_alloc_cq(sdev->device, ch, ch->rq_size + sq_size, 0 /* XXX: spread CQs */, IB_POLL_WORKQUEUE); if (IS_ERR(ch->cq)) { ret = PTR_ERR(ch->cq); pr_err("failed to create CQ cqe= %d ret= %d\n", - ch->rq_size + srp_sq_size, ret); + ch->rq_size + sq_size, ret); goto out; } @@ -1661,8 +1735,8 @@ retry: * both both, as RDMA contexts will also post completions for the * RDMA READ case. */ - qp_init->cap.max_send_wr = min(srp_sq_size / 2, attrs->max_qp_wr + 0U); - qp_init->cap.max_rdma_ctxs = srp_sq_size / 2; + qp_init->cap.max_send_wr = min(sq_size / 2, attrs->max_qp_wr); + qp_init->cap.max_rdma_ctxs = sq_size / 2; qp_init->cap.max_send_sge = min(attrs->max_sge, SRPT_MAX_SG_PER_WQE); qp_init->port_num = ch->sport->port; if (sdev->use_srq) { @@ -1676,8 +1750,8 @@ retry: if (IS_ERR(ch->qp)) { ret = PTR_ERR(ch->qp); if (ret == -ENOMEM) { - srp_sq_size /= 2; - if (srp_sq_size >= MIN_SRPT_SQ_SIZE) { + sq_size /= 2; + if (sq_size >= MIN_SRPT_SQ_SIZE) { ib_destroy_cq(ch->cq); goto retry; } @@ -1688,9 +1762,9 @@ retry: atomic_set(&ch->sq_wr_avail, qp_init->cap.max_send_wr); - pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n", + pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d ch= %p\n", __func__, ch->cq->cqe, qp_init->cap.max_send_sge, - qp_init->cap.max_send_wr, ch->cm_id); + qp_init->cap.max_send_wr, ch); ret = srpt_init_ch_qp(ch, ch->qp); if (ret) @@ -1718,7 +1792,8 @@ static void srpt_destroy_ch_ib(struct srpt_rdma_ch *ch) } /** - * srpt_close_ch() - Close an RDMA channel. + * srpt_close_ch - close a RDMA channel + * @ch: SRPT RDMA channel. * * Make sure all resources associated with the channel will be deallocated at * an appropriate time. @@ -1743,8 +1818,6 @@ static bool srpt_close_ch(struct srpt_rdma_ch *ch) pr_err("%s-%d: changing queue pair into error state failed: %d\n", ch->sess_name, ch->qp->qp_num, ret); - pr_debug("%s-%d: queued zerolength write\n", ch->sess_name, - ch->qp->qp_num); ret = srpt_zerolength_write(ch); if (ret < 0) { pr_err("%s-%d: queuing zero-length write failed: %d\n", @@ -1776,9 +1849,9 @@ static int srpt_disconnect_ch(struct srpt_rdma_ch *ch) if (!srpt_set_ch_state(ch, CH_DISCONNECTING)) return -ENOTCONN; - ret = ib_send_cm_dreq(ch->cm_id, NULL, 0); + ret = ib_send_cm_dreq(ch->ib_cm.cm_id, NULL, 0); if (ret < 0) - ret = ib_send_cm_drep(ch->cm_id, NULL, 0); + ret = ib_send_cm_drep(ch->ib_cm.cm_id, NULL, 0); if (ret < 0 && srpt_close_ch(ch)) ret = 0; @@ -1786,83 +1859,135 @@ static int srpt_disconnect_ch(struct srpt_rdma_ch *ch) return ret; } -/* - * Send DREQ and wait for DREP. Return true if and only if this function - * changed the state of @ch. - */ -static bool srpt_disconnect_ch_sync(struct srpt_rdma_ch *ch) - __must_hold(&sdev->mutex) +static bool srpt_ch_closed(struct srpt_port *sport, struct srpt_rdma_ch *ch) { - DECLARE_COMPLETION_ONSTACK(release_done); - struct srpt_device *sdev = ch->sport->sdev; - bool wait; + struct srpt_nexus *nexus; + struct srpt_rdma_ch *ch2; + bool res = true; + + rcu_read_lock(); + list_for_each_entry(nexus, &sport->nexus_list, entry) { + list_for_each_entry(ch2, &nexus->ch_list, list) { + if (ch2 == ch) { + res = false; + goto done; + } + } + } +done: + rcu_read_unlock(); - lockdep_assert_held(&sdev->mutex); + return res; +} + +/* Send DREQ and wait for DREP. */ +static void srpt_disconnect_ch_sync(struct srpt_rdma_ch *ch) +{ + struct srpt_port *sport = ch->sport; pr_debug("ch %s-%d state %d\n", ch->sess_name, ch->qp->qp_num, ch->state); - WARN_ON(ch->release_done); - ch->release_done = &release_done; - wait = !list_empty(&ch->list); + mutex_lock(&sport->mutex); srpt_disconnect_ch(ch); - mutex_unlock(&sdev->mutex); + mutex_unlock(&sport->mutex); - if (!wait) - goto out; - - while (wait_for_completion_timeout(&release_done, 180 * HZ) == 0) + while (wait_event_timeout(sport->ch_releaseQ, srpt_ch_closed(sport, ch), + 5 * HZ) == 0) pr_info("%s(%s-%d state %d): still waiting ...\n", __func__, ch->sess_name, ch->qp->qp_num, ch->state); -out: - mutex_lock(&sdev->mutex); - return wait; } -static void srpt_set_enabled(struct srpt_port *sport, bool enabled) - __must_hold(&sdev->mutex) +static void __srpt_close_all_ch(struct srpt_port *sport) { - struct srpt_device *sdev = sport->sdev; + struct srpt_nexus *nexus; struct srpt_rdma_ch *ch; - lockdep_assert_held(&sdev->mutex); + lockdep_assert_held(&sport->mutex); - if (sport->enabled == enabled) - return; - sport->enabled = enabled; - if (sport->enabled) - return; + list_for_each_entry(nexus, &sport->nexus_list, entry) { + list_for_each_entry(ch, &nexus->ch_list, list) { + if (srpt_disconnect_ch(ch) >= 0) + pr_info("Closing channel %s-%d because target %s_%d has been disabled\n", + ch->sess_name, ch->qp->qp_num, + sport->sdev->device->name, sport->port); + srpt_close_ch(ch); + } + } +} -again: - list_for_each_entry(ch, &sdev->rch_list, list) { - if (ch->sport == sport) { - pr_info("%s: closing channel %s-%d\n", - sdev->device->name, ch->sess_name, - ch->qp->qp_num); - if (srpt_disconnect_ch_sync(ch)) - goto again; +/* + * Look up (i_port_id, t_port_id) in sport->nexus_list. Create an entry if + * it does not yet exist. + */ +static struct srpt_nexus *srpt_get_nexus(struct srpt_port *sport, + const u8 i_port_id[16], + const u8 t_port_id[16]) +{ + struct srpt_nexus *nexus = NULL, *tmp_nexus = NULL, *n; + + for (;;) { + mutex_lock(&sport->mutex); + list_for_each_entry(n, &sport->nexus_list, entry) { + if (memcmp(n->i_port_id, i_port_id, 16) == 0 && + memcmp(n->t_port_id, t_port_id, 16) == 0) { + nexus = n; + break; + } } + if (!nexus && tmp_nexus) { + list_add_tail_rcu(&tmp_nexus->entry, + &sport->nexus_list); + swap(nexus, tmp_nexus); + } + mutex_unlock(&sport->mutex); + + if (nexus) + break; + tmp_nexus = kzalloc(sizeof(*nexus), GFP_KERNEL); + if (!tmp_nexus) { + nexus = ERR_PTR(-ENOMEM); + break; + } + INIT_LIST_HEAD(&tmp_nexus->ch_list); + memcpy(tmp_nexus->i_port_id, i_port_id, 16); + memcpy(tmp_nexus->t_port_id, t_port_id, 16); } + kfree(tmp_nexus); + + return nexus; +} + +static void srpt_set_enabled(struct srpt_port *sport, bool enabled) + __must_hold(&sport->mutex) +{ + lockdep_assert_held(&sport->mutex); + + if (sport->enabled == enabled) + return; + sport->enabled = enabled; + if (!enabled) + __srpt_close_all_ch(sport); } static void srpt_free_ch(struct kref *kref) { struct srpt_rdma_ch *ch = container_of(kref, struct srpt_rdma_ch, kref); - kfree(ch); + kfree_rcu(ch, rcu); } static void srpt_release_channel_work(struct work_struct *w) { struct srpt_rdma_ch *ch; struct srpt_device *sdev; + struct srpt_port *sport; struct se_session *se_sess; ch = container_of(w, struct srpt_rdma_ch, release_work); - pr_debug("%s: %s-%d; release_done = %p\n", __func__, ch->sess_name, - ch->qp->qp_num, ch->release_done); + pr_debug("%s-%d\n", ch->sess_name, ch->qp->qp_num); sdev = ch->sport->sdev; BUG_ON(!sdev); @@ -1877,169 +2002,141 @@ static void srpt_release_channel_work(struct work_struct *w) transport_deregister_session(se_sess); ch->sess = NULL; - ib_destroy_cm_id(ch->cm_id); + ib_destroy_cm_id(ch->ib_cm.cm_id); srpt_destroy_ch_ib(ch); srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring, ch->sport->sdev, ch->rq_size, - ch->rsp_size, DMA_TO_DEVICE); + ch->max_rsp_size, DMA_TO_DEVICE); srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_recv_ring, sdev, ch->rq_size, srp_max_req_size, DMA_FROM_DEVICE); - mutex_lock(&sdev->mutex); - list_del_init(&ch->list); - if (ch->release_done) - complete(ch->release_done); - mutex_unlock(&sdev->mutex); + sport = ch->sport; + mutex_lock(&sport->mutex); + list_del_rcu(&ch->list); + mutex_unlock(&sport->mutex); - wake_up(&sdev->ch_releaseQ); + wake_up(&sport->ch_releaseQ); kref_put(&ch->kref, srpt_free_ch); } /** - * srpt_cm_req_recv() - Process the event IB_CM_REQ_RECEIVED. + * srpt_cm_req_recv - process the event IB_CM_REQ_RECEIVED + * @cm_id: IB/CM connection identifier. + * @port_num: Port through which the IB/CM REQ message was received. + * @pkey: P_Key of the incoming connection. + * @req: SRP login request. + * @src_addr: GID of the port that submitted the login request. * * Ownership of the cm_id is transferred to the target session if this * functions returns zero. Otherwise the caller remains the owner of cm_id. */ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, - struct ib_cm_req_event_param *param, - void *private_data) + u8 port_num, __be16 pkey, + const struct srp_login_req *req, + const char *src_addr) { struct srpt_device *sdev = cm_id->context; - struct srpt_port *sport = &sdev->port[param->port - 1]; - struct srp_login_req *req; - struct srp_login_rsp *rsp; - struct srp_login_rej *rej; - struct ib_cm_rep_param *rep_param; - struct srpt_rdma_ch *ch, *tmp_ch; - __be16 *guid; + struct srpt_port *sport = &sdev->port[port_num - 1]; + struct srpt_nexus *nexus; + struct srp_login_rsp *rsp = NULL; + struct srp_login_rej *rej = NULL; + struct ib_cm_rep_param *rep_param = NULL; + struct srpt_rdma_ch *ch; + char i_port_id[36]; u32 it_iu_len; - int i, ret = 0; + int i, ret; WARN_ON_ONCE(irqs_disabled()); - if (WARN_ON(!sdev || !private_data)) + if (WARN_ON(!sdev || !req)) return -EINVAL; - req = (struct srp_login_req *)private_data; - it_iu_len = be32_to_cpu(req->req_it_iu_len); - pr_info("Received SRP_LOGIN_REQ with i_port_id 0x%llx:0x%llx," - " t_port_id 0x%llx:0x%llx and it_iu_len %d on port %d" - " (guid=0x%llx:0x%llx)\n", - be64_to_cpu(*(__be64 *)&req->initiator_port_id[0]), - be64_to_cpu(*(__be64 *)&req->initiator_port_id[8]), - be64_to_cpu(*(__be64 *)&req->target_port_id[0]), - be64_to_cpu(*(__be64 *)&req->target_port_id[8]), - it_iu_len, - param->port, - be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[0]), - be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[8])); + pr_info("Received SRP_LOGIN_REQ with i_port_id %pI6, t_port_id %pI6 and it_iu_len %d on port %d (guid=%pI6); pkey %#04x\n", + req->initiator_port_id, req->target_port_id, it_iu_len, + port_num, &sport->gid, be16_to_cpu(pkey)); + nexus = srpt_get_nexus(sport, req->initiator_port_id, + req->target_port_id); + if (IS_ERR(nexus)) { + ret = PTR_ERR(nexus); + goto out; + } + + ret = -ENOMEM; rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); rej = kzalloc(sizeof(*rej), GFP_KERNEL); rep_param = kzalloc(sizeof(*rep_param), GFP_KERNEL); - - if (!rsp || !rej || !rep_param) { - ret = -ENOMEM; + if (!rsp || !rej || !rep_param) goto out; - } + ret = -EINVAL; if (it_iu_len > srp_max_req_size || it_iu_len < 64) { rej->reason = cpu_to_be32( - SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE); - ret = -EINVAL; - pr_err("rejected SRP_LOGIN_REQ because its" - " length (%d bytes) is out of range (%d .. %d)\n", + SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE); + pr_err("rejected SRP_LOGIN_REQ because its length (%d bytes) is out of range (%d .. %d)\n", it_iu_len, 64, srp_max_req_size); goto reject; } if (!sport->enabled) { - rej->reason = cpu_to_be32( - SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); - ret = -EINVAL; - pr_err("rejected SRP_LOGIN_REQ because the target port" - " has not yet been enabled\n"); + rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + pr_info("rejected SRP_LOGIN_REQ because target port %s_%d has not yet been enabled\n", + sport->sdev->device->name, port_num); goto reject; } - if ((req->req_flags & SRP_MTCH_ACTION) == SRP_MULTICHAN_SINGLE) { - rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_NO_CHAN; - - mutex_lock(&sdev->mutex); - - list_for_each_entry_safe(ch, tmp_ch, &sdev->rch_list, list) { - if (!memcmp(ch->i_port_id, req->initiator_port_id, 16) - && !memcmp(ch->t_port_id, req->target_port_id, 16) - && param->port == ch->sport->port - && param->listen_id == ch->sport->sdev->cm_id - && ch->cm_id) { - if (srpt_disconnect_ch(ch) < 0) - continue; - pr_info("Relogin - closed existing channel %s\n", - ch->sess_name); - rsp->rsp_flags = - SRP_LOGIN_RSP_MULTICHAN_TERMINATED; - } - } - - mutex_unlock(&sdev->mutex); - - } else - rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_MAINTAINED; - if (*(__be64 *)req->target_port_id != cpu_to_be64(srpt_service_guid) || *(__be64 *)(req->target_port_id + 8) != cpu_to_be64(srpt_service_guid)) { rej->reason = cpu_to_be32( - SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL); - ret = -ENOMEM; - pr_err("rejected SRP_LOGIN_REQ because it" - " has an invalid target port identifier.\n"); + SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL); + pr_err("rejected SRP_LOGIN_REQ because it has an invalid target port identifier.\n"); goto reject; } + ret = -ENOMEM; ch = kzalloc(sizeof(*ch), GFP_KERNEL); if (!ch) { - rej->reason = cpu_to_be32( - SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); - pr_err("rejected SRP_LOGIN_REQ because no memory.\n"); - ret = -ENOMEM; + rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + pr_err("rejected SRP_LOGIN_REQ because out of memory.\n"); goto reject; } kref_init(&ch->kref); + ch->pkey = be16_to_cpu(pkey); + ch->nexus = nexus; ch->zw_cqe.done = srpt_zerolength_write_done; INIT_WORK(&ch->release_work, srpt_release_channel_work); - memcpy(ch->i_port_id, req->initiator_port_id, 16); - memcpy(ch->t_port_id, req->target_port_id, 16); - ch->sport = &sdev->port[param->port - 1]; - ch->cm_id = cm_id; + ch->sport = sport; + ch->ib_cm.cm_id = cm_id; cm_id->context = ch; /* * ch->rq_size should be at least as large as the initiator queue * depth to avoid that the initiator driver has to report QUEUE_FULL * to the SCSI mid-layer. */ - ch->rq_size = min(SRPT_RQ_SIZE, sdev->device->attrs.max_qp_wr); + ch->rq_size = min(MAX_SRPT_RQ_SIZE, sdev->device->attrs.max_qp_wr); spin_lock_init(&ch->spinlock); ch->state = CH_CONNECTING; INIT_LIST_HEAD(&ch->cmd_wait_list); - ch->rsp_size = ch->sport->port_attrib.srp_max_rsp_size; + ch->max_rsp_size = ch->sport->port_attrib.srp_max_rsp_size; ch->ioctx_ring = (struct srpt_send_ioctx **) srpt_alloc_ioctx_ring(ch->sport->sdev, ch->rq_size, sizeof(*ch->ioctx_ring[0]), - ch->rsp_size, DMA_TO_DEVICE); - if (!ch->ioctx_ring) + ch->max_rsp_size, DMA_TO_DEVICE); + if (!ch->ioctx_ring) { + pr_err("rejected SRP_LOGIN_REQ because creating a new QP SQ ring failed.\n"); + rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); goto free_ch; + } INIT_LIST_HEAD(&ch->free_list); for (i = 0; i < ch->rq_size; i++) { @@ -2058,59 +2155,88 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); goto free_ring; } + for (i = 0; i < ch->rq_size; i++) + INIT_LIST_HEAD(&ch->ioctx_recv_ring[i]->wait_list); } ret = srpt_create_ch_ib(ch); if (ret) { - rej->reason = cpu_to_be32( - SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); - pr_err("rejected SRP_LOGIN_REQ because creating" - " a new RDMA channel failed.\n"); - goto free_recv_ring; - } - - ret = srpt_ch_qp_rtr(ch, ch->qp); - if (ret) { rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); - pr_err("rejected SRP_LOGIN_REQ because enabling" - " RTR failed (error code = %d)\n", ret); - goto destroy_ib; + pr_err("rejected SRP_LOGIN_REQ because creating a new RDMA channel failed.\n"); + goto free_recv_ring; } - guid = (__be16 *)¶m->primary_path->dgid.global.interface_id; - snprintf(ch->ini_guid, sizeof(ch->ini_guid), "%04x:%04x:%04x:%04x", - be16_to_cpu(guid[0]), be16_to_cpu(guid[1]), - be16_to_cpu(guid[2]), be16_to_cpu(guid[3])); - snprintf(ch->sess_name, sizeof(ch->sess_name), "0x%016llx%016llx", - be64_to_cpu(*(__be64 *)ch->i_port_id), - be64_to_cpu(*(__be64 *)(ch->i_port_id + 8))); + strlcpy(ch->sess_name, src_addr, sizeof(ch->sess_name)); + snprintf(i_port_id, sizeof(i_port_id), "0x%016llx%016llx", + be64_to_cpu(*(__be64 *)nexus->i_port_id), + be64_to_cpu(*(__be64 *)(nexus->i_port_id + 8))); pr_debug("registering session %s\n", ch->sess_name); if (sport->port_guid_tpg.se_tpg_wwn) ch->sess = target_alloc_session(&sport->port_guid_tpg, 0, 0, TARGET_PROT_NORMAL, - ch->ini_guid, ch, NULL); + ch->sess_name, ch, NULL); if (sport->port_gid_tpg.se_tpg_wwn && IS_ERR_OR_NULL(ch->sess)) ch->sess = target_alloc_session(&sport->port_gid_tpg, 0, 0, - TARGET_PROT_NORMAL, ch->sess_name, ch, + TARGET_PROT_NORMAL, i_port_id, ch, NULL); /* Retry without leading "0x" */ if (sport->port_gid_tpg.se_tpg_wwn && IS_ERR_OR_NULL(ch->sess)) ch->sess = target_alloc_session(&sport->port_gid_tpg, 0, 0, TARGET_PROT_NORMAL, - ch->sess_name + 2, ch, NULL); + i_port_id + 2, ch, NULL); if (IS_ERR_OR_NULL(ch->sess)) { - pr_info("Rejected login because no ACL has been configured yet for initiator %s.\n", - ch->sess_name); - rej->reason = cpu_to_be32((PTR_ERR(ch->sess) == -ENOMEM) ? + ret = PTR_ERR(ch->sess); + pr_info("Rejected login for initiator %s: ret = %d.\n", + ch->sess_name, ret); + rej->reason = cpu_to_be32(ret == -ENOMEM ? SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES : SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED); + goto reject; + } + + mutex_lock(&sport->mutex); + + if ((req->req_flags & SRP_MTCH_ACTION) == SRP_MULTICHAN_SINGLE) { + struct srpt_rdma_ch *ch2; + + rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_NO_CHAN; + + list_for_each_entry(ch2, &nexus->ch_list, list) { + if (srpt_disconnect_ch(ch2) < 0) + continue; + pr_info("Relogin - closed existing channel %s\n", + ch2->sess_name); + rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_TERMINATED; + } + } else { + rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_MAINTAINED; + } + + list_add_tail_rcu(&ch->list, &nexus->ch_list); + + if (!sport->enabled) { + rej->reason = cpu_to_be32( + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + pr_info("rejected SRP_LOGIN_REQ because target %s_%d is not enabled\n", + sdev->device->name, port_num); + mutex_unlock(&sport->mutex); + goto reject; + } + + mutex_unlock(&sport->mutex); + + ret = srpt_ch_qp_rtr(ch, ch->qp); + if (ret) { + rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + pr_err("rejected SRP_LOGIN_REQ because enabling RTR failed (error code = %d)\n", + ret); goto destroy_ib; } - pr_debug("Establish connection sess=%p name=%s cm_id=%p\n", ch->sess, - ch->sess_name, ch->cm_id); + pr_debug("Establish connection sess=%p name=%s ch=%p\n", ch->sess, + ch->sess_name, ch); /* create srp_login_response */ rsp->opcode = SRP_LOGIN_RSP; @@ -2118,8 +2244,8 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, rsp->max_it_iu_len = req->req_it_iu_len; rsp->max_ti_iu_len = req->req_it_iu_len; ch->max_ti_iu_len = it_iu_len; - rsp->buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT - | SRP_BUF_FORMAT_INDIRECT); + rsp->buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | + SRP_BUF_FORMAT_INDIRECT); rsp->req_lim_delta = cpu_to_be32(ch->rq_size); atomic_set(&ch->req_lim, ch->rq_size); atomic_set(&ch->req_lim_delta, 0); @@ -2135,25 +2261,31 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, rep_param->responder_resources = 4; rep_param->initiator_depth = 4; - ret = ib_send_cm_rep(cm_id, rep_param); - if (ret) { - pr_err("sending SRP_LOGIN_REQ response failed" - " (error code = %d)\n", ret); - goto release_channel; - } + /* + * Hold the sport mutex while accepting a connection to avoid that + * srpt_disconnect_ch() is invoked concurrently with this code. + */ + mutex_lock(&sport->mutex); + if (sport->enabled && ch->state == CH_CONNECTING) + ret = ib_send_cm_rep(cm_id, rep_param); + else + ret = -EINVAL; + mutex_unlock(&sport->mutex); - mutex_lock(&sdev->mutex); - list_add_tail(&ch->list, &sdev->rch_list); - mutex_unlock(&sdev->mutex); + switch (ret) { + case 0: + break; + case -EINVAL: + goto reject; + default: + rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + pr_err("sending SRP_LOGIN_REQ response failed (error code = %d)\n", + ret); + goto reject; + } goto out; -release_channel: - srpt_disconnect_ch(ch); - transport_deregister_session_configfs(ch->sess); - transport_deregister_session(ch->sess); - ch->sess = NULL; - destroy_ib: srpt_destroy_ch_ib(ch); @@ -2165,15 +2297,20 @@ free_recv_ring: free_ring: srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring, ch->sport->sdev, ch->rq_size, - ch->rsp_size, DMA_TO_DEVICE); + ch->max_rsp_size, DMA_TO_DEVICE); free_ch: + cm_id->context = NULL; kfree(ch); + ch = NULL; + + WARN_ON_ONCE(ret == 0); reject: + pr_info("Rejecting login with reason %#x\n", be32_to_cpu(rej->reason)); rej->opcode = SRP_LOGIN_REJ; rej->tag = req->tag; - rej->buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT - | SRP_BUF_FORMAT_INDIRECT); + rej->buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | + SRP_BUF_FORMAT_INDIRECT); ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, (void *)rej, sizeof(*rej)); @@ -2186,6 +2323,19 @@ out: return ret; } +static int srpt_ib_cm_req_recv(struct ib_cm_id *cm_id, + struct ib_cm_req_event_param *param, + void *private_data) +{ + char sguid[40]; + + srpt_format_guid(sguid, sizeof(sguid), + ¶m->primary_path->dgid.global.interface_id); + + return srpt_cm_req_recv(cm_id, param->port, param->primary_path->pkey, + private_data, sguid); +} + static void srpt_cm_rej_recv(struct srpt_rdma_ch *ch, enum ib_cm_rej_reason reason, const u8 *private_data, @@ -2206,7 +2356,8 @@ static void srpt_cm_rej_recv(struct srpt_rdma_ch *ch, } /** - * srpt_cm_rtu_recv() - Process an IB_CM_RTU_RECEIVED or USER_ESTABLISHED event. + * srpt_cm_rtu_recv - process an IB_CM_RTU_RECEIVED or USER_ESTABLISHED event + * @ch: SRPT RDMA channel. * * An IB_CM_RTU_RECEIVED message indicates that the connection is established * and that the recipient may begin transmitting (RTU = ready to use). @@ -2215,21 +2366,34 @@ static void srpt_cm_rtu_recv(struct srpt_rdma_ch *ch) { int ret; - if (srpt_set_ch_state(ch, CH_LIVE)) { - ret = srpt_ch_qp_rts(ch, ch->qp); + ret = srpt_ch_qp_rts(ch, ch->qp); + if (ret < 0) { + pr_err("%s-%d: QP transition to RTS failed\n", ch->sess_name, + ch->qp->qp_num); + srpt_close_ch(ch); + return; + } - if (ret == 0) { - /* Trigger wait list processing. */ - ret = srpt_zerolength_write(ch); - WARN_ONCE(ret < 0, "%d\n", ret); - } else { - srpt_close_ch(ch); - } + /* + * Note: calling srpt_close_ch() if the transition to the LIVE state + * fails is not necessary since that means that that function has + * already been invoked from another thread. + */ + if (!srpt_set_ch_state(ch, CH_LIVE)) { + pr_err("%s-%d: channel transition to LIVE state failed\n", + ch->sess_name, ch->qp->qp_num); + return; } + + /* Trigger wait list processing. */ + ret = srpt_zerolength_write(ch); + WARN_ONCE(ret < 0, "%d\n", ret); } /** - * srpt_cm_handler() - IB connection manager callback function. + * srpt_cm_handler - IB connection manager callback function + * @cm_id: IB/CM connection identifier. + * @event: IB/CM event. * * A non-zero return value will cause the caller destroy the CM ID. * @@ -2246,8 +2410,8 @@ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) ret = 0; switch (event->event) { case IB_CM_REQ_RECEIVED: - ret = srpt_cm_req_recv(cm_id, &event->param.req_rcvd, - event->private_data); + ret = srpt_ib_cm_req_recv(cm_id, &event->param.req_rcvd, + event->private_data); break; case IB_CM_REJ_RECEIVED: srpt_cm_rej_recv(ch, event->param.rej_rcvd.reason, @@ -2294,11 +2458,11 @@ static int srpt_write_pending_status(struct se_cmd *se_cmd) struct srpt_send_ioctx *ioctx; ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); - return srpt_get_cmd_state(ioctx) == SRPT_STATE_NEED_DATA; + return ioctx->state == SRPT_STATE_NEED_DATA; } /* - * srpt_write_pending() - Start data transfer from initiator to target (write). + * srpt_write_pending - Start data transfer from initiator to target (write). */ static int srpt_write_pending(struct se_cmd *se_cmd) { @@ -2355,7 +2519,8 @@ static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status) } /** - * srpt_queue_response() - Transmits the response to a SCSI command. + * srpt_queue_response - transmit the response to a SCSI command + * @cmd: SCSI target command. * * Callback function called by the TCM core. Must not block since it can be * invoked on the context of the IB completion handler. @@ -2369,13 +2534,11 @@ static void srpt_queue_response(struct se_cmd *cmd) struct ib_send_wr send_wr, *first_wr = &send_wr, *bad_wr; struct ib_sge sge; enum srpt_command_state state; - unsigned long flags; int resp_len, ret, i; u8 srp_tm_status; BUG_ON(!ch); - spin_lock_irqsave(&ioctx->spinlock, flags); state = ioctx->state; switch (state) { case SRPT_STATE_NEW: @@ -2390,7 +2553,6 @@ static void srpt_queue_response(struct se_cmd *cmd) ch, ioctx->ioctx.index, ioctx->state); break; } - spin_unlock_irqrestore(&ioctx->spinlock, flags); if (unlikely(WARN_ON_ONCE(state == SRPT_STATE_CMD_RSP_SENT))) return; @@ -2494,26 +2656,56 @@ static void srpt_refresh_port_work(struct work_struct *work) srpt_refresh_port(sport); } +static bool srpt_ch_list_empty(struct srpt_port *sport) +{ + struct srpt_nexus *nexus; + bool res = true; + + rcu_read_lock(); + list_for_each_entry(nexus, &sport->nexus_list, entry) + if (!list_empty(&nexus->ch_list)) + res = false; + rcu_read_unlock(); + + return res; +} + /** - * srpt_release_sdev() - Free the channel resources associated with a target. + * srpt_release_sport - disable login and wait for associated channels + * @sport: SRPT HCA port. */ -static int srpt_release_sdev(struct srpt_device *sdev) +static int srpt_release_sport(struct srpt_port *sport) { - int i, res; + struct srpt_nexus *nexus, *next_n; + struct srpt_rdma_ch *ch; WARN_ON_ONCE(irqs_disabled()); - BUG_ON(!sdev); - - mutex_lock(&sdev->mutex); - for (i = 0; i < ARRAY_SIZE(sdev->port); i++) - srpt_set_enabled(&sdev->port[i], false); - mutex_unlock(&sdev->mutex); + mutex_lock(&sport->mutex); + srpt_set_enabled(sport, false); + mutex_unlock(&sport->mutex); + + while (wait_event_timeout(sport->ch_releaseQ, + srpt_ch_list_empty(sport), 5 * HZ) <= 0) { + pr_info("%s_%d: waiting for session unregistration ...\n", + sport->sdev->device->name, sport->port); + rcu_read_lock(); + list_for_each_entry(nexus, &sport->nexus_list, entry) { + list_for_each_entry(ch, &nexus->ch_list, list) { + pr_info("%s-%d: state %s\n", + ch->sess_name, ch->qp->qp_num, + get_ch_state_name(ch->state)); + } + } + rcu_read_unlock(); + } - res = wait_event_interruptible(sdev->ch_releaseQ, - list_empty_careful(&sdev->rch_list)); - if (res) - pr_err("%s: interrupted.\n", __func__); + mutex_lock(&sport->mutex); + list_for_each_entry_safe(nexus, next_n, &sport->nexus_list, entry) { + list_del(&nexus->entry); + kfree_rcu(nexus, rcu); + } + mutex_unlock(&sport->mutex); return 0; } @@ -2600,8 +2792,10 @@ static int srpt_alloc_srq(struct srpt_device *sdev) sdev->use_srq = true; sdev->srq = srq; - for (i = 0; i < sdev->srq_size; ++i) + for (i = 0; i < sdev->srq_size; ++i) { + INIT_LIST_HEAD(&sdev->ioctx_ring[i]->wait_list); srpt_post_recv(sdev, NULL, sdev->ioctx_ring[i]); + } return 0; } @@ -2623,7 +2817,8 @@ static int srpt_use_srq(struct srpt_device *sdev, bool use_srq) } /** - * srpt_add_one() - Infiniband device addition callback function. + * srpt_add_one - InfiniBand device addition callback function + * @device: Describes a HCA. */ static void srpt_add_one(struct ib_device *device) { @@ -2638,9 +2833,7 @@ static void srpt_add_one(struct ib_device *device) goto err; sdev->device = device; - INIT_LIST_HEAD(&sdev->rch_list); - init_waitqueue_head(&sdev->ch_releaseQ); - mutex_init(&sdev->mutex); + mutex_init(&sdev->sdev_mutex); sdev->pd = ib_alloc_pd(device, 0); if (IS_ERR(sdev->pd)) @@ -2681,6 +2874,9 @@ static void srpt_add_one(struct ib_device *device) for (i = 1; i <= sdev->device->phys_port_cnt; i++) { sport = &sdev->port[i - 1]; + INIT_LIST_HEAD(&sport->nexus_list); + init_waitqueue_head(&sport->ch_releaseQ); + mutex_init(&sport->mutex); sport->sdev = sdev; sport->port = i; sport->port_attrib.srp_max_rdma_size = DEFAULT_MAX_RDMA_SIZE; @@ -2721,7 +2917,9 @@ err: } /** - * srpt_remove_one() - InfiniBand device removal callback function. + * srpt_remove_one - InfiniBand device removal callback function + * @device: Describes a HCA. + * @client_data: The value passed as the third argument to ib_set_client_data(). */ static void srpt_remove_one(struct ib_device *device, void *client_data) { @@ -2751,7 +2949,9 @@ static void srpt_remove_one(struct ib_device *device, void *client_data) spin_lock(&srpt_dev_lock); list_del(&sdev->list); spin_unlock(&srpt_dev_lock); - srpt_release_sdev(sdev); + + for (i = 0; i < sdev->device->phys_port_cnt; i++) + srpt_release_sport(&sdev->port[i]); srpt_free_srq(sdev); @@ -2827,7 +3027,8 @@ static void srpt_release_cmd(struct se_cmd *se_cmd) } /** - * srpt_close_session() - Forcibly close a session. + * srpt_close_session - forcibly close a session + * @se_sess: SCSI target session. * * Callback function invoked by the TCM core to clean up sessions associated * with a node ACL when the user invokes @@ -2836,15 +3037,13 @@ static void srpt_release_cmd(struct se_cmd *se_cmd) static void srpt_close_session(struct se_session *se_sess) { struct srpt_rdma_ch *ch = se_sess->fabric_sess_ptr; - struct srpt_device *sdev = ch->sport->sdev; - mutex_lock(&sdev->mutex); srpt_disconnect_ch_sync(ch); - mutex_unlock(&sdev->mutex); } /** - * srpt_sess_get_index() - Return the value of scsiAttIntrPortIndex (SCSI-MIB). + * srpt_sess_get_index - return the value of scsiAttIntrPortIndex (SCSI-MIB) + * @se_sess: SCSI target session. * * A quote from RFC 4455 (SCSI-MIB) about this MIB object: * This object represents an arbitrary integer used to uniquely identify a @@ -2866,7 +3065,7 @@ static int srpt_get_tcm_cmd_state(struct se_cmd *se_cmd) struct srpt_send_ioctx *ioctx; ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); - return srpt_get_cmd_state(ioctx); + return ioctx->state; } static int srpt_parse_guid(u64 *guid, const char *name) @@ -2883,7 +3082,7 @@ out: } /** - * srpt_parse_i_port_id() - Parse an initiator port ID. + * srpt_parse_i_port_id - parse an initiator port ID * @name: ASCII representation of a 128-bit initiator port ID. * @i_port_id: Binary 128-bit port ID. */ @@ -3064,18 +3263,24 @@ static ssize_t srpt_tpg_attrib_use_srq_store(struct config_item *item, if (val != !!val) return -EINVAL; - ret = mutex_lock_interruptible(&sdev->mutex); + ret = mutex_lock_interruptible(&sdev->sdev_mutex); if (ret < 0) return ret; + ret = mutex_lock_interruptible(&sport->mutex); + if (ret < 0) + goto unlock_sdev; enabled = sport->enabled; /* Log out all initiator systems before changing 'use_srq'. */ srpt_set_enabled(sport, false); sport->port_attrib.use_srq = val; srpt_use_srq(sdev, sport->port_attrib.use_srq); srpt_set_enabled(sport, enabled); - mutex_unlock(&sdev->mutex); + ret = count; + mutex_unlock(&sport->mutex); +unlock_sdev: + mutex_unlock(&sdev->sdev_mutex); - return count; + return ret; } CONFIGFS_ATTR(srpt_tpg_attrib_, srp_max_rdma_size); @@ -3104,7 +3309,6 @@ static ssize_t srpt_tpg_enable_store(struct config_item *item, { struct se_portal_group *se_tpg = to_tpg(item); struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); - struct srpt_device *sdev = sport->sdev; unsigned long tmp; int ret; @@ -3119,9 +3323,9 @@ static ssize_t srpt_tpg_enable_store(struct config_item *item, return -EINVAL; } - mutex_lock(&sdev->mutex); + mutex_lock(&sport->mutex); srpt_set_enabled(sport, tmp); - mutex_unlock(&sdev->mutex); + mutex_unlock(&sport->mutex); return count; } @@ -3134,8 +3338,10 @@ static struct configfs_attribute *srpt_tpg_attrs[] = { }; /** - * configfs callback invoked for - * mkdir /sys/kernel/config/target/$driver/$port/$tpg + * srpt_make_tpg - configfs callback invoked for mkdir /sys/kernel/config/target/$driver/$port/$tpg + * @wwn: Corresponds to $driver/$port. + * @group: Not used. + * @name: $tpg. */ static struct se_portal_group *srpt_make_tpg(struct se_wwn *wwn, struct config_group *group, @@ -3157,8 +3363,8 @@ static struct se_portal_group *srpt_make_tpg(struct se_wwn *wwn, } /** - * configfs callback invoked for - * rmdir /sys/kernel/config/target/$driver/$port/$tpg + * srpt_drop_tpg - configfs callback invoked for rmdir /sys/kernel/config/target/$driver/$port/$tpg + * @tpg: Target portal group to deregister. */ static void srpt_drop_tpg(struct se_portal_group *tpg) { @@ -3169,8 +3375,10 @@ static void srpt_drop_tpg(struct se_portal_group *tpg) } /** - * configfs callback invoked for - * mkdir /sys/kernel/config/target/$driver/$port + * srpt_make_tport - configfs callback invoked for mkdir /sys/kernel/config/target/$driver/$port + * @tf: Not used. + * @group: Not used. + * @name: $port. */ static struct se_wwn *srpt_make_tport(struct target_fabric_configfs *tf, struct config_group *group, @@ -3180,8 +3388,8 @@ static struct se_wwn *srpt_make_tport(struct target_fabric_configfs *tf, } /** - * configfs callback invoked for - * rmdir /sys/kernel/config/target/$driver/$port + * srpt_drop_tport - configfs callback invoked for rmdir /sys/kernel/config/target/$driver/$port + * @wwn: $port. */ static void srpt_drop_tport(struct se_wwn *wwn) { @@ -3239,7 +3447,7 @@ static const struct target_core_fabric_ops srpt_template = { }; /** - * srpt_init_module() - Kernel module initialization. + * srpt_init_module - kernel module initialization * * Note: Since ib_register_client() registers callback functions, and since at * least one of these callback functions (srpt_add_one()) calls target core diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 673387d365a3..4d9199fd00dc 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -54,6 +54,8 @@ */ #define SRP_SERVICE_NAME_PREFIX "SRP.T10:" +struct srpt_nexus; + enum { /* * SRP IOControllerProfile attributes for SRP target ports that have @@ -114,7 +116,7 @@ enum { MIN_SRPT_SQ_SIZE = 16, DEF_SRPT_SQ_SIZE = 4096, - SRPT_RQ_SIZE = 128, + MAX_SRPT_RQ_SIZE = 128, MIN_SRPT_SRQ_SIZE = 4, DEFAULT_SRPT_SRQ_SIZE = 4095, MAX_SRPT_SRQ_SIZE = 65535, @@ -134,7 +136,7 @@ enum { }; /** - * enum srpt_command_state - SCSI command state managed by SRPT. + * enum srpt_command_state - SCSI command state managed by SRPT * @SRPT_STATE_NEW: New command arrived and is being processed. * @SRPT_STATE_NEED_DATA: Processing a write or bidir command and waiting * for data arrival. @@ -158,7 +160,8 @@ enum srpt_command_state { }; /** - * struct srpt_ioctx - Shared SRPT I/O context information. + * struct srpt_ioctx - shared SRPT I/O context information + * @cqe: Completion queue element. * @buf: Pointer to the buffer. * @dma: DMA address of the buffer. * @index: Index of the I/O context in its ioctx_ring array. @@ -171,7 +174,7 @@ struct srpt_ioctx { }; /** - * struct srpt_recv_ioctx - SRPT receive I/O context. + * struct srpt_recv_ioctx - SRPT receive I/O context * @ioctx: See above. * @wait_list: Node for insertion in srpt_rdma_ch.cmd_wait_list. */ @@ -187,13 +190,20 @@ struct srpt_rw_ctx { }; /** - * struct srpt_send_ioctx - SRPT send I/O context. + * struct srpt_send_ioctx - SRPT send I/O context * @ioctx: See above. * @ch: Channel pointer. - * @spinlock: Protects 'state'. + * @s_rw_ctx: @rw_ctxs points here if only a single rw_ctx is needed. + * @rw_ctxs: RDMA read/write contexts. + * @rdma_cqe: RDMA completion queue element. + * @free_list: Node in srpt_rdma_ch.free_list. * @state: I/O context state. * @cmd: Target core command data structure. * @sense_data: SCSI sense data. + * @n_rdma: Number of work requests needed to transfer this ioctx. + * @n_rw_ctx: Size of rw_ctxs array. + * @queue_status_only: Send a SCSI status back to the initiator but no data. + * @sense_data: Sense data to be sent to the initiator. */ struct srpt_send_ioctx { struct srpt_ioctx ioctx; @@ -204,10 +214,8 @@ struct srpt_send_ioctx { struct ib_cqe rdma_cqe; struct list_head free_list; - spinlock_t spinlock; enum srpt_command_state state; struct se_cmd cmd; - struct completion tx_done; u8 n_rdma; u8 n_rw_ctx; bool queue_status_only; @@ -215,7 +223,7 @@ struct srpt_send_ioctx { }; /** - * enum rdma_ch_state - SRP channel state. + * enum rdma_ch_state - SRP channel state * @CH_CONNECTING: QP is in RTR state; waiting for RTU. * @CH_LIVE: QP is in RTS state. * @CH_DISCONNECTING: DREQ has been sent and waiting for DREP or DREQ has @@ -233,17 +241,19 @@ enum rdma_ch_state { }; /** - * struct srpt_rdma_ch - RDMA channel. - * @cm_id: IB CM ID associated with the channel. + * struct srpt_rdma_ch - RDMA channel + * @nexus: I_T nexus this channel is associated with. * @qp: IB queue pair used for communicating over this channel. + * @cm_id: IB CM ID associated with the channel. * @cq: IB completion queue for this channel. + * @zw_cqe: Zero-length write CQE. + * @rcu: RCU head. + * @kref: kref for this channel. * @rq_size: IB receive queue size. - * @rsp_size IB response message size in bytes. + * @max_rsp_size: Maximum size of an RSP response message in bytes. * @sq_wr_avail: number of work requests available in the send queue. * @sport: pointer to the information of the HCA port used by this * channel. - * @i_port_id: 128-bit initiator port identifier copied from SRP_LOGIN_REQ. - * @t_port_id: 128-bit target port identifier copied from SRP_LOGIN_REQ. * @max_ti_iu_len: maximum target-to-initiator information unit length. * @req_lim: request limit: maximum number of requests that may be sent * by the initiator without having received a response. @@ -251,30 +261,34 @@ enum rdma_ch_state { * @spinlock: Protects free_list and state. * @free_list: Head of list with free send I/O contexts. * @state: channel state. See also enum rdma_ch_state. + * @processing_wait_list: Whether or not cmd_wait_list is being processed. * @ioctx_ring: Send ring. * @ioctx_recv_ring: Receive I/O context ring. - * @list: Node for insertion in the srpt_device.rch_list list. + * @list: Node in srpt_nexus.ch_list. * @cmd_wait_list: List of SCSI commands that arrived before the RTU event. This * list contains struct srpt_ioctx elements and is protected * against concurrent modification by the cm_id spinlock. + * @pkey: P_Key of the IB partition for this SRP channel. * @sess: Session information associated with this SRP channel. * @sess_name: Session name. - * @ini_guid: Initiator port GUID. * @release_work: Allows scheduling of srpt_release_channel(). - * @release_done: Enables waiting for srpt_release_channel() completion. */ struct srpt_rdma_ch { - struct ib_cm_id *cm_id; + struct srpt_nexus *nexus; struct ib_qp *qp; + union { + struct { + struct ib_cm_id *cm_id; + } ib_cm; + }; struct ib_cq *cq; struct ib_cqe zw_cqe; + struct rcu_head rcu; struct kref kref; int rq_size; - u32 rsp_size; + u32 max_rsp_size; atomic_t sq_wr_avail; struct srpt_port *sport; - u8 i_port_id[16]; - u8 t_port_id[16]; int max_ti_iu_len; atomic_t req_lim; atomic_t req_lim_delta; @@ -285,15 +299,31 @@ struct srpt_rdma_ch { struct srpt_recv_ioctx **ioctx_recv_ring; struct list_head list; struct list_head cmd_wait_list; + uint16_t pkey; + bool processing_wait_list; struct se_session *sess; - u8 sess_name[36]; - u8 ini_guid[24]; + u8 sess_name[24]; struct work_struct release_work; - struct completion *release_done; }; /** - * struct srpt_port_attib - Attributes for SRPT port + * struct srpt_nexus - I_T nexus + * @rcu: RCU head for this data structure. + * @entry: srpt_port.nexus_list list node. + * @ch_list: struct srpt_rdma_ch list. Protected by srpt_port.mutex. + * @i_port_id: 128-bit initiator port identifier copied from SRP_LOGIN_REQ. + * @t_port_id: 128-bit target port identifier copied from SRP_LOGIN_REQ. + */ +struct srpt_nexus { + struct rcu_head rcu; + struct list_head entry; + struct list_head ch_list; + u8 i_port_id[16]; + u8 t_port_id[16]; +}; + +/** + * struct srpt_port_attib - attributes for SRPT port * @srp_max_rdma_size: Maximum size of SRP RDMA transfers for new connections. * @srp_max_rsp_size: Maximum size of SRP response messages in bytes. * @srp_sq_size: Shared receive queue (SRQ) size. @@ -307,7 +337,7 @@ struct srpt_port_attrib { }; /** - * struct srpt_port - Information associated by SRPT with a single IB port. + * struct srpt_port - information associated by SRPT with a single IB port * @sdev: backpointer to the HCA information. * @mad_agent: per-port management datagram processing information. * @enabled: Whether or not this target port is enabled. @@ -323,7 +353,10 @@ struct srpt_port_attrib { * @port_guid_wwn: WWN associated with target port GUID. * @port_gid_tpg: TPG associated with target port GID. * @port_gid_wwn: WWN associated with target port GID. - * @port_acl_list: Head of the list with all node ACLs for this port. + * @port_attrib: Port attributes that can be accessed through configfs. + * @ch_releaseQ: Enables waiting for removal from nexus_list. + * @mutex: Protects nexus_list. + * @nexus_list: Nexus list. See also srpt_nexus.entry. */ struct srpt_port { struct srpt_device *sdev; @@ -341,21 +374,22 @@ struct srpt_port { struct se_portal_group port_gid_tpg; struct se_wwn port_gid_wwn; struct srpt_port_attrib port_attrib; + wait_queue_head_t ch_releaseQ; + struct mutex mutex; + struct list_head nexus_list; }; /** - * struct srpt_device - Information associated by SRPT with a single HCA. + * struct srpt_device - information associated by SRPT with a single HCA * @device: Backpointer to the struct ib_device managed by the IB core. * @pd: IB protection domain. * @lkey: L_Key (local key) with write access to all local memory. * @srq: Per-HCA SRQ (shared receive queue). * @cm_id: Connection identifier. * @srq_size: SRQ size. + * @sdev_mutex: Serializes use_srq changes. * @use_srq: Whether or not to use SRQ. * @ioctx_ring: Per-HCA SRQ. - * @rch_list: Per-device channel list -- see also srpt_rdma_ch.list. - * @ch_releaseQ: Enables waiting for removal from rch_list. - * @mutex: Protects rch_list. * @port: Information about the ports owned by this HCA. * @event_handler: Per-HCA asynchronous IB event handler. * @list: Node in srpt_dev_list. @@ -367,11 +401,9 @@ struct srpt_device { struct ib_srq *srq; struct ib_cm_id *cm_id; int srq_size; + struct mutex sdev_mutex; bool use_srq; struct srpt_recv_ioctx **ioctx_ring; - struct list_head rch_list; - wait_queue_head_t ch_releaseQ; - struct mutex mutex; struct srpt_port port[2]; struct ib_event_handler event_handler; struct list_head list; diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 769598f7b6c8..3aaf4bad6c5a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -287,6 +287,9 @@ void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt) u64 in_param = 0; int err; + if (!cnt) + return; + if (mlx4_is_mfunc(dev)) { set_param_l(&in_param, base_qpn); set_param_h(&in_param, cnt); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index e7e7cef2bde4..14d57828945d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -417,7 +417,11 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr) cqn = be32_to_cpu(eqe->data.comp.cqn) & 0xffffff; mlx5_cq_completion(dev, cqn); break; - + case MLX5_EVENT_TYPE_DCT_DRAINED: + rsn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff; + rsn |= (MLX5_RES_DCT << MLX5_USER_INDEX_LEN); + mlx5_rsc_event(dev, rsn, eqe->type); + break; case MLX5_EVENT_TYPE_PATH_MIG: case MLX5_EVENT_TYPE_COMM_EST: case MLX5_EVENT_TYPE_SQ_DRAINED: @@ -715,6 +719,9 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) if (MLX5_CAP_GEN(dev, fpga)) async_event_mask |= (1ull << MLX5_EVENT_TYPE_FPGA_ERROR); + if (MLX5_CAP_GEN_MAX(dev, dct)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_DCT_DRAINED); + err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD, MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c index c4392f741c5f..e6175f8ac0e4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -688,7 +688,7 @@ static inline int mlx5_fpga_conn_init_qp(struct mlx5_fpga_conn *conn) MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC); MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); MLX5_SET(qpc, qpc, primary_address_path.pkey_index, MLX5_FPGA_PKEY_INDEX); - MLX5_SET(qpc, qpc, primary_address_path.port, MLX5_FPGA_PORT_NUM); + MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, MLX5_FPGA_PORT_NUM); MLX5_SET(qpc, qpc, pd, conn->fdev->conn_res.pdn); MLX5_SET(qpc, qpc, cqn_snd, conn->cq.mcq.cqn); MLX5_SET(qpc, qpc, cqn_rcv, conn->cq.mcq.cqn); @@ -727,7 +727,7 @@ static inline int mlx5_fpga_conn_rtr_qp(struct mlx5_fpga_conn *conn) MLX5_SET(qpc, qpc, next_rcv_psn, MLX5_GET(fpga_qpc, conn->fpga_qpc, next_send_psn)); MLX5_SET(qpc, qpc, primary_address_path.pkey_index, MLX5_FPGA_PKEY_INDEX); - MLX5_SET(qpc, qpc, primary_address_path.port, MLX5_FPGA_PORT_NUM); + MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, MLX5_FPGA_PORT_NUM); ether_addr_copy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32), MLX5_ADDR_OF(fpga_qpc, conn->fpga_qpc, fpga_mac_47_32)); MLX5_SET(qpc, qpc, primary_address_path.udp_sport, @@ -888,7 +888,8 @@ struct mlx5_fpga_conn *mlx5_fpga_conn_create(struct mlx5_fpga_device *fdev, err = mlx5_core_roce_gid_set(fdev->mdev, conn->qp.sgid_index, MLX5_ROCE_VERSION_2, MLX5_ROCE_L3_TYPE_IPV6, - remote_ip, remote_mac, true, 0); + remote_ip, remote_mac, true, 0, + MLX5_FPGA_PORT_NUM); if (err) { mlx5_fpga_err(fdev, "Failed to set SGID: %d\n", err); ret = ERR_PTR(err); @@ -954,7 +955,7 @@ err_cq: mlx5_fpga_conn_destroy_cq(conn); err_gid: mlx5_core_roce_gid_set(fdev->mdev, conn->qp.sgid_index, 0, 0, NULL, - NULL, false, 0); + NULL, false, 0, MLX5_FPGA_PORT_NUM); err_rsvd_gid: mlx5_core_reserved_gid_free(fdev->mdev, conn->qp.sgid_index); err: @@ -982,7 +983,7 @@ void mlx5_fpga_conn_destroy(struct mlx5_fpga_conn *conn) mlx5_fpga_conn_destroy_cq(conn); mlx5_core_roce_gid_set(conn->fdev->mdev, conn->qp.sgid_index, 0, 0, - NULL, NULL, false, 0); + NULL, NULL, false, 0, MLX5_FPGA_PORT_NUM); mlx5_core_reserved_gid_free(conn->fdev->mdev, conn->qp.sgid_index); kfree(conn); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 5ef1b56b6a96..9d11e92fb541 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -195,12 +195,20 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) return 0; } -int mlx5_cmd_init_hca(struct mlx5_core_dev *dev) +int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id) { u32 out[MLX5_ST_SZ_DW(init_hca_out)] = {0}; u32 in[MLX5_ST_SZ_DW(init_hca_in)] = {0}; + int i; MLX5_SET(init_hca_in, in, opcode, MLX5_CMD_OP_INIT_HCA); + + if (MLX5_CAP_GEN(dev, sw_owner_id)) { + for (i = 0; i < 4; i++) + MLX5_ARRAY_SET(init_hca_in, in, sw_owner_id, i, + sw_owner_id[i]); + } + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index ee2f378c5030..a281d95ce17c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -189,7 +189,7 @@ int mlx5i_create_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp MLX5_QP_ENHANCED_ULP_STATELESS_MODE); addr_path = MLX5_ADDR_OF(qpc, qpc, primary_address_path); - MLX5_SET(ads, addr_path, port, 1); + MLX5_SET(ads, addr_path, vhca_port_num, 1); MLX5_SET(ads, addr_path, grh, 1); ret = mlx5_core_create_qp(mdev, qp, in, inlen); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index 5701f125e99c..e159243e0fcf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -31,6 +31,8 @@ */ #include <linux/clocksource.h> +#include <linux/highmem.h> +#include <rdma/mlx5-abi.h> #include "en.h" enum { @@ -71,6 +73,28 @@ static u64 read_internal_timer(const struct cyclecounter *cc) return mlx5_read_internal_timer(mdev) & cc->mask; } +static void mlx5_update_clock_info_page(struct mlx5_core_dev *mdev) +{ + struct mlx5_ib_clock_info *clock_info = mdev->clock_info; + struct mlx5_clock *clock = &mdev->clock; + u32 sign; + + if (!clock_info) + return; + + sign = smp_load_acquire(&clock_info->sign); + smp_store_mb(clock_info->sign, + sign | MLX5_IB_CLOCK_INFO_KERNEL_UPDATING); + + clock_info->cycles = clock->tc.cycle_last; + clock_info->mult = clock->cycles.mult; + clock_info->nsec = clock->tc.nsec; + clock_info->frac = clock->tc.frac; + + smp_store_release(&clock_info->sign, + sign + MLX5_IB_CLOCK_INFO_KERNEL_UPDATING * 2); +} + static void mlx5_pps_out(struct work_struct *work) { struct mlx5_pps *pps_info = container_of(work, struct mlx5_pps, @@ -109,6 +133,7 @@ static void mlx5_timestamp_overflow(struct work_struct *work) write_lock_irqsave(&clock->lock, flags); timecounter_read(&clock->tc); + mlx5_update_clock_info_page(clock->mdev); write_unlock_irqrestore(&clock->lock, flags); schedule_delayed_work(&clock->overflow_work, clock->overflow_period); } @@ -123,6 +148,7 @@ static int mlx5_ptp_settime(struct ptp_clock_info *ptp, write_lock_irqsave(&clock->lock, flags); timecounter_init(&clock->tc, &clock->cycles, ns); + mlx5_update_clock_info_page(clock->mdev); write_unlock_irqrestore(&clock->lock, flags); return 0; @@ -152,6 +178,7 @@ static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) write_lock_irqsave(&clock->lock, flags); timecounter_adjtime(&clock->tc, delta); + mlx5_update_clock_info_page(clock->mdev); write_unlock_irqrestore(&clock->lock, flags); return 0; @@ -179,6 +206,7 @@ static int mlx5_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta) timecounter_read(&clock->tc); clock->cycles.mult = neg_adj ? clock->nominal_c_mult - diff : clock->nominal_c_mult + diff; + mlx5_update_clock_info_page(clock->mdev); write_unlock_irqrestore(&clock->lock, flags); return 0; @@ -474,6 +502,7 @@ void mlx5_init_clock(struct mlx5_core_dev *mdev) clock->cycles.shift); clock->nominal_c_mult = clock->cycles.mult; clock->cycles.mask = CLOCKSOURCE_MASK(41); + clock->mdev = mdev; timecounter_init(&clock->tc, &clock->cycles, ktime_to_ns(ktime_get_real())); @@ -486,6 +515,25 @@ void mlx5_init_clock(struct mlx5_core_dev *mdev) do_div(ns, NSEC_PER_SEC / 2 / HZ); clock->overflow_period = ns; + mdev->clock_info_page = alloc_page(GFP_KERNEL); + if (mdev->clock_info_page) { + mdev->clock_info = kmap(mdev->clock_info_page); + if (!mdev->clock_info) { + __free_page(mdev->clock_info_page); + mlx5_core_warn(mdev, "failed to map clock page\n"); + } else { + mdev->clock_info->sign = 0; + mdev->clock_info->nsec = clock->tc.nsec; + mdev->clock_info->cycles = clock->tc.cycle_last; + mdev->clock_info->mask = clock->cycles.mask; + mdev->clock_info->mult = clock->nominal_c_mult; + mdev->clock_info->shift = clock->cycles.shift; + mdev->clock_info->frac = clock->tc.frac; + mdev->clock_info->overflow_period = + clock->overflow_period; + } + } + INIT_WORK(&clock->pps_info.out_work, mlx5_pps_out); INIT_DELAYED_WORK(&clock->overflow_work, mlx5_timestamp_overflow); if (clock->overflow_period) @@ -525,5 +573,12 @@ void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) cancel_work_sync(&clock->pps_info.out_work); cancel_delayed_work_sync(&clock->overflow_work); + + if (mdev->clock_info) { + kunmap(mdev->clock_info_page); + __free_page(mdev->clock_info_page); + mdev->clock_info = NULL; + } + kfree(clock->ptp_info.pin_config); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c index 573f59f46d41..7722a3f9bb68 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c @@ -121,7 +121,7 @@ EXPORT_SYMBOL_GPL(mlx5_core_reserved_gids_count); int mlx5_core_roce_gid_set(struct mlx5_core_dev *dev, unsigned int index, u8 roce_version, u8 roce_l3_type, const u8 *gid, - const u8 *mac, bool vlan, u16 vlan_id) + const u8 *mac, bool vlan, u16 vlan_id, u8 port_num) { #define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v) u32 in[MLX5_ST_SZ_DW(set_roce_address_in)] = {0}; @@ -148,6 +148,9 @@ int mlx5_core_roce_gid_set(struct mlx5_core_dev *dev, unsigned int index, memcpy(addr_l3_addr, gid, gidsz); } + if (MLX5_CAP_GEN(dev, num_vhca_ports) > 0) + MLX5_SET(set_roce_address_in, in, vhca_port_num, port_num); + MLX5_SET(set_roce_address_in, in, roce_address_index, index); MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS); return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 0f88fd30a09a..2ef641c91c26 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -75,6 +75,8 @@ static unsigned int prof_sel = MLX5_DEFAULT_PROF; module_param_named(prof_sel, prof_sel, uint, 0444); MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2"); +static u32 sw_owner_id[4]; + enum { MLX5_ATOMIC_REQ_MODE_BE = 0x0, MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS = 0x1, @@ -551,6 +553,15 @@ static int handle_hca_cap(struct mlx5_core_dev *dev) cache_line_128byte, cache_line_size() == 128 ? 1 : 0); + if (MLX5_CAP_GEN_MAX(dev, dct)) + MLX5_SET(cmd_hca_cap, set_hca_cap, dct, 1); + + if (MLX5_CAP_GEN_MAX(dev, num_vhca_ports)) + MLX5_SET(cmd_hca_cap, + set_hca_cap, + num_vhca_ports, + MLX5_CAP_GEN_MAX(dev, num_vhca_ports)); + err = set_caps(dev, set_ctx, set_sz, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); @@ -1107,7 +1118,7 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, goto reclaim_boot_pages; } - err = mlx5_cmd_init_hca(dev); + err = mlx5_cmd_init_hca(dev, sw_owner_id); if (err) { dev_err(&pdev->dev, "init hca failed\n"); goto err_pagealloc_stop; @@ -1643,6 +1654,8 @@ static int __init init(void) { int err; + get_random_bytes(&sw_owner_id, sizeof(sw_owner_id)); + mlx5_core_verify_params(); mlx5_register_debugfs(); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index ff4a0b889a6f..b05868728da7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -86,7 +86,7 @@ enum { int mlx5_query_hca_caps(struct mlx5_core_dev *dev); int mlx5_query_board_id(struct mlx5_core_dev *dev); -int mlx5_cmd_init_hca(struct mlx5_core_dev *dev); +int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id); int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev); int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev); void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index 889130edb715..02d6c5b5d502 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -98,6 +98,11 @@ static u64 sq_allowed_event_types(void) return BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR); } +static u64 dct_allowed_event_types(void) +{ + return BIT(MLX5_EVENT_TYPE_DCT_DRAINED); +} + static bool is_event_type_allowed(int rsc_type, int event_type) { switch (rsc_type) { @@ -107,6 +112,8 @@ static bool is_event_type_allowed(int rsc_type, int event_type) return BIT(event_type) & rq_allowed_event_types(); case MLX5_EVENT_QUEUE_TYPE_SQ: return BIT(event_type) & sq_allowed_event_types(); + case MLX5_EVENT_QUEUE_TYPE_DCT: + return BIT(event_type) & dct_allowed_event_types(); default: WARN(1, "Event arrived for unknown resource type"); return false; @@ -116,6 +123,7 @@ static bool is_event_type_allowed(int rsc_type, int event_type) void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type) { struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, rsn); + struct mlx5_core_dct *dct; struct mlx5_core_qp *qp; if (!common) @@ -134,7 +142,11 @@ void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type) qp = (struct mlx5_core_qp *)common; qp->event(qp, event_type); break; - + case MLX5_RES_DCT: + dct = (struct mlx5_core_dct *)common; + if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED) + complete(&dct->drained); + break; default: mlx5_core_warn(dev, "invalid resource type for 0x%x\n", rsn); } @@ -142,9 +154,9 @@ void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type) mlx5_core_put_rsc(common); } -static int create_qprqsq_common(struct mlx5_core_dev *dev, - struct mlx5_core_qp *qp, - int rsc_type) +static int create_resource_common(struct mlx5_core_dev *dev, + struct mlx5_core_qp *qp, + int rsc_type) { struct mlx5_qp_table *table = &dev->priv.qp_table; int err; @@ -165,8 +177,8 @@ static int create_qprqsq_common(struct mlx5_core_dev *dev, return 0; } -static void destroy_qprqsq_common(struct mlx5_core_dev *dev, - struct mlx5_core_qp *qp) +static void destroy_resource_common(struct mlx5_core_dev *dev, + struct mlx5_core_qp *qp) { struct mlx5_qp_table *table = &dev->priv.qp_table; unsigned long flags; @@ -179,6 +191,40 @@ static void destroy_qprqsq_common(struct mlx5_core_dev *dev, wait_for_completion(&qp->common.free); } +int mlx5_core_create_dct(struct mlx5_core_dev *dev, + struct mlx5_core_dct *dct, + u32 *in, int inlen) +{ + u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {0}; + u32 din[MLX5_ST_SZ_DW(destroy_dct_in)] = {0}; + u32 dout[MLX5_ST_SZ_DW(destroy_dct_out)] = {0}; + struct mlx5_core_qp *qp = &dct->mqp; + int err; + + init_completion(&dct->drained); + MLX5_SET(create_dct_in, in, opcode, MLX5_CMD_OP_CREATE_DCT); + + err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); + if (err) { + mlx5_core_warn(dev, "create DCT failed, ret %d\n", err); + return err; + } + + qp->qpn = MLX5_GET(create_dct_out, out, dctn); + err = create_resource_common(dev, qp, MLX5_RES_DCT); + if (err) + goto err_cmd; + + return 0; +err_cmd: + MLX5_SET(destroy_dct_in, din, opcode, MLX5_CMD_OP_DESTROY_DCT); + MLX5_SET(destroy_dct_in, din, dctn, qp->qpn); + mlx5_cmd_exec(dev, (void *)&in, sizeof(din), + (void *)&out, sizeof(dout)); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_create_dct); + int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, u32 *in, int inlen) @@ -197,7 +243,7 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev, qp->qpn = MLX5_GET(create_qp_out, out, qpn); mlx5_core_dbg(dev, "qpn = 0x%x\n", qp->qpn); - err = create_qprqsq_common(dev, qp, MLX5_RES_QP); + err = create_resource_common(dev, qp, MLX5_RES_QP); if (err) goto err_cmd; @@ -220,6 +266,47 @@ err_cmd: } EXPORT_SYMBOL_GPL(mlx5_core_create_qp); +static int mlx5_core_drain_dct(struct mlx5_core_dev *dev, + struct mlx5_core_dct *dct) +{ + u32 out[MLX5_ST_SZ_DW(drain_dct_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(drain_dct_in)] = {0}; + struct mlx5_core_qp *qp = &dct->mqp; + + MLX5_SET(drain_dct_in, in, opcode, MLX5_CMD_OP_DRAIN_DCT); + MLX5_SET(drain_dct_in, in, dctn, qp->qpn); + return mlx5_cmd_exec(dev, (void *)&in, sizeof(in), + (void *)&out, sizeof(out)); +} + +int mlx5_core_destroy_dct(struct mlx5_core_dev *dev, + struct mlx5_core_dct *dct) +{ + u32 out[MLX5_ST_SZ_DW(destroy_dct_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_dct_in)] = {0}; + struct mlx5_core_qp *qp = &dct->mqp; + int err; + + err = mlx5_core_drain_dct(dev, dct); + if (err) { + if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + goto destroy; + } else { + mlx5_core_warn(dev, "failed drain DCT 0x%x with error 0x%x\n", qp->qpn, err); + return err; + } + } + wait_for_completion(&dct->drained); +destroy: + destroy_resource_common(dev, &dct->mqp); + MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT); + MLX5_SET(destroy_dct_in, in, dctn, qp->qpn); + err = mlx5_cmd_exec(dev, (void *)&in, sizeof(in), + (void *)&out, sizeof(out)); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_destroy_dct); + int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp) { @@ -229,7 +316,7 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, mlx5_debug_qp_remove(dev, qp); - destroy_qprqsq_common(dev, qp); + destroy_resource_common(dev, qp); MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); @@ -405,6 +492,20 @@ int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, } EXPORT_SYMBOL_GPL(mlx5_core_qp_query); +int mlx5_core_dct_query(struct mlx5_core_dev *dev, struct mlx5_core_dct *dct, + u32 *out, int outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_dct_in)] = {0}; + struct mlx5_core_qp *qp = &dct->mqp; + + MLX5_SET(query_dct_in, in, opcode, MLX5_CMD_OP_QUERY_DCT); + MLX5_SET(query_dct_in, in, dctn, qp->qpn); + + return mlx5_cmd_exec(dev, (void *)&in, sizeof(in), + (void *)out, outlen); +} +EXPORT_SYMBOL_GPL(mlx5_core_dct_query); + int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn) { u32 out[MLX5_ST_SZ_DW(alloc_xrcd_out)] = {0}; @@ -441,7 +542,7 @@ int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, return err; rq->qpn = rqn; - err = create_qprqsq_common(dev, rq, MLX5_RES_RQ); + err = create_resource_common(dev, rq, MLX5_RES_RQ); if (err) goto err_destroy_rq; @@ -457,7 +558,7 @@ EXPORT_SYMBOL(mlx5_core_create_rq_tracked); void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev, struct mlx5_core_qp *rq) { - destroy_qprqsq_common(dev, rq); + destroy_resource_common(dev, rq); mlx5_core_destroy_rq(dev, rq->qpn); } EXPORT_SYMBOL(mlx5_core_destroy_rq_tracked); @@ -473,7 +574,7 @@ int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, return err; sq->qpn = sqn; - err = create_qprqsq_common(dev, sq, MLX5_RES_SQ); + err = create_resource_common(dev, sq, MLX5_RES_SQ); if (err) goto err_destroy_sq; @@ -489,7 +590,7 @@ EXPORT_SYMBOL(mlx5_core_create_sq_tracked); void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev, struct mlx5_core_qp *sq) { - destroy_qprqsq_common(dev, sq); + destroy_resource_common(dev, sq); mlx5_core_destroy_sq(dev, sq->qpn); } EXPORT_SYMBOL(mlx5_core_destroy_sq_tracked); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index a1296a62497d..dfe36cf6fbea 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -36,6 +36,9 @@ #include <linux/mlx5/vport.h> #include "mlx5_core.h" +/* Mutex to hold while enabling or disabling RoCE */ +static DEFINE_MUTEX(mlx5_roce_en_lock); + static int _mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u32 *out, int outlen) { @@ -998,17 +1001,35 @@ static int mlx5_nic_vport_update_roce_state(struct mlx5_core_dev *mdev, int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev) { - if (atomic_inc_return(&mdev->roce.roce_en) != 1) - return 0; - return mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_ENABLED); + int err = 0; + + mutex_lock(&mlx5_roce_en_lock); + if (!mdev->roce.roce_en) + err = mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_ENABLED); + + if (!err) + mdev->roce.roce_en++; + mutex_unlock(&mlx5_roce_en_lock); + + return err; } EXPORT_SYMBOL_GPL(mlx5_nic_vport_enable_roce); int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev) { - if (atomic_dec_return(&mdev->roce.roce_en) != 0) - return 0; - return mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_DISABLED); + int err = 0; + + mutex_lock(&mlx5_roce_en_lock); + if (mdev->roce.roce_en) { + mdev->roce.roce_en--; + if (mdev->roce.roce_en == 0) + err = mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_DISABLED); + + if (err) + mdev->roce.roce_en++; + } + mutex_unlock(&mlx5_roce_en_lock); + return err; } EXPORT_SYMBOL_GPL(mlx5_nic_vport_disable_roce); @@ -1110,3 +1131,61 @@ ex: return err; } EXPORT_SYMBOL_GPL(mlx5_core_modify_hca_vport_context); + +int mlx5_nic_vport_affiliate_multiport(struct mlx5_core_dev *master_mdev, + struct mlx5_core_dev *port_mdev) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + err = mlx5_nic_vport_enable_roce(port_mdev); + if (err) + goto free; + + MLX5_SET(modify_nic_vport_context_in, in, field_select.affiliation, 1); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.affiliated_vhca_id, + MLX5_CAP_GEN(master_mdev, vhca_id)); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.affiliation_criteria, + MLX5_CAP_GEN(port_mdev, affiliate_nic_vport_criteria)); + + err = mlx5_modify_nic_vport_context(port_mdev, in, inlen); + if (err) + mlx5_nic_vport_disable_roce(port_mdev); + +free: + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_affiliate_multiport); + +int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, field_select.affiliation, 1); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.affiliated_vhca_id, 0); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.affiliation_criteria, 0); + + err = mlx5_modify_nic_vport_context(port_mdev, in, inlen); + if (!err) + mlx5_nic_vport_disable_roce(port_mdev); + + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_unaffiliate_multiport); |