From 7eac52648a4c24ad23a05f62db97867c92a5747b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 7 Feb 2020 19:11:12 -0500 Subject: SUNRPC: Add a flag to avoid reference counts on credentials Add a flag to signal to the RPC layer that the credential is already pinned for the duration of the RPC call. Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 5 +++-- net/sunrpc/sched.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 7324b21f923e..2345e563c2f4 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1099,8 +1099,9 @@ rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg) task->tk_msg.rpc_proc = msg->rpc_proc; task->tk_msg.rpc_argp = msg->rpc_argp; task->tk_msg.rpc_resp = msg->rpc_resp; - if (msg->rpc_cred != NULL) - task->tk_msg.rpc_cred = get_cred(msg->rpc_cred); + task->tk_msg.rpc_cred = msg->rpc_cred; + if (!(task->tk_flags & RPC_TASK_CRED_NOREF)) + get_cred(task->tk_msg.rpc_cred); } } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 55e900255b0c..6eff14119a88 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -1162,7 +1162,8 @@ static void rpc_release_resources_task(struct rpc_task *task) { xprt_release(task); if (task->tk_msg.rpc_cred) { - put_cred(task->tk_msg.rpc_cred); + if (!(task->tk_flags & RPC_TASK_CRED_NOREF)) + put_cred(task->tk_msg.rpc_cred); task->tk_msg.rpc_cred = NULL; } rpc_task_release_client(task); -- cgit v1.2.3 From 263fb9c21e7a86e8e373a23edfa82700422f5e6a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 7 Feb 2020 19:16:34 -0500 Subject: SUNRPC: Don't take a reference to the cred on synchronous tasks If the RPC call is synchronous, assume the cred is already pinned by the caller. Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/sunrpc') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2345e563c2f4..252b044cdcdf 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1127,6 +1127,9 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) task = rpc_new_task(task_setup_data); + if (!RPC_IS_ASYNC(task)) + task->tk_flags |= RPC_TASK_CRED_NOREF; + rpc_task_set_client(task, task_setup_data->rpc_client); rpc_task_set_rpc_message(task, task_setup_data->rpc_message); -- cgit v1.2.3 From 68e9a2463d08f870abf8edfe9125b7738f7a63ce Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 28 Feb 2020 23:44:14 +0000 Subject: SUNRPC: remove redundant assignments to variable status The variable status is being initialized with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index d86c664ea6af..c9e039961e29 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1970,7 +1970,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport) struct rpc_xprt *xprt = &transport->xprt; struct file *filp; struct socket *sock; - int status = -EIO; + int status; status = __sock_create(xprt->xprt_net, AF_LOCAL, SOCK_STREAM, 0, &sock, 1); -- cgit v1.2.3 From 4047aa909c4a40fceebc36fff708d465a4d3c6e2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Mar 2020 11:21:07 -0400 Subject: sunrpc: Fix gss_unwrap_resp_integ() again xdr_buf_read_mic() tries to find unused contiguous space in a received xdr_buf in order to linearize the checksum for the call to gss_verify_mic. However, the corner cases in this code are numerous and we seem to keep missing them. I've just hit yet another buffer overrun related to it. This overrun is at the end of xdr_buf_read_mic(): 1284 if (buf->tail[0].iov_len != 0) 1285 mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len; 1286 else 1287 mic->data = buf->head[0].iov_base + buf->head[0].iov_len; 1288 __read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len); 1289 return 0; This logic assumes the transport has set the length of the tail based on the size of the received message. base + len is then supposed to be off the end of the message but still within the actual buffer. In fact, the length of the tail is set by the upper layer when the Call is encoded so that the end of the tail is actually the end of the allocated buffer itself. This causes the logic above to set mic->data to point past the end of the receive buffer. The "mic->data = head" arm of this if statement is no less fragile. As near as I can tell, this has been a problem forever. I'm not sure that minimizing au_rslack recently changed this pathology much. So instead, let's use a more straightforward approach: kmalloc a separate buffer to linearize the checksum. This is similar to how gss_validate() currently works. Coming back to this code, I had some trouble understanding what was going on. So I've cleaned up the variable naming and added a few comments that point back to the XDR definition in RFC 2203 to help guide future spelunkers, including myself. As an added clean up, the functionality that was in xdr_buf_read_mic() is folded directly into gss_unwrap_resp_integ(), as that is its only caller. Signed-off-by: Chuck Lever Reviewed-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 77 +++++++++++++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 19 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 24ca861815b1..98b2c8bc8f40 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1934,35 +1934,69 @@ gss_unwrap_resp_auth(struct rpc_cred *cred) return 0; } +/* + * RFC 2203, Section 5.3.2.2 + * + * struct rpc_gss_integ_data { + * opaque databody_integ<>; + * opaque checksum<>; + * }; + * + * struct rpc_gss_data_t { + * unsigned int seq_num; + * proc_req_arg_t arg; + * }; + */ static int gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp, struct xdr_stream *xdr) { - struct xdr_buf integ_buf, *rcv_buf = &rqstp->rq_rcv_buf; - u32 data_offset, mic_offset, integ_len, maj_stat; + struct xdr_buf gss_data, *rcv_buf = &rqstp->rq_rcv_buf; struct rpc_auth *auth = cred->cr_auth; + u32 len, offset, seqno, maj_stat; struct xdr_netobj mic; - __be32 *p; + int ret; - p = xdr_inline_decode(xdr, 2 * sizeof(*p)); - if (unlikely(!p)) + ret = -EIO; + mic.data = NULL; + + /* opaque databody_integ<>; */ + if (xdr_stream_decode_u32(xdr, &len)) goto unwrap_failed; - integ_len = be32_to_cpup(p++); - if (integ_len & 3) + if (len & 3) goto unwrap_failed; - data_offset = (u8 *)(p) - (u8 *)rcv_buf->head[0].iov_base; - mic_offset = integ_len + data_offset; - if (mic_offset > rcv_buf->len) + offset = rcv_buf->len - xdr_stream_remaining(xdr); + if (xdr_stream_decode_u32(xdr, &seqno)) goto unwrap_failed; - if (be32_to_cpup(p) != rqstp->rq_seqno) + if (seqno != rqstp->rq_seqno) goto bad_seqno; + if (xdr_buf_subsegment(rcv_buf, &gss_data, offset, len)) + goto unwrap_failed; - if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len)) + /* + * The xdr_stream now points to the beginning of the + * upper layer payload, to be passed below to + * rpcauth_unwrap_resp_decode(). The checksum, which + * follows the upper layer payload in @rcv_buf, is + * located and parsed without updating the xdr_stream. + */ + + /* opaque checksum<>; */ + offset += len; + if (xdr_decode_word(rcv_buf, offset, &len)) + goto unwrap_failed; + offset += sizeof(__be32); + if (offset + len > rcv_buf->len) goto unwrap_failed; - if (xdr_buf_read_mic(rcv_buf, &mic, mic_offset)) + mic.len = len; + mic.data = kmalloc(len, GFP_NOFS); + if (!mic.data) + goto unwrap_failed; + if (read_bytes_from_xdr_buf(rcv_buf, offset, mic.data, mic.len)) goto unwrap_failed; - maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic); + + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &gss_data, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat != GSS_S_COMPLETE) @@ -1970,16 +2004,21 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, auth->au_rslack = auth->au_verfsize + 2 + 1 + XDR_QUADLEN(mic.len); auth->au_ralign = auth->au_verfsize + 2; - return 0; + ret = 0; + +out: + kfree(mic.data); + return ret; + unwrap_failed: trace_rpcgss_unwrap_failed(task); - return -EIO; + goto out; bad_seqno: - trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(p)); - return -EIO; + trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, seqno); + goto out; bad_mic: trace_rpcgss_verify_mic(task, maj_stat); - return -EIO; + goto out; } static int -- cgit v1.2.3 From 8d6bda7f23a9b3ef1d7e386f01924c37f18fe771 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Mar 2020 11:21:12 -0400 Subject: SUNRPC: Remove xdr_buf_read_mic() Clean up: this function is no longer used. Signed-off-by: Chuck Lever Reviewed-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 1 - net/sunrpc/xdr.c | 55 ---------------------------------------------- 2 files changed, 56 deletions(-) (limited to 'net/sunrpc') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index b41f34977995..8a6dd5bd6748 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -184,7 +184,6 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p) extern void xdr_shift_buf(struct xdr_buf *, size_t); extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *); extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int); -extern int xdr_buf_read_mic(struct xdr_buf *, struct xdr_netobj *, unsigned int); extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index e5497dc2475b..15b58c5144f9 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1235,61 +1235,6 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) } EXPORT_SYMBOL_GPL(xdr_encode_word); -/** - * xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf - * @buf: pointer to buffer containing a mic - * @mic: on success, returns the address of the mic - * @offset: the offset in buf where mic may be found - * - * This function may modify the xdr buf if the mic is found to be straddling - * a boundary between head, pages, and tail. On success the mic can be read - * from the address returned. There is no need to free the mic. - * - * Return: Success returns 0, otherwise an integer error. - */ -int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset) -{ - struct xdr_buf subbuf; - unsigned int boundary; - - if (xdr_decode_word(buf, offset, &mic->len)) - return -EFAULT; - offset += 4; - - /* Is the mic partially in the head? */ - boundary = buf->head[0].iov_len; - if (offset < boundary && (offset + mic->len) > boundary) - xdr_shift_buf(buf, boundary - offset); - - /* Is the mic partially in the pages? */ - boundary += buf->page_len; - if (offset < boundary && (offset + mic->len) > boundary) - xdr_shrink_pagelen(buf, boundary - offset); - - if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len)) - return -EFAULT; - - /* Is the mic contained entirely in the head? */ - mic->data = subbuf.head[0].iov_base; - if (subbuf.head[0].iov_len == mic->len) - return 0; - /* ..or is the mic contained entirely in the tail? */ - mic->data = subbuf.tail[0].iov_base; - if (subbuf.tail[0].iov_len == mic->len) - return 0; - - /* Find a contiguous area in @buf to hold all of @mic */ - if (mic->len > buf->buflen - buf->len) - return -ENOMEM; - if (buf->tail[0].iov_len != 0) - mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len; - else - mic->data = buf->head[0].iov_base + buf->head[0].iov_len; - __read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len); - return 0; -} -EXPORT_SYMBOL_GPL(xdr_buf_read_mic); - /* Returns 0 on success, or else a negative error code. */ static int xdr_xcode_array2(struct xdr_buf *buf, unsigned int base, -- cgit v1.2.3 From d162372af306d22ee57d8a723dd4552e932304a9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 11 Mar 2020 11:21:17 -0400 Subject: SUNRPC: Trim stack utilization in the wrap and unwrap paths By preventing compiler inlining of the integrity and privacy helpers, stack utilization for the common case (authentication only) goes way down. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 98b2c8bc8f40..45707a306f20 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1724,8 +1724,9 @@ bad_mic: goto out; } -static int gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, - struct rpc_task *task, struct xdr_stream *xdr) +static noinline_for_stack int +gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *rqstp = task->tk_rqstp; struct xdr_buf integ_buf, *snd_buf = &rqstp->rq_snd_buf; @@ -1816,8 +1817,9 @@ out: return -EAGAIN; } -static int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, - struct rpc_task *task, struct xdr_stream *xdr) +static noinline_for_stack int +gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *rqstp = task->tk_rqstp; struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; @@ -1947,7 +1949,7 @@ gss_unwrap_resp_auth(struct rpc_cred *cred) * proc_req_arg_t arg; * }; */ -static int +static noinline_for_stack int gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp, struct xdr_stream *xdr) @@ -2021,7 +2023,7 @@ bad_mic: goto out; } -static int +static noinline_for_stack int gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp, struct xdr_stream *xdr) -- cgit v1.2.3 From df513a7711712758b9cb1a48d86712e7e1ee03f4 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 26 Mar 2020 10:24:51 -0400 Subject: SUNRPC: fix krb5p mount to provide large enough buffer in rq_rcvsize Ever since commit 2c94b8eca1a2 ("SUNRPC: Use au_rslack when computing reply buffer size"). It changed how "req->rq_rcvsize" is calculated. It used to use au_cslack value which was nice and large and changed it to au_rslack value which turns out to be too small. Since 5.1, v3 mount with sec=krb5p fails against an Ontap server because client's receive buffer it too small. For gss krb5p, we need to account for the mic token in the verifier, and the wrap token in the wrap token. RFC 4121 defines: mic token Octet no Name Description -------------------------------------------------------------- 0..1 TOK_ID Identification field. Tokens emitted by GSS_GetMIC() contain the hex value 04 04 expressed in big-endian order in this field. 2 Flags Attributes field, as described in section 4.2.2. 3..7 Filler Contains five octets of hex value FF. 8..15 SND_SEQ Sequence number field in clear text, expressed in big-endian order. 16..last SGN_CKSUM Checksum of the "to-be-signed" data and octet 0..15, as described in section 4.2.4. that's 16bytes (GSS_KRB5_TOK_HDR_LEN) + chksum wrap token Octet no Name Description -------------------------------------------------------------- 0..1 TOK_ID Identification field. Tokens emitted by GSS_Wrap() contain the hex value 05 04 expressed in big-endian order in this field. 2 Flags Attributes field, as described in section 4.2.2. 3 Filler Contains the hex value FF. 4..5 EC Contains the "extra count" field, in big- endian order as described in section 4.2.3. 6..7 RRC Contains the "right rotation count" in big- endian order, as described in section 4.2.5. 8..15 SND_SEQ Sequence number field in clear text, expressed in big-endian order. 16..last Data Encrypted data for Wrap tokens with confidentiality, or plaintext data followed by the checksum for Wrap tokens without confidentiality, as described in section 4.2.4. Also 16bytes of header (GSS_KRB5_TOK_HDR_LEN), encrypted data, and cksum (other things like padding) RFC 3961 defines known cksum sizes: Checksum type sumtype checksum section or value size reference --------------------------------------------------------------------- CRC32 1 4 6.1.3 rsa-md4 2 16 6.1.2 rsa-md4-des 3 24 6.2.5 des-mac 4 16 6.2.7 des-mac-k 5 8 6.2.8 rsa-md4-des-k 6 16 6.2.6 rsa-md5 7 16 6.1.1 rsa-md5-des 8 24 6.2.4 rsa-md5-des3 9 24 ?? sha1 (unkeyed) 10 20 ?? hmac-sha1-des3-kd 12 20 6.3 hmac-sha1-des3 13 20 ?? sha1 (unkeyed) 14 20 ?? hmac-sha1-96-aes128 15 20 [KRB5-AES] hmac-sha1-96-aes256 16 20 [KRB5-AES] [reserved] 0x8003 ? [GSS-KRB5] Linux kernel now mainly supports type 15,16 so max cksum size is 20bytes. (GSS_KRB5_MAX_CKSUM_LEN) Re-use already existing define of GSS_KRB5_MAX_SLACK_NEEDED that's used for encoding the gss_wrap tokens (same tokens are used in reply). Fixes: 2c94b8eca1a2 ("SUNRPC: Use au_rslack when computing reply buffer size") Signed-off-by: Olga Kornievskaia Reviewed-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 45707a306f20..cfee4954f821 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -1050,7 +1051,7 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) goto err_put_mech; auth = &gss_auth->rpc_auth; auth->au_cslack = GSS_CRED_SLACK >> 2; - auth->au_rslack = GSS_VERF_SLACK >> 2; + auth->au_rslack = GSS_KRB5_MAX_SLACK_NEEDED >> 2; auth->au_verfsize = GSS_VERF_SLACK >> 2; auth->au_ralign = GSS_VERF_SLACK >> 2; auth->au_flags = 0; -- cgit v1.2.3 From 62a89501a3bde310ba339cc90bb05879528d84a0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 12 Feb 2020 11:12:35 -0500 Subject: xprtrdma: Enhance MR-related trace points Two changes: - Show the number of SG entries that were mapped. This helps debug DMA-related problems. - Record the MR's resource ID instead of its memory address. This groups each MR with its associated rdma-tool output, and reduces needless exposure of memory addresses. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 56 ++++++++++++++++++++++-------------------- net/sunrpc/xprtrdma/frwr_ops.c | 2 +- 2 files changed, 31 insertions(+), 27 deletions(-) (limited to 'net/sunrpc') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index c0e4c93324f5..87f4461ab108 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -228,20 +228,20 @@ DECLARE_EVENT_CLASS(xprtrdma_frwr_done, TP_ARGS(wc, frwr), TP_STRUCT__entry( - __field(const void *, mr) + __field(u32, mr_id) __field(unsigned int, status) __field(unsigned int, vendor_err) ), TP_fast_assign( - __entry->mr = container_of(frwr, struct rpcrdma_mr, frwr); + __entry->mr_id = frwr->fr_mr->res.id; __entry->status = wc->status; __entry->vendor_err = __entry->status ? wc->vendor_err : 0; ), TP_printk( - "mr=%p: %s (%u/0x%x)", - __entry->mr, rdma_show_wc_status(__entry->status), + "mr.id=%u: %s (%u/0x%x)", + __entry->mr_id, rdma_show_wc_status(__entry->status), __entry->status, __entry->vendor_err ) ); @@ -274,7 +274,8 @@ DECLARE_EVENT_CLASS(xprtrdma_mr, TP_ARGS(mr), TP_STRUCT__entry( - __field(const void *, mr) + __field(u32, mr_id) + __field(int, nents) __field(u32, handle) __field(u32, length) __field(u64, offset) @@ -282,15 +283,16 @@ DECLARE_EVENT_CLASS(xprtrdma_mr, ), TP_fast_assign( - __entry->mr = mr; + __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->nents = mr->mr_nents; __entry->handle = mr->mr_handle; __entry->length = mr->mr_length; __entry->offset = mr->mr_offset; __entry->dir = mr->mr_dir; ), - TP_printk("mr=%p %u@0x%016llx:0x%08x (%s)", - __entry->mr, __entry->length, + TP_printk("mr.id=%u nents=%d %u@0x%016llx:0x%08x (%s)", + __entry->mr_id, __entry->nents, __entry->length, (unsigned long long)__entry->offset, __entry->handle, xprtrdma_show_direction(__entry->dir) ) @@ -920,17 +922,17 @@ TRACE_EVENT(xprtrdma_frwr_alloc, TP_ARGS(mr, rc), TP_STRUCT__entry( - __field(const void *, mr) + __field(u32, mr_id) __field(int, rc) ), TP_fast_assign( - __entry->mr = mr; - __entry->rc = rc; + __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->rc = rc; ), - TP_printk("mr=%p: rc=%d", - __entry->mr, __entry->rc + TP_printk("mr.id=%u: rc=%d", + __entry->mr_id, __entry->rc ) ); @@ -943,7 +945,8 @@ TRACE_EVENT(xprtrdma_frwr_dereg, TP_ARGS(mr, rc), TP_STRUCT__entry( - __field(const void *, mr) + __field(u32, mr_id) + __field(int, nents) __field(u32, handle) __field(u32, length) __field(u64, offset) @@ -952,7 +955,8 @@ TRACE_EVENT(xprtrdma_frwr_dereg, ), TP_fast_assign( - __entry->mr = mr; + __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->nents = mr->mr_nents; __entry->handle = mr->mr_handle; __entry->length = mr->mr_length; __entry->offset = mr->mr_offset; @@ -960,8 +964,8 @@ TRACE_EVENT(xprtrdma_frwr_dereg, __entry->rc = rc; ), - TP_printk("mr=%p %u@0x%016llx:0x%08x (%s): rc=%d", - __entry->mr, __entry->length, + TP_printk("mr.id=%u nents=%d %u@0x%016llx:0x%08x (%s): rc=%d", + __entry->mr_id, __entry->nents, __entry->length, (unsigned long long)__entry->offset, __entry->handle, xprtrdma_show_direction(__entry->dir), __entry->rc @@ -977,21 +981,21 @@ TRACE_EVENT(xprtrdma_frwr_sgerr, TP_ARGS(mr, sg_nents), TP_STRUCT__entry( - __field(const void *, mr) + __field(u32, mr_id) __field(u64, addr) __field(u32, dir) __field(int, nents) ), TP_fast_assign( - __entry->mr = mr; + __entry->mr_id = mr->frwr.fr_mr->res.id; __entry->addr = mr->mr_sg->dma_address; __entry->dir = mr->mr_dir; __entry->nents = sg_nents; ), - TP_printk("mr=%p dma addr=0x%llx (%s) sg_nents=%d", - __entry->mr, __entry->addr, + TP_printk("mr.id=%u DMA addr=0x%llx (%s) sg_nents=%d", + __entry->mr_id, __entry->addr, xprtrdma_show_direction(__entry->dir), __entry->nents ) @@ -1006,7 +1010,7 @@ TRACE_EVENT(xprtrdma_frwr_maperr, TP_ARGS(mr, num_mapped), TP_STRUCT__entry( - __field(const void *, mr) + __field(u32, mr_id) __field(u64, addr) __field(u32, dir) __field(int, num_mapped) @@ -1014,15 +1018,15 @@ TRACE_EVENT(xprtrdma_frwr_maperr, ), TP_fast_assign( - __entry->mr = mr; + __entry->mr_id = mr->frwr.fr_mr->res.id; __entry->addr = mr->mr_sg->dma_address; __entry->dir = mr->mr_dir; __entry->num_mapped = num_mapped; __entry->nents = mr->mr_nents; ), - TP_printk("mr=%p dma addr=0x%llx (%s) nents=%d of %d", - __entry->mr, __entry->addr, + TP_printk("mr.id=%u DMA addr=0x%llx (%s) nents=%d of %d", + __entry->mr_id, __entry->addr, xprtrdma_show_direction(__entry->dir), __entry->num_mapped, __entry->nents ) @@ -1031,7 +1035,7 @@ TRACE_EVENT(xprtrdma_frwr_maperr, DEFINE_MR_EVENT(localinv); DEFINE_MR_EVENT(map); DEFINE_MR_EVENT(unmap); -DEFINE_MR_EVENT(remoteinv); +DEFINE_MR_EVENT(reminv); DEFINE_MR_EVENT(recycle); TRACE_EVENT(xprtrdma_dma_maperr, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 125297c9aa3e..0dc799553a08 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -419,7 +419,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); - trace_xprtrdma_mr_remoteinv(mr); + trace_xprtrdma_mr_reminv(mr); rpcrdma_mr_put(mr); break; /* only one invalidated MR per RPC */ } -- cgit v1.2.3 From 85cd8e2b78eea7374927750ffec60bf047f8f90b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Feb 2020 17:00:12 -0500 Subject: xprtrdma: Invoke rpcrdma_ep_create() in the connect worker Refactor rpcrdma_ep_create(), rpcrdma_ep_disconnect(), and rpcrdma_ep_destroy(). rpcrdma_ep_create will be invoked at connect time instead of at transport set-up time. It will be responsible for allocating per- connection resources. In this patch it allocates the CQs and creates a QP. More to come. rpcrdma_ep_destroy() is the inverse functionality that is invoked at disconnect time. It will be responsible for releasing the CQs and QP. These changes should be safe to do because both connect and disconnect is guaranteed to be serialized by the transport send lock. This takes us another step closer to resolving the address and route only at connect time so that connection failover to another device will work correctly. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 10 +-- net/sunrpc/xprtrdma/verbs.c | 147 +++++++++++++++------------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 - 3 files changed, 56 insertions(+), 103 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 3cfeba68ee9a..d915524a8e68 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -284,7 +284,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) cancel_delayed_work_sync(&r_xprt->rx_connect_worker); - rpcrdma_ep_destroy(r_xprt); + rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); rpcrdma_buffer_destroy(&r_xprt->rx_buf); rpcrdma_ia_close(&r_xprt->rx_ia); @@ -351,13 +351,9 @@ xprt_setup_rdma(struct xprt_create *args) if (rc) goto out1; - rc = rpcrdma_ep_create(new_xprt); - if (rc) - goto out2; - rc = rpcrdma_buffer_create(new_xprt); if (rc) - goto out3; + goto out2; if (!try_module_get(THIS_MODULE)) goto out4; @@ -375,8 +371,6 @@ xprt_setup_rdma(struct xprt_create *args) out4: rpcrdma_buffer_destroy(&new_xprt->rx_buf); rc = -ENODEV; -out3: - rpcrdma_ep_destroy(new_xprt); out2: rpcrdma_ia_close(&new_xprt->rx_ia); out1: diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 353f61ac8d51..042e6cc4f767 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -84,6 +84,7 @@ static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep); static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt); +static void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt); static struct rpcrdma_regbuf * rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, gfp_t flags); @@ -391,32 +392,17 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) { struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - /* This is similar to rpcrdma_ep_destroy, but: - * - Don't cancel the connect worker. - * - Don't call rpcrdma_ep_disconnect, which waits - * for another conn upcall, which will deadlock. - * - rdma_disconnect is unneeded, the underlying - * connection is already gone. - */ - if (ia->ri_id->qp) { + if (ia->ri_id->qp) rpcrdma_xprt_drain(r_xprt); - rdma_destroy_qp(ia->ri_id); - ia->ri_id->qp = NULL; - } - ib_free_cq(ep->rep_attr.recv_cq); - ep->rep_attr.recv_cq = NULL; - ib_free_cq(ep->rep_attr.send_cq); - ep->rep_attr.send_cq = NULL; - /* The ULP is responsible for ensuring all DMA - * mappings and MRs are gone. - */ rpcrdma_reps_unmap(r_xprt); rpcrdma_reqs_reset(r_xprt); rpcrdma_mrs_destroy(r_xprt); rpcrdma_sendctxs_destroy(r_xprt); + + rpcrdma_ep_destroy(r_xprt); + ib_dealloc_pd(ia->ri_pd); ia->ri_pd = NULL; @@ -434,11 +420,8 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) void rpcrdma_ia_close(struct rpcrdma_ia *ia) { - if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { - if (ia->ri_id->qp) - rdma_destroy_qp(ia->ri_id); + if (ia->ri_id && !IS_ERR(ia->ri_id)) rdma_destroy_id(ia->ri_id); - } ia->ri_id = NULL; /* If the pd is still busy, xprtrdma missed freeing a resource */ @@ -447,25 +430,19 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia) ia->ri_pd = NULL; } -/** - * rpcrdma_ep_create - Create unconnected endpoint - * @r_xprt: transport to instantiate - * - * Returns zero on success, or a negative errno. - */ -int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) +static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt, + struct rdma_cm_id *id) { struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; - struct ib_cq *sendcq, *recvcq; int rc; ep->rep_max_requests = r_xprt->rx_xprt.max_reqs; ep->rep_inline_send = xprt_rdma_max_inline_write; ep->rep_inline_recv = xprt_rdma_max_inline_read; - rc = frwr_query_device(r_xprt, ia->ri_id->device); + rc = frwr_query_device(r_xprt, id->device); if (rc) return rc; r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->rep_max_requests); @@ -491,25 +468,22 @@ int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) init_waitqueue_head(&ep->rep_connect_wait); ep->rep_receive_count = 0; - sendcq = ib_alloc_cq_any(ia->ri_id->device, r_xprt, - ep->rep_attr.cap.max_send_wr + 1, - IB_POLL_WORKQUEUE); - if (IS_ERR(sendcq)) { - rc = PTR_ERR(sendcq); - goto out1; + ep->rep_attr.send_cq = ib_alloc_cq_any(id->device, r_xprt, + ep->rep_attr.cap.max_send_wr, + IB_POLL_WORKQUEUE); + if (IS_ERR(ep->rep_attr.send_cq)) { + rc = PTR_ERR(ep->rep_attr.send_cq); + goto out_destroy; } - recvcq = ib_alloc_cq_any(ia->ri_id->device, NULL, - ep->rep_attr.cap.max_recv_wr + 1, - IB_POLL_WORKQUEUE); - if (IS_ERR(recvcq)) { - rc = PTR_ERR(recvcq); - goto out2; + ep->rep_attr.recv_cq = ib_alloc_cq_any(id->device, NULL, + ep->rep_attr.cap.max_recv_wr, + IB_POLL_WORKQUEUE); + if (IS_ERR(ep->rep_attr.recv_cq)) { + rc = PTR_ERR(ep->rep_attr.recv_cq); + goto out_destroy; } - ep->rep_attr.send_cq = sendcq; - ep->rep_attr.recv_cq = recvcq; - /* Initialize cma parameters */ memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma)); @@ -525,7 +499,7 @@ int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; ep->rep_remote_cma.responder_resources = - min_t(int, U8_MAX, ia->ri_id->device->attrs.max_qp_rd_atom); + min_t(int, U8_MAX, id->device->attrs.max_qp_rd_atom); /* Limit transport retries so client can detect server * GID changes quickly. RPC layer handles re-establishing @@ -540,45 +514,41 @@ int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) ep->rep_remote_cma.flow_control = 0; ep->rep_remote_cma.rnr_retry_count = 0; + rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); + if (rc) + goto out_destroy; return 0; -out2: - ib_free_cq(sendcq); -out1: +out_destroy: + rpcrdma_ep_destroy(r_xprt); return rc; } -/** - * rpcrdma_ep_destroy - Disconnect and destroy endpoint. - * @r_xprt: transport instance to shut down - * - */ -void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt) +static void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpcrdma_ia *ia = &r_xprt->rx_ia; if (ia->ri_id && ia->ri_id->qp) { - rpcrdma_ep_disconnect(ep, ia); rdma_destroy_qp(ia->ri_id); ia->ri_id->qp = NULL; } if (ep->rep_attr.recv_cq) ib_free_cq(ep->rep_attr.recv_cq); + ep->rep_attr.recv_cq = NULL; if (ep->rep_attr.send_cq) ib_free_cq(ep->rep_attr.send_cq); + ep->rep_attr.send_cq = NULL; } /* Re-establish a connection after a device removal event. * Unlike a normal reconnection, a fresh PD and a new set * of MRs and buffers is needed. */ -static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, - struct ib_qp_init_attr *qp_init_attr) +static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; int rc, err; trace_xprtrdma_reinsert(r_xprt); @@ -587,39 +557,24 @@ static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, if (rpcrdma_ia_open(r_xprt)) goto out1; - rc = -ENOMEM; - err = rpcrdma_ep_create(r_xprt); - if (err) { - pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err); - goto out2; - } - memcpy(qp_init_attr, &ep->rep_attr, sizeof(*qp_init_attr)); - rc = -ENETUNREACH; - err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr); - if (err) { - pr_err("rpcrdma: rdma_create_qp returned %d\n", err); - goto out3; - } + err = rpcrdma_ep_create(r_xprt, ia->ri_id); + if (err) + goto out2; return 0; -out3: - rpcrdma_ep_destroy(r_xprt); out2: rpcrdma_ia_close(ia); out1: return rc; } -static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, - struct ib_qp_init_attr *qp_init_attr) +static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rdma_cm_id *id, *old; int err, rc; - rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia); - rc = -EHOSTUNREACH; id = rpcrdma_create_id(r_xprt, ia); if (IS_ERR(id)) @@ -640,15 +595,14 @@ static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, goto out_destroy; } - err = rdma_create_qp(id, ia->ri_pd, qp_init_attr); + err = rpcrdma_ep_create(r_xprt, id); if (err) goto out_destroy; - /* Atomically replace the transport's ID and QP. */ + /* Atomically replace the transport's ID. */ rc = 0; old = ia->ri_id; ia->ri_id = id; - rdma_destroy_qp(old); out_destroy: rdma_destroy_id(old); @@ -665,26 +619,25 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct ib_qp_init_attr qp_init_attr; int rc; retry: - memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr)); switch (ep->rep_connected) { case 0: - rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr); - if (rc) { - rc = -ENETUNREACH; + rc = -ENETUNREACH; + if (rpcrdma_ep_create(r_xprt, ia->ri_id)) goto out_noupdate; - } break; case -ENODEV: - rc = rpcrdma_ep_recreate_xprt(r_xprt, &qp_init_attr); + rc = rpcrdma_ep_recreate_xprt(r_xprt); if (rc) goto out_noupdate; break; + case 1: + rpcrdma_ep_disconnect(ep, ia); + /* fall through */ default: - rc = rpcrdma_ep_reconnect(r_xprt, &qp_init_attr); + rc = rpcrdma_ep_reconnect(r_xprt); if (rc) goto out; } @@ -742,10 +695,14 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, rx_ep); + struct rdma_cm_id *id = ia->ri_id; int rc; + if (!id) + goto out; + /* returns without wait if ID is not connected */ - rc = rdma_disconnect(ia->ri_id); + rc = rdma_disconnect(id); if (!rc) wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 1); @@ -753,10 +710,14 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ep->rep_connected = rc; trace_xprtrdma_disconnect(r_xprt, rc); - rpcrdma_xprt_drain(r_xprt); + if (id->qp) + rpcrdma_xprt_drain(r_xprt); +out: rpcrdma_reqs_reset(r_xprt); rpcrdma_mrs_destroy(r_xprt); rpcrdma_sendctxs_destroy(r_xprt); + + rpcrdma_ep_destroy(r_xprt); } /* Fixed-size circular FIFO queue. This implementation is wait-free and diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 37d5080c250b..9a536319557e 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -464,8 +464,6 @@ void rpcrdma_ia_close(struct rpcrdma_ia *); /* * Endpoint calls - xprtrdma/verbs.c */ -int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt); -void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt); int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); -- cgit v1.2.3 From 253a51622fb03425b611e709e34f1ea70949a61f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Feb 2020 17:00:17 -0500 Subject: xprtrdma: Refactor frwr_init_mr() Clean up: prepare for combining the rpcrdma_ia and rpcrdma_ep structures. Take the opportunity to rename the function to be consistent with the "subsystem _ object _ verb" naming scheme. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 10 ++++++---- net/sunrpc/xprtrdma/verbs.c | 4 +--- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 0dc799553a08..2c96b624a77d 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -52,7 +52,7 @@ /** * frwr_release_mr - Destroy one MR - * @mr: MR allocated by frwr_init_mr + * @mr: MR allocated by frwr_mr_init * */ void frwr_release_mr(struct rpcrdma_mr *mr) @@ -106,15 +106,16 @@ void frwr_reset(struct rpcrdma_req *req) } /** - * frwr_init_mr - Initialize one MR - * @ia: interface adapter + * frwr_mr_init - Initialize one MR + * @r_xprt: controlling transport instance * @mr: generic MR to prepare for FRWR * * Returns zero if successful. Otherwise a negative errno * is returned. */ -int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) +int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) { + struct rpcrdma_ia *ia = &r_xprt->rx_ia; unsigned int depth = ia->ri_max_frwr_depth; struct scatterlist *sg; struct ib_mr *frmr; @@ -128,6 +129,7 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) if (!sg) goto out_list_err; + mr->mr_xprt = r_xprt; mr->frwr.fr_mr = frmr; mr->mr_dir = DMA_NONE; INIT_LIST_HEAD(&mr->mr_list); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 042e6cc4f767..02ce3d548825 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -896,14 +896,12 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) if (!mr) break; - rc = frwr_init_mr(ia, mr); + rc = frwr_mr_init(r_xprt, mr); if (rc) { kfree(mr); break; } - mr->mr_xprt = r_xprt; - spin_lock(&buf->rb_lock); rpcrdma_mr_push(mr, &buf->rb_mrs); list_add(&mr->mr_all, &buf->rb_all_mrs); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9a536319557e..9e3e9a82cb9f 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -536,7 +536,7 @@ rpcrdma_data_dir(bool writing) void frwr_reset(struct rpcrdma_req *req); int frwr_query_device(struct rpcrdma_xprt *r_xprt, const struct ib_device *device); -int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr); +int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr); void frwr_release_mr(struct rpcrdma_mr *mr); struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, -- cgit v1.2.3 From 97d0de8812a10a66510ff95f8fe6e8d3053fd2ca Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Feb 2020 17:00:23 -0500 Subject: xprtrdma: Clean up the post_send path Clean up: Simplify the synopses of functions in the post_send path by combining the struct rpcrdma_ia and struct rpcrdma_ep arguments. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 2 +- net/sunrpc/xprtrdma/frwr_ops.c | 14 +++++++++----- net/sunrpc/xprtrdma/transport.c | 2 +- net/sunrpc/xprtrdma/verbs.c | 13 +++++-------- net/sunrpc/xprtrdma/xprt_rdma.h | 5 ++--- 5 files changed, 18 insertions(+), 18 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 1a0ae0c61353..4b43910a6ed2 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -115,7 +115,7 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) if (rc < 0) goto failed_marshal; - if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + if (rpcrdma_post_sends(r_xprt, req)) goto drop_connection; return 0; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 2c96b624a77d..a1b5c8024cca 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -374,18 +374,22 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) } /** - * frwr_send - post Send WR containing the RPC Call message - * @ia: interface adapter - * @req: Prepared RPC Call + * frwr_send - post Send WRs containing the RPC Call message + * @r_xprt: controlling transport instance + * @req: prepared RPC Call * * For FRWR, chain any FastReg WRs to the Send WR. Only a * single ib_post_send call is needed to register memory * and then post the Send WR. * - * Returns the result of ib_post_send. + * Returns the return code from ib_post_send. + * + * Caller must hold the transport send lock to ensure that the + * pointers to the transport's rdma_cm_id and QP are stable. */ -int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { + struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct ib_send_wr *post_wr; struct rpcrdma_mr *mr; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index d915524a8e68..8934c24a5701 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -688,7 +688,7 @@ xprt_rdma_send_request(struct rpc_rqst *rqst) goto drop_connection; rqst->rq_xtime = ktime_get(); - if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + if (rpcrdma_post_sends(r_xprt, req)) goto drop_connection; rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 02ce3d548825..8fd6682d2646 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1461,20 +1461,17 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb) } /** - * rpcrdma_ep_post - Post WRs to a transport's Send Queue - * @ia: transport's device information - * @ep: transport's RDMA endpoint information + * rpcrdma_post_sends - Post WRs to a transport's Send Queue + * @r_xprt: controlling transport instance * @req: rpcrdma_req containing the Send WR to post * * Returns 0 if the post was successful, otherwise -ENOTCONN * is returned. */ -int -rpcrdma_ep_post(struct rpcrdma_ia *ia, - struct rpcrdma_ep *ep, - struct rpcrdma_req *req) +int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *send_wr = &req->rl_wr; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; int rc; if (!ep->rep_send_count || kref_read(&req->rl_kref) > 1) { @@ -1485,7 +1482,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, --ep->rep_send_count; } - rc = frwr_send(ia, req); + rc = frwr_send(r_xprt, req); trace_xprtrdma_post_send(req, rc); if (rc) return -ENOTCONN; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9e3e9a82cb9f..82ec4c25432f 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -467,8 +467,7 @@ void rpcrdma_ia_close(struct rpcrdma_ia *); int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); -int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, - struct rpcrdma_req *); +int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); /* @@ -542,7 +541,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, __be32 xid, struct rpcrdma_mr *mr); -int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req); +int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs); void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); -- cgit v1.2.3 From 9144a803df6ca4185238ca343dbb65d8137c036e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Feb 2020 17:00:28 -0500 Subject: xprtrdma: Refactor rpcrdma_ep_connect() and rpcrdma_ep_disconnect() Clean up: Simplify the synopses of functions in the connect and disconnect paths in preparation for combining the rpcrdma_ia and struct rpcrdma_ep structures. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 6 +++--- net/sunrpc/xprtrdma/verbs.c | 30 +++++++++++++++--------------- net/sunrpc/xprtrdma/xprt_rdma.h | 4 ++-- 3 files changed, 20 insertions(+), 20 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8934c24a5701..6349e6c98b57 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -240,7 +240,7 @@ xprt_rdma_connect_worker(struct work_struct *work) struct rpc_xprt *xprt = &r_xprt->rx_xprt; int rc; - rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); + rc = rpcrdma_xprt_connect(r_xprt); xprt_clear_connecting(xprt); if (r_xprt->rx_ep.rep_connected > 0) { xprt->stat.connect_count++; @@ -284,7 +284,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) cancel_delayed_work_sync(&r_xprt->rx_connect_worker); - rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); + rpcrdma_xprt_disconnect(r_xprt); rpcrdma_buffer_destroy(&r_xprt->rx_buf); rpcrdma_ia_close(&r_xprt->rx_ia); @@ -409,7 +409,7 @@ void xprt_rdma_close(struct rpc_xprt *xprt) if (ep->rep_connected == -ENODEV) return; - rpcrdma_ep_disconnect(ep, ia); + rpcrdma_xprt_disconnect(r_xprt); out: xprt->reestablish_timeout = 0; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 8fd6682d2646..f361213a8157 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -610,15 +610,17 @@ out: return rc; } -/* - * Connect unconnected endpoint. +/** + * rpcrdma_xprt_connect - Connect an unconnected transport + * @r_xprt: controlling transport instance + * + * Returns 0 on success or a negative errno. */ -int -rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, - rx_ia); struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; int rc; retry: @@ -634,7 +636,7 @@ retry: goto out_noupdate; break; case 1: - rpcrdma_ep_disconnect(ep, ia); + rpcrdma_xprt_disconnect(r_xprt); /* fall through */ default: rc = rpcrdma_ep_reconnect(r_xprt); @@ -668,7 +670,7 @@ retry: rc = rpcrdma_reqs_setup(r_xprt); if (rc) { - rpcrdma_ep_disconnect(ep, ia); + rpcrdma_xprt_disconnect(r_xprt); goto out; } rpcrdma_mrs_create(r_xprt); @@ -683,18 +685,16 @@ out_noupdate: } /** - * rpcrdma_ep_disconnect - Disconnect underlying transport - * @ep: endpoint to disconnect - * @ia: associated interface adapter + * rpcrdma_xprt_disconnect - Disconnect underlying transport + * @r_xprt: controlling transport instance * * Caller serializes. Either the transport send lock is held, * or we're being called to destroy the transport. */ -void -rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, - rx_ep); + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rdma_cm_id *id = ia->ri_id; int rc; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 82ec4c25432f..9ead06b1d8a4 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -464,8 +464,8 @@ void rpcrdma_ia_close(struct rpcrdma_ia *); /* * Endpoint calls - xprtrdma/verbs.c */ -int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); -void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); +int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt); +void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt); int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); -- cgit v1.2.3 From 9ba373ee24ea07b60b8c5041cafe88d5796bacef Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Feb 2020 17:00:33 -0500 Subject: xprtrdma: Allocate Protection Domain in rpcrdma_ep_create() Make a Protection Domain (PD) a per-connection resource rather than a per-transport resource. In other words, when the connection terminates, the PD is destroyed. Thus there is one less HW resource that remains allocated to a transport after a connection is closed. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index f361213a8157..36fe7baea014 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -363,14 +363,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt) rc = PTR_ERR(ia->ri_id); goto out_err; } - - ia->ri_pd = ib_alloc_pd(ia->ri_id->device, 0); - if (IS_ERR(ia->ri_pd)) { - rc = PTR_ERR(ia->ri_pd); - pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); - goto out_err; - } - return 0; out_err: @@ -403,9 +395,6 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) rpcrdma_ep_destroy(r_xprt); - ib_dealloc_pd(ia->ri_pd); - ia->ri_pd = NULL; - /* Allow waiters to continue */ complete(&ia->ri_remove_done); @@ -423,11 +412,6 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia) if (ia->ri_id && !IS_ERR(ia->ri_id)) rdma_destroy_id(ia->ri_id); ia->ri_id = NULL; - - /* If the pd is still busy, xprtrdma missed freeing a resource */ - if (ia->ri_pd && !IS_ERR(ia->ri_pd)) - ib_dealloc_pd(ia->ri_pd); - ia->ri_pd = NULL; } static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt, @@ -514,6 +498,12 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt, ep->rep_remote_cma.flow_control = 0; ep->rep_remote_cma.rnr_retry_count = 0; + ia->ri_pd = ib_alloc_pd(id->device, 0); + if (IS_ERR(ia->ri_pd)) { + rc = PTR_ERR(ia->ri_pd); + goto out_destroy; + } + rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); if (rc) goto out_destroy; @@ -540,6 +530,10 @@ static void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt) if (ep->rep_attr.send_cq) ib_free_cq(ep->rep_attr.send_cq); ep->rep_attr.send_cq = NULL; + + if (ia->ri_pd) + ib_dealloc_pd(ia->ri_pd); + ia->ri_pd = NULL; } /* Re-establish a connection after a device removal event. -- cgit v1.2.3 From 81fe0c57f4e136375f3bcda74af413f82a34a1bb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Feb 2020 17:00:38 -0500 Subject: xprtrdma: Invoke rpcrdma_ia_open in the connect worker Move rdma_cm_id creation into rpcrdma_ep_create() so that it is now responsible for allocating all per-connection hardware resources. With this clean-up, all three arms of the switch statement in rpcrdma_ep_connect are exactly the same now, thus the switch can be removed. Because device removal behaves a little differently than disconnection, there is a little more work to be done before rpcrdma_ep_destroy() can release the connection's rdma_cm_id. So it is not quite symmetrical with rpcrdma_ep_create() yet. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 1 - net/sunrpc/xprtrdma/transport.c | 7 -- net/sunrpc/xprtrdma/verbs.c | 153 ++++++---------------------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 - 4 files changed, 20 insertions(+), 143 deletions(-) (limited to 'net/sunrpc') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 87f4461ab108..ba37c47b51e8 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -415,7 +415,6 @@ DEFINE_CONN_EVENT(disconnect); DEFINE_RXPRT_EVENT(xprtrdma_create); DEFINE_RXPRT_EVENT(xprtrdma_op_destroy); DEFINE_RXPRT_EVENT(xprtrdma_remove); -DEFINE_RXPRT_EVENT(xprtrdma_reinsert); DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc); DEFINE_RXPRT_EVENT(xprtrdma_op_close); DEFINE_RXPRT_EVENT(xprtrdma_op_setport); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 6349e6c98b57..745dfd149637 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -286,7 +286,6 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) rpcrdma_xprt_disconnect(r_xprt); rpcrdma_buffer_destroy(&r_xprt->rx_buf); - rpcrdma_ia_close(&r_xprt->rx_ia); xprt_rdma_free_addresses(xprt); xprt_free(xprt); @@ -347,10 +346,6 @@ xprt_setup_rdma(struct xprt_create *args) xprt_rdma_format_addresses(xprt, sap); new_xprt = rpcx_to_rdmax(xprt); - rc = rpcrdma_ia_open(new_xprt); - if (rc) - goto out1; - rc = rpcrdma_buffer_create(new_xprt); if (rc) goto out2; @@ -372,8 +367,6 @@ out4: rpcrdma_buffer_destroy(&new_xprt->rx_buf); rc = -ENODEV; out2: - rpcrdma_ia_close(&new_xprt->rx_ia); -out1: trace_xprtrdma_op_destroy(new_xprt); xprt_rdma_free_addresses(xprt); xprt_free(xprt); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 36fe7baea014..3df20f355579 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -345,31 +345,6 @@ out: * Exported functions. */ -/** - * rpcrdma_ia_open - Open and initialize an Interface Adapter. - * @xprt: transport with IA to (re)initialize - * - * Returns 0 on success, negative errno if an appropriate - * Interface Adapter could not be found and opened. - */ -int -rpcrdma_ia_open(struct rpcrdma_xprt *xprt) -{ - struct rpcrdma_ia *ia = &xprt->rx_ia; - int rc; - - ia->ri_id = rpcrdma_create_id(xprt, ia); - if (IS_ERR(ia->ri_id)) { - rc = PTR_ERR(ia->ri_id); - goto out_err; - } - return 0; - -out_err: - rpcrdma_ia_close(ia); - return rc; -} - /** * rpcrdma_ia_remove - Handle device driver unload * @ia: interface adapter being removed @@ -401,34 +376,26 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) trace_xprtrdma_remove(r_xprt); } -/** - * rpcrdma_ia_close - Clean up/close an IA. - * @ia: interface adapter to close - * - */ -void -rpcrdma_ia_close(struct rpcrdma_ia *ia) -{ - if (ia->ri_id && !IS_ERR(ia->ri_id)) - rdma_destroy_id(ia->ri_id); - ia->ri_id = NULL; -} - -static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt, - struct rdma_cm_id *id) +static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; + struct rdma_cm_id *id; int rc; + id = rpcrdma_create_id(r_xprt, ia); + if (IS_ERR(id)) + return PTR_ERR(id); + ep->rep_max_requests = r_xprt->rx_xprt.max_reqs; ep->rep_inline_send = xprt_rdma_max_inline_write; ep->rep_inline_recv = xprt_rdma_max_inline_read; rc = frwr_query_device(r_xprt, id->device); if (rc) - return rc; + goto out_destroy; + r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->rep_max_requests); ep->rep_attr.event_handler = rpcrdma_qp_event_handler; @@ -507,10 +474,12 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt, rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); if (rc) goto out_destroy; + ia->ri_id = id; return 0; out_destroy: rpcrdma_ep_destroy(r_xprt); + rdma_destroy_id(id); return rc; } @@ -536,79 +505,8 @@ static void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt) ia->ri_pd = NULL; } -/* Re-establish a connection after a device removal event. - * Unlike a normal reconnection, a fresh PD and a new set - * of MRs and buffers is needed. - */ -static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int rc, err; - - trace_xprtrdma_reinsert(r_xprt); - - rc = -EHOSTUNREACH; - if (rpcrdma_ia_open(r_xprt)) - goto out1; - - rc = -ENETUNREACH; - err = rpcrdma_ep_create(r_xprt, ia->ri_id); - if (err) - goto out2; - return 0; - -out2: - rpcrdma_ia_close(ia); -out1: - return rc; -} - -static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rdma_cm_id *id, *old; - int err, rc; - - rc = -EHOSTUNREACH; - id = rpcrdma_create_id(r_xprt, ia); - if (IS_ERR(id)) - goto out; - - /* As long as the new ID points to the same device as the - * old ID, we can reuse the transport's existing PD and all - * previously allocated MRs. Also, the same device means - * the transport's previous DMA mappings are still valid. - * - * This is a sanity check only. There should be no way these - * point to two different devices here. - */ - old = id; - rc = -ENETUNREACH; - if (ia->ri_id->device != id->device) { - pr_err("rpcrdma: can't reconnect on different device!\n"); - goto out_destroy; - } - - err = rpcrdma_ep_create(r_xprt, id); - if (err) - goto out_destroy; - - /* Atomically replace the transport's ID. */ - rc = 0; - old = ia->ri_id; - ia->ri_id = id; - -out_destroy: - rdma_destroy_id(old); -out: - return rc; -} - -/** - * rpcrdma_xprt_connect - Connect an unconnected transport - * @r_xprt: controlling transport instance - * - * Returns 0 on success or a negative errno. +/* + * Connect unconnected endpoint. */ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) { @@ -618,25 +516,10 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) int rc; retry: - switch (ep->rep_connected) { - case 0: - rc = -ENETUNREACH; - if (rpcrdma_ep_create(r_xprt, ia->ri_id)) - goto out_noupdate; - break; - case -ENODEV: - rc = rpcrdma_ep_recreate_xprt(r_xprt); - if (rc) - goto out_noupdate; - break; - case 1: - rpcrdma_xprt_disconnect(r_xprt); - /* fall through */ - default: - rc = rpcrdma_ep_reconnect(r_xprt); - if (rc) - goto out; - } + rpcrdma_xprt_disconnect(r_xprt); + rc = rpcrdma_ep_create(r_xprt); + if (rc) + goto out_noupdate; ep->rep_connected = 0; xprt_clear_connected(xprt); @@ -712,6 +595,10 @@ out: rpcrdma_sendctxs_destroy(r_xprt); rpcrdma_ep_destroy(r_xprt); + + if (ia->ri_id) + rdma_destroy_id(ia->ri_id); + ia->ri_id = NULL; } /* Fixed-size circular FIFO queue. This implementation is wait-free and diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9ead06b1d8a4..8be1b70b71a2 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -457,9 +457,7 @@ extern unsigned int xprt_rdma_memreg_strategy; /* * Interface Adapter calls - xprtrdma/verbs.c */ -int rpcrdma_ia_open(struct rpcrdma_xprt *xprt); void rpcrdma_ia_remove(struct rpcrdma_ia *ia); -void rpcrdma_ia_close(struct rpcrdma_ia *); /* * Endpoint calls - xprtrdma/verbs.c -- cgit v1.2.3 From 897b7be9bca0caa27cdf7520bdc7689abe989a53 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Feb 2020 17:00:44 -0500 Subject: xprtrdma: Remove rpcrdma_ia::ri_flags Clean up: The upper layer serializes calls to xprt_rdma_close, so there is no need for an atomic bit operation, saving 8 bytes in rpcrdma_ia. This enables merging rpcrdma_ia_remove directly into the disconnect logic. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 15 ----------- net/sunrpc/xprtrdma/verbs.c | 55 ++++++++++------------------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 10 -------- 3 files changed, 13 insertions(+), 67 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 745dfd149637..d7b7dab0aeb6 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -385,26 +385,11 @@ out2: void xprt_rdma_close(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - - might_sleep(); trace_xprtrdma_op_close(r_xprt); - /* Prevent marshaling and sending of new requests */ - xprt_clear_connected(xprt); - - if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) { - rpcrdma_ia_remove(ia); - goto out; - } - - if (ep->rep_connected == -ENODEV) - return; rpcrdma_xprt_disconnect(r_xprt); -out: xprt->reestablish_timeout = 0; ++xprt->connect_cookie; xprt_disconnect_done(xprt); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3df20f355579..a7f46bbbf017 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -250,12 +250,11 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); #endif init_completion(&ia->ri_remove_done); - set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); ep->rep_connected = -ENODEV; xprt_force_disconnect(xprt); wait_for_completion(&ia->ri_remove_done); + trace_xprtrdma_remove(r_xprt); - ia->ri_id = NULL; /* Return 1 to ensure the core destroys the id. */ return 1; case RDMA_CM_EVENT_ESTABLISHED: @@ -345,37 +344,6 @@ out: * Exported functions. */ -/** - * rpcrdma_ia_remove - Handle device driver unload - * @ia: interface adapter being removed - * - * Divest transport H/W resources associated with this adapter, - * but allow it to be restored later. - * - * Caller must hold the transport send lock. - */ -void -rpcrdma_ia_remove(struct rpcrdma_ia *ia) -{ - struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, - rx_ia); - - if (ia->ri_id->qp) - rpcrdma_xprt_drain(r_xprt); - - rpcrdma_reps_unmap(r_xprt); - rpcrdma_reqs_reset(r_xprt); - rpcrdma_mrs_destroy(r_xprt); - rpcrdma_sendctxs_destroy(r_xprt); - - rpcrdma_ep_destroy(r_xprt); - - /* Allow waiters to continue */ - complete(&ia->ri_remove_done); - - trace_xprtrdma_remove(r_xprt); -} - static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ep *ep = &r_xprt->rx_ep; @@ -573,12 +541,13 @@ void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt) struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rdma_cm_id *id = ia->ri_id; - int rc; + int rc, status = ep->rep_connected; + + might_sleep(); if (!id) - goto out; + return; - /* returns without wait if ID is not connected */ rc = rdma_disconnect(id); if (!rc) wait_event_interruptible(ep->rep_connect_wait, @@ -589,15 +558,17 @@ void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt) if (id->qp) rpcrdma_xprt_drain(r_xprt); -out: + rpcrdma_reps_unmap(r_xprt); rpcrdma_reqs_reset(r_xprt); rpcrdma_mrs_destroy(r_xprt); rpcrdma_sendctxs_destroy(r_xprt); rpcrdma_ep_destroy(r_xprt); - if (ia->ri_id) - rdma_destroy_id(ia->ri_id); + if (status == -ENODEV) + complete(&ia->ri_remove_done); + else + rdma_destroy_id(id); ia->ri_id = NULL; } @@ -815,10 +786,10 @@ void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ep *ep = &r_xprt->rx_ep; - /* If there is no underlying device, it's no use to - * wake the refresh worker. + /* If there is no underlying connection, it's no use + * to wake the refresh worker. */ - if (ep->rep_connected != -ENODEV) { + if (ep->rep_connected == 1) { /* The work is scheduled on a WQ_MEM_RECLAIM * workqueue in order to prevent MR allocation * from recursing into NFS during direct reclaim. diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 8be1b70b71a2..d2a0f125f7a8 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -75,15 +75,10 @@ struct rpcrdma_ia { unsigned int ri_max_frwr_depth; bool ri_implicit_roundup; enum ib_mr_type ri_mrtype; - unsigned long ri_flags; struct completion ri_done; struct completion ri_remove_done; }; -enum { - RPCRDMA_IAF_REMOVING = 0, -}; - /* * RDMA Endpoint -- one per transport instance */ @@ -454,11 +449,6 @@ extern int xprt_rdma_pad_optimize; */ extern unsigned int xprt_rdma_memreg_strategy; -/* - * Interface Adapter calls - xprtrdma/verbs.c - */ -void rpcrdma_ia_remove(struct rpcrdma_ia *ia); - /* * Endpoint calls - xprtrdma/verbs.c */ -- cgit v1.2.3 From d6ccebf956338ea015d7d54c4a4c9c17605707cb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Feb 2020 17:00:49 -0500 Subject: xprtrdma: Disconnect on flushed completion Completion errors after a disconnect often occur much sooner than a CM_DISCONNECT event. Use this to try to detect connection loss more quickly. Note that other kernel ULPs do take care to disconnect explicitly when a WR is flushed. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 3 ++- net/sunrpc/xprtrdma/frwr_ops.c | 24 ++++++++++++++++-------- net/sunrpc/xprtrdma/verbs.c | 37 ++++++++++++++++++++++++++++--------- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 4 files changed, 47 insertions(+), 18 deletions(-) (limited to 'net/sunrpc') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index ba37c47b51e8..cfbe28ad2614 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -109,7 +109,7 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class, __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("peer=[%s]:%s r_xprt=%p: rc=%d connect status=%d", + TP_printk("peer=[%s]:%s r_xprt=%p: rc=%d connection status=%d", __get_str(addr), __get_str(port), __entry->r_xprt, __entry->rc, __entry->connect_status ) @@ -411,6 +411,7 @@ TRACE_EVENT(xprtrdma_inline_thresh, DEFINE_CONN_EVENT(connect); DEFINE_CONN_EVENT(disconnect); +DEFINE_CONN_EVENT(flush_dct); DEFINE_RXPRT_EVENT(xprtrdma_create); DEFINE_RXPRT_EVENT(xprtrdma_op_destroy); diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index a1b5c8024cca..b482fac7be89 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -358,8 +358,8 @@ out_mapmr_err: /** * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed FastReg WR * */ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) @@ -371,6 +371,8 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_fastreg(wc, frwr); /* The MR will get recycled when the associated req is retransmitted */ + + rpcrdma_flush_disconnect(cq, wc); } /** @@ -441,8 +443,8 @@ static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr) /** * frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed LocalInv WR * */ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) @@ -455,12 +457,14 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li(wc, frwr); __frwr_release_mr(wc, mr); + + rpcrdma_flush_disconnect(cq, wc); } /** * frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed LocalInv WR * * Awaken anyone waiting for an MR to finish being fenced. */ @@ -475,6 +479,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) trace_xprtrdma_wc_li_wake(wc, frwr); __frwr_release_mr(wc, mr); complete(&frwr->fr_linv_done); + + rpcrdma_flush_disconnect(cq, wc); } /** @@ -562,8 +568,8 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /** * frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed LocalInv WR * */ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) @@ -581,6 +587,8 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) /* Ensure @rep is generated before __frwr_release_mr */ smp_rmb(); rpcrdma_complete_rqst(rep); + + rpcrdma_flush_disconnect(cq, wc); } /** diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index a7f46bbbf017..dfe680e3234a 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -128,14 +128,32 @@ rpcrdma_qp_event_handler(struct ib_event *event, void *context) trace_xprtrdma_qp_event(r_xprt, event); } +/** + * rpcrdma_flush_disconnect - Disconnect on flushed completion + * @cq: completion queue + * @wc: work completion entry + * + * Must be called in process context. + */ +void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc) +{ + struct rpcrdma_xprt *r_xprt = cq->cq_context; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + + if (wc->status != IB_WC_SUCCESS && r_xprt->rx_ep.rep_connected == 1) { + r_xprt->rx_ep.rep_connected = -ECONNABORTED; + trace_xprtrdma_flush_dct(r_xprt, wc->status); + xprt_force_disconnect(xprt); + } +} + /** * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC * @cq: completion queue - * @wc: completed WR + * @wc: WCE for a completed Send WR * */ -static void -rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) +static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_sendctx *sc = @@ -144,21 +162,21 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_send(sc, wc); rpcrdma_sendctx_put_locked((struct rpcrdma_xprt *)cq->cq_context, sc); + rpcrdma_flush_disconnect(cq, wc); } /** * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed Receive WR * */ -static void -rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) +static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep, rr_cqe); - struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + struct rpcrdma_xprt *r_xprt = cq->cq_context; /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_receive(wc); @@ -179,6 +197,7 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) return; out_flushed: + rpcrdma_flush_disconnect(cq, wc); rpcrdma_rep_destroy(rep); } @@ -395,7 +414,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) goto out_destroy; } - ep->rep_attr.recv_cq = ib_alloc_cq_any(id->device, NULL, + ep->rep_attr.recv_cq = ib_alloc_cq_any(id->device, r_xprt, ep->rep_attr.cap.max_recv_wr, IB_POLL_WORKQUEUE); if (IS_ERR(ep->rep_attr.recv_cq)) { diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index d2a0f125f7a8..8a3ac9d7ee81 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -452,6 +452,7 @@ extern unsigned int xprt_rdma_memreg_strategy; /* * Endpoint calls - xprtrdma/verbs.c */ +void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc); int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt); void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt); -- cgit v1.2.3 From 93aa8e0a9de80e1df2be17158a3469285e572b39 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Feb 2020 17:00:54 -0500 Subject: xprtrdma: Merge struct rpcrdma_ia into struct rpcrdma_ep I eventually want to allocate rpcrdma_ep separately from struct rpcrdma_xprt so that on occasion there can be more than one ep per xprt. The new struct rpcrdma_ep will contain all the fields currently in rpcrdma_ia and in rpcrdma_ep. This is all the device and CM settings for the connection, in addition to per-connection settings negotiated with the remote. Take this opportunity to rename the existing ep fields from rep_* to re_* to disambiguate these from struct rpcrdma_rep. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 12 +- net/sunrpc/xprtrdma/backchannel.c | 4 +- net/sunrpc/xprtrdma/frwr_ops.c | 108 +++++++-------- net/sunrpc/xprtrdma/rpc_rdma.c | 31 ++--- net/sunrpc/xprtrdma/transport.c | 9 +- net/sunrpc/xprtrdma/verbs.c | 283 +++++++++++++++++++------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 60 ++++---- 7 files changed, 246 insertions(+), 261 deletions(-) (limited to 'net/sunrpc') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index cfbe28ad2614..843269f0e291 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -104,7 +104,7 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class, TP_fast_assign( __entry->r_xprt = r_xprt; __entry->rc = rc; - __entry->connect_status = r_xprt->rx_ep.rep_connected; + __entry->connect_status = r_xprt->rx_ep.re_connect_status; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), @@ -394,10 +394,10 @@ TRACE_EVENT(xprtrdma_inline_thresh, const struct rpcrdma_ep *ep = &r_xprt->rx_ep; __entry->r_xprt = r_xprt; - __entry->inline_send = ep->rep_inline_send; - __entry->inline_recv = ep->rep_inline_recv; - __entry->max_send = ep->rep_max_inline_send; - __entry->max_recv = ep->rep_max_inline_recv; + __entry->inline_send = ep->re_inline_send; + __entry->inline_recv = ep->re_inline_recv; + __entry->max_send = ep->re_max_inline_send; + __entry->max_recv = ep->re_max_inline_recv; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), @@ -803,7 +803,7 @@ TRACE_EVENT(xprtrdma_post_recvs, __entry->r_xprt = r_xprt; __entry->count = count; __entry->status = status; - __entry->posted = r_xprt->rx_ep.rep_receive_count; + __entry->posted = r_xprt->rx_ep.re_receive_count; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 4b43910a6ed2..4b20102cf060 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -47,7 +47,7 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) struct rpcrdma_ep *ep = &r_xprt->rx_ep; size_t maxmsg; - maxmsg = min_t(unsigned int, ep->rep_inline_send, ep->rep_inline_recv); + maxmsg = min_t(unsigned int, ep->re_inline_send, ep->re_inline_recv); maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE); return maxmsg - RPCRDMA_HDRLEN_MIN; } @@ -190,7 +190,7 @@ create_req: if (xprt->bc_alloc_count >= RPCRDMA_BACKWARD_WRS) return NULL; - size = min_t(size_t, r_xprt->rx_ep.rep_inline_recv, PAGE_SIZE); + size = min_t(size_t, r_xprt->rx_ep.re_inline_recv, PAGE_SIZE); req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL); if (!req) return NULL; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index b482fac7be89..19bf422f010b 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -74,7 +74,7 @@ static void frwr_mr_recycle(struct rpcrdma_mr *mr) if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device, + ib_dma_unmap_sg(r_xprt->rx_ep.re_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } @@ -115,13 +115,13 @@ void frwr_reset(struct rpcrdma_req *req) */ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - unsigned int depth = ia->ri_max_frwr_depth; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + unsigned int depth = ep->re_max_fr_depth; struct scatterlist *sg; struct ib_mr *frmr; int rc; - frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); + frmr = ib_alloc_mr(ep->re_pd, ep->re_mrtype, depth); if (IS_ERR(frmr)) goto out_mr_err; @@ -151,29 +151,24 @@ out_list_err: /** * frwr_query_device - Prepare a transport for use with FRWR - * @r_xprt: controlling transport instance + * @ep: endpoint to fill in * @device: RDMA device to query * * On success, sets: - * ep->rep_attr - * ep->rep_max_requests - * ia->ri_max_rdma_segs - * - * And these FRWR-related fields: - * ia->ri_max_frwr_depth - * ia->ri_mrtype + * ep->re_attr + * ep->re_max_requests + * ep->re_max_rdma_segs + * ep->re_max_fr_depth + * ep->re_mrtype * * Return values: * On success, returns zero. * %-EINVAL - the device does not support FRWR memory registration * %-ENOMEM - the device is not sufficiently capable for NFS/RDMA */ -int frwr_query_device(struct rpcrdma_xprt *r_xprt, - const struct ib_device *device) +int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device) { const struct ib_device_attr *attrs = &device->attrs; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; int max_qp_wr, depth, delta; unsigned int max_sge; @@ -190,23 +185,23 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt, pr_err("rpcrdma: HCA provides only %u send SGEs\n", max_sge); return -ENOMEM; } - ep->rep_attr.cap.max_send_sge = max_sge; - ep->rep_attr.cap.max_recv_sge = 1; + ep->re_attr.cap.max_send_sge = max_sge; + ep->re_attr.cap.max_recv_sge = 1; - ia->ri_mrtype = IB_MR_TYPE_MEM_REG; + ep->re_mrtype = IB_MR_TYPE_MEM_REG; if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG) - ia->ri_mrtype = IB_MR_TYPE_SG_GAPS; + ep->re_mrtype = IB_MR_TYPE_SG_GAPS; /* Quirk: Some devices advertise a large max_fast_reg_page_list_len * capability, but perform optimally when the MRs are not larger * than a page. */ if (attrs->max_sge_rd > RPCRDMA_MAX_HDR_SEGS) - ia->ri_max_frwr_depth = attrs->max_sge_rd; + ep->re_max_fr_depth = attrs->max_sge_rd; else - ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len; - if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS) - ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS; + ep->re_max_fr_depth = attrs->max_fast_reg_page_list_len; + if (ep->re_max_fr_depth > RPCRDMA_MAX_DATA_SEGS) + ep->re_max_fr_depth = RPCRDMA_MAX_DATA_SEGS; /* Add room for frwr register and invalidate WRs. * 1. FRWR reg WR for head @@ -222,11 +217,11 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt, /* Calculate N if the device max FRWR depth is smaller than * RPCRDMA_MAX_DATA_SEGS. */ - if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) { - delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth; + if (ep->re_max_fr_depth < RPCRDMA_MAX_DATA_SEGS) { + delta = RPCRDMA_MAX_DATA_SEGS - ep->re_max_fr_depth; do { depth += 2; /* FRWR reg + invalidate */ - delta -= ia->ri_max_frwr_depth; + delta -= ep->re_max_fr_depth; } while (delta > 0); } @@ -235,34 +230,34 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt, max_qp_wr -= 1; if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE) return -ENOMEM; - if (ep->rep_max_requests > max_qp_wr) - ep->rep_max_requests = max_qp_wr; - ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth; - if (ep->rep_attr.cap.max_send_wr > max_qp_wr) { - ep->rep_max_requests = max_qp_wr / depth; - if (!ep->rep_max_requests) + if (ep->re_max_requests > max_qp_wr) + ep->re_max_requests = max_qp_wr; + ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth; + if (ep->re_attr.cap.max_send_wr > max_qp_wr) { + ep->re_max_requests = max_qp_wr / depth; + if (!ep->re_max_requests) return -ENOMEM; - ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth; + ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth; } - ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; - ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ - ep->rep_attr.cap.max_recv_wr = ep->rep_max_requests; - ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; - ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ - - ia->ri_max_rdma_segs = - DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth); + ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; + ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ + ep->re_attr.cap.max_recv_wr = ep->re_max_requests; + ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; + ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ + + ep->re_max_rdma_segs = + DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ep->re_max_fr_depth); /* Reply chunks require segments for head and tail buffers */ - ia->ri_max_rdma_segs += 2; - if (ia->ri_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS) - ia->ri_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS; + ep->re_max_rdma_segs += 2; + if (ep->re_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS) + ep->re_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS; /* Ensure the underlying device is capable of conveying the * largest r/wsize NFS will ask for. This guarantees that * failing over from one RDMA device to another will not * break NFS I/O. */ - if ((ia->ri_max_rdma_segs * ia->ri_max_frwr_depth) < RPCRDMA_MAX_SEGS) + if ((ep->re_max_rdma_segs * ep->re_max_fr_depth) < RPCRDMA_MAX_SEGS) return -ENOMEM; return 0; @@ -288,14 +283,14 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, int nsegs, bool writing, __be32 xid, struct rpcrdma_mr *mr) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct ib_reg_wr *reg_wr; int i, n, dma_nents; struct ib_mr *ibmr; u8 key; - if (nsegs > ia->ri_max_frwr_depth) - nsegs = ia->ri_max_frwr_depth; + if (nsegs > ep->re_max_fr_depth) + nsegs = ep->re_max_fr_depth; for (i = 0; i < nsegs;) { if (seg->mr_page) sg_set_page(&mr->mr_sg[i], @@ -308,7 +303,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, ++seg; ++i; - if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS) + if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS) continue; if ((i < nsegs && offset_in_page(seg->mr_offset)) || offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) @@ -317,7 +312,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, mr->mr_dir = rpcrdma_data_dir(writing); mr->mr_nents = i; - dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, mr->mr_nents, + dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); if (!dma_nents) goto out_dmamap_err; @@ -391,7 +386,6 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) */ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct ib_send_wr *post_wr; struct rpcrdma_mr *mr; @@ -411,7 +405,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) post_wr = &frwr->fr_regwr.wr; } - return ib_post_send(ia->ri_id->qp, post_wr, NULL); + return ib_post_send(r_xprt->rx_ep.re_id->qp, post_wr, NULL); } /** @@ -538,10 +532,10 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us - * unless ri_id->qp is a valid pointer. + * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr); + rc = ib_post_send(r_xprt->rx_ep.re_id->qp, first, &bad_wr); /* The final LOCAL_INV WR in the chain is supposed to * do the wake. If it was never posted, the wake will @@ -643,10 +637,10 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us - * unless ri_id->qp is a valid pointer. + * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr); + rc = ib_post_send(r_xprt->rx_ep.re_id->qp, first, &bad_wr); if (!rc) return; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 28020ec104d4..ad7e6b0187bd 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -103,21 +103,20 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) /** * rpcrdma_set_max_header_sizes - Initialize inline payload sizes - * @r_xprt: transport instance to initialize + * @ep: endpoint to initialize * * The max_inline fields contain the maximum size of an RPC message * so the marshaling code doesn't have to repeat this calculation * for every RPC. */ -void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) +void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep) { - unsigned int maxsegs = r_xprt->rx_ia.ri_max_rdma_segs; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + unsigned int maxsegs = ep->re_max_rdma_segs; - ep->rep_max_inline_send = - ep->rep_inline_send - rpcrdma_max_call_header_size(maxsegs); - ep->rep_max_inline_recv = - ep->rep_inline_recv - rpcrdma_max_reply_header_size(maxsegs); + ep->re_max_inline_send = + ep->re_inline_send - rpcrdma_max_call_header_size(maxsegs); + ep->re_max_inline_recv = + ep->re_inline_recv - rpcrdma_max_reply_header_size(maxsegs); } /* The client can send a request inline as long as the RPCRDMA header @@ -134,7 +133,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdr = &rqst->rq_snd_buf; unsigned int count, remaining, offset; - if (xdr->len > r_xprt->rx_ep.rep_max_inline_send) + if (xdr->len > r_xprt->rx_ep.re_max_inline_send) return false; if (xdr->page_len) { @@ -145,7 +144,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, remaining -= min_t(unsigned int, PAGE_SIZE - offset, remaining); offset = 0; - if (++count > r_xprt->rx_ep.rep_attr.cap.max_send_sge) + if (++count > r_xprt->rx_ep.re_attr.cap.max_send_sge) return false; } } @@ -162,7 +161,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { - return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.rep_max_inline_recv; + return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.re_max_inline_recv; } /* The client is required to provide a Reply chunk if the maximum @@ -176,7 +175,7 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt, const struct xdr_buf *buf = &rqst->rq_rcv_buf; return (buf->head[0].iov_len + buf->tail[0].iov_len) < - r_xprt->rx_ep.rep_max_inline_recv; + r_xprt->rx_ep.re_max_inline_recv; } /* Split @vec on page boundaries into SGEs. FMR registers pages, not @@ -255,7 +254,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, /* When encoding a Read chunk, the tail iovec contains an * XDR pad and may be omitted. */ - if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup) + if (type == rpcrdma_readch && r_xprt->rx_ep.re_implicit_roundup) goto out; /* When encoding a Write chunk, some servers need to see an @@ -263,7 +262,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, * layer provides space in the tail iovec that may be used * for this purpose. */ - if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup) + if (type == rpcrdma_writech && r_xprt->rx_ep.re_implicit_roundup) goto out; if (xdrbuf->tail[0].iov_len) @@ -1476,8 +1475,8 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (credits == 0) credits = 1; /* don't deadlock */ - else if (credits > r_xprt->rx_ep.rep_max_requests) - credits = r_xprt->rx_ep.rep_max_requests; + else if (credits > r_xprt->rx_ep.re_max_requests) + credits = r_xprt->rx_ep.re_max_requests; if (buf->rb_credits != credits) rpcrdma_update_cwnd(r_xprt, credits); rpcrdma_post_recvs(r_xprt, false); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index d7b7dab0aeb6..4352fd6e9817 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -238,11 +238,12 @@ xprt_rdma_connect_worker(struct work_struct *work) struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt, rx_connect_worker.work); struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; int rc; rc = rpcrdma_xprt_connect(r_xprt); xprt_clear_connecting(xprt); - if (r_xprt->rx_ep.rep_connected > 0) { + if (ep->re_connect_status > 0) { xprt->stat.connect_count++; xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; @@ -265,7 +266,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); trace_xprtrdma_op_inject_dsc(r_xprt); - rdma_disconnect(r_xprt->rx_ia.ri_id); + rdma_disconnect(r_xprt->rx_ep.re_id); } /** @@ -355,6 +356,7 @@ xprt_setup_rdma(struct xprt_create *args) INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, xprt_rdma_connect_worker); + xprt->max_payload = RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT; dprintk("RPC: %s: %s:%s\n", __func__, @@ -489,10 +491,11 @@ static void xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_ep *ep = &r_xprt->rx_ep; unsigned long delay; delay = 0; - if (r_xprt->rx_ep.rep_connected != 0) { + if (ep->re_connect_status != 0) { delay = xprt_reconnect_delay(xprt); xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index dfe680e3234a..10826982ddf8 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -97,17 +97,17 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); */ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rdma_cm_id *id = r_xprt->rx_ep.re_id; /* Flush Receives, then wait for deferred Reply work * to complete. */ - ib_drain_rq(ia->ri_id->qp); + ib_drain_rq(id->qp); /* Deferred Reply processing might have scheduled * local invalidations. */ - ib_drain_sq(ia->ri_id->qp); + ib_drain_sq(id->qp); } /** @@ -140,8 +140,9 @@ void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_xprt *r_xprt = cq->cq_context; struct rpc_xprt *xprt = &r_xprt->rx_xprt; - if (wc->status != IB_WC_SUCCESS && r_xprt->rx_ep.rep_connected == 1) { - r_xprt->rx_ep.rep_connected = -ECONNABORTED; + if (wc->status != IB_WC_SUCCESS && + r_xprt->rx_ep.re_connect_status == 1) { + r_xprt->rx_ep.re_connect_status = -ECONNABORTED; trace_xprtrdma_flush_dct(r_xprt, wc->status); xprt_force_disconnect(xprt); } @@ -180,7 +181,7 @@ static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_receive(wc); - --r_xprt->rx_ep.rep_receive_count; + --r_xprt->rx_ep.re_receive_count; if (wc->status != IB_WC_SUCCESS) goto out_flushed; @@ -209,24 +210,24 @@ static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt, unsigned int rsize, wsize; /* Default settings for RPC-over-RDMA Version One */ - r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize; + ep->re_implicit_roundup = xprt_rdma_pad_optimize; rsize = RPCRDMA_V1_DEF_INLINE_SIZE; wsize = RPCRDMA_V1_DEF_INLINE_SIZE; if (pmsg && pmsg->cp_magic == rpcrdma_cmp_magic && pmsg->cp_version == RPCRDMA_CMP_VERSION) { - r_xprt->rx_ia.ri_implicit_roundup = true; + ep->re_implicit_roundup = true; rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); } - if (rsize < ep->rep_inline_recv) - ep->rep_inline_recv = rsize; - if (wsize < ep->rep_inline_send) - ep->rep_inline_send = wsize; + if (rsize < ep->re_inline_recv) + ep->re_inline_recv = rsize; + if (wsize < ep->re_inline_send) + ep->re_inline_send = wsize; - rpcrdma_set_max_header_sizes(r_xprt); + rpcrdma_set_max_header_sizes(ep); } /** @@ -241,7 +242,6 @@ static int rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct rpcrdma_xprt *r_xprt = id->context; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpc_xprt *xprt = &r_xprt->rx_xprt; @@ -251,57 +251,57 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: - ia->ri_async_rc = 0; - complete(&ia->ri_done); + ep->re_async_rc = 0; + complete(&ep->re_done); return 0; case RDMA_CM_EVENT_ADDR_ERROR: - ia->ri_async_rc = -EPROTO; - complete(&ia->ri_done); + ep->re_async_rc = -EPROTO; + complete(&ep->re_done); return 0; case RDMA_CM_EVENT_ROUTE_ERROR: - ia->ri_async_rc = -ENETUNREACH; - complete(&ia->ri_done); + ep->re_async_rc = -ENETUNREACH; + complete(&ep->re_done); return 0; case RDMA_CM_EVENT_DEVICE_REMOVAL: #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) pr_info("rpcrdma: removing device %s for %s:%s\n", - ia->ri_id->device->name, + ep->re_id->device->name, rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); #endif - init_completion(&ia->ri_remove_done); - ep->rep_connected = -ENODEV; + init_completion(&ep->re_remove_done); + ep->re_connect_status = -ENODEV; xprt_force_disconnect(xprt); - wait_for_completion(&ia->ri_remove_done); + wait_for_completion(&ep->re_remove_done); trace_xprtrdma_remove(r_xprt); /* Return 1 to ensure the core destroys the id. */ return 1; case RDMA_CM_EVENT_ESTABLISHED: ++xprt->connect_cookie; - ep->rep_connected = 1; + ep->re_connect_status = 1; rpcrdma_update_cm_private(r_xprt, &event->param.conn); trace_xprtrdma_inline_thresh(r_xprt); - wake_up_all(&ep->rep_connect_wait); + wake_up_all(&ep->re_connect_wait); break; case RDMA_CM_EVENT_CONNECT_ERROR: - ep->rep_connected = -ENOTCONN; + ep->re_connect_status = -ENOTCONN; goto disconnected; case RDMA_CM_EVENT_UNREACHABLE: - ep->rep_connected = -ENETUNREACH; + ep->re_connect_status = -ENETUNREACH; goto disconnected; case RDMA_CM_EVENT_REJECTED: dprintk("rpcrdma: connection to %s:%s rejected: %s\n", rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), rdma_reject_msg(id, event->status)); - ep->rep_connected = -ECONNREFUSED; + ep->re_connect_status = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) - ep->rep_connected = -EAGAIN; + ep->re_connect_status = -EAGAIN; goto disconnected; case RDMA_CM_EVENT_DISCONNECTED: - ep->rep_connected = -ECONNABORTED; + ep->re_connect_status = -ECONNABORTED; disconnected: xprt_force_disconnect(xprt); - wake_up_all(&ep->rep_connect_wait); + wake_up_all(&ep->re_connect_wait); break; default: break; @@ -309,46 +309,46 @@ disconnected: dprintk("RPC: %s: %s:%s on %s/frwr: %s\n", __func__, rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), - ia->ri_id->device->name, rdma_event_msg(event->event)); + ep->re_id->device->name, rdma_event_msg(event->event)); return 0; } -static struct rdma_cm_id * -rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) +static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_ep *ep) { unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct rdma_cm_id *id; int rc; - init_completion(&ia->ri_done); + init_completion(&ep->re_done); - id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler, - xprt, RDMA_PS_TCP, IB_QPT_RC); + id = rdma_create_id(xprt->xprt_net, rpcrdma_cm_event_handler, r_xprt, + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(id)) return id; - ia->ri_async_rc = -ETIMEDOUT; - rc = rdma_resolve_addr(id, NULL, - (struct sockaddr *)&xprt->rx_xprt.addr, + ep->re_async_rc = -ETIMEDOUT; + rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr, RDMA_RESOLVE_TIMEOUT); if (rc) goto out; - rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); + rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout); if (rc < 0) goto out; - rc = ia->ri_async_rc; + rc = ep->re_async_rc; if (rc) goto out; - ia->ri_async_rc = -ETIMEDOUT; + ep->re_async_rc = -ETIMEDOUT; rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); if (rc) goto out; - rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); + rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout); if (rc < 0) goto out; - rc = ia->ri_async_rc; + rc = ep->re_async_rc; if (rc) goto out; @@ -366,102 +366,101 @@ out: static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; + struct rpcrdma_connect_private *pmsg = &ep->re_cm_private; struct rdma_cm_id *id; int rc; - id = rpcrdma_create_id(r_xprt, ia); + id = rpcrdma_create_id(r_xprt, ep); if (IS_ERR(id)) return PTR_ERR(id); - ep->rep_max_requests = r_xprt->rx_xprt.max_reqs; - ep->rep_inline_send = xprt_rdma_max_inline_write; - ep->rep_inline_recv = xprt_rdma_max_inline_read; - - rc = frwr_query_device(r_xprt, id->device); + ep->re_max_requests = r_xprt->rx_xprt.max_reqs; + ep->re_inline_send = xprt_rdma_max_inline_write; + ep->re_inline_recv = xprt_rdma_max_inline_read; + rc = frwr_query_device(ep, id->device); if (rc) goto out_destroy; - r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->rep_max_requests); + r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->re_max_requests); - ep->rep_attr.event_handler = rpcrdma_qp_event_handler; - ep->rep_attr.qp_context = ep; - ep->rep_attr.srq = NULL; - ep->rep_attr.cap.max_inline_data = 0; - ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; - ep->rep_attr.qp_type = IB_QPT_RC; - ep->rep_attr.port_num = ~0; + ep->re_attr.event_handler = rpcrdma_qp_event_handler; + ep->re_attr.qp_context = ep; + ep->re_attr.srq = NULL; + ep->re_attr.cap.max_inline_data = 0; + ep->re_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + ep->re_attr.qp_type = IB_QPT_RC; + ep->re_attr.port_num = ~0; dprintk("RPC: %s: requested max: dtos: send %d recv %d; " "iovs: send %d recv %d\n", __func__, - ep->rep_attr.cap.max_send_wr, - ep->rep_attr.cap.max_recv_wr, - ep->rep_attr.cap.max_send_sge, - ep->rep_attr.cap.max_recv_sge); - - ep->rep_send_batch = ep->rep_max_requests >> 3; - ep->rep_send_count = ep->rep_send_batch; - init_waitqueue_head(&ep->rep_connect_wait); - ep->rep_receive_count = 0; - - ep->rep_attr.send_cq = ib_alloc_cq_any(id->device, r_xprt, - ep->rep_attr.cap.max_send_wr, - IB_POLL_WORKQUEUE); - if (IS_ERR(ep->rep_attr.send_cq)) { - rc = PTR_ERR(ep->rep_attr.send_cq); + ep->re_attr.cap.max_send_wr, + ep->re_attr.cap.max_recv_wr, + ep->re_attr.cap.max_send_sge, + ep->re_attr.cap.max_recv_sge); + + ep->re_send_batch = ep->re_max_requests >> 3; + ep->re_send_count = ep->re_send_batch; + init_waitqueue_head(&ep->re_connect_wait); + + ep->re_attr.send_cq = ib_alloc_cq_any(id->device, r_xprt, + ep->re_attr.cap.max_send_wr, + IB_POLL_WORKQUEUE); + if (IS_ERR(ep->re_attr.send_cq)) { + rc = PTR_ERR(ep->re_attr.send_cq); goto out_destroy; } - ep->rep_attr.recv_cq = ib_alloc_cq_any(id->device, r_xprt, - ep->rep_attr.cap.max_recv_wr, - IB_POLL_WORKQUEUE); - if (IS_ERR(ep->rep_attr.recv_cq)) { - rc = PTR_ERR(ep->rep_attr.recv_cq); + ep->re_attr.recv_cq = ib_alloc_cq_any(id->device, r_xprt, + ep->re_attr.cap.max_recv_wr, + IB_POLL_WORKQUEUE); + if (IS_ERR(ep->re_attr.recv_cq)) { + rc = PTR_ERR(ep->re_attr.recv_cq); goto out_destroy; } + ep->re_receive_count = 0; /* Initialize cma parameters */ - memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma)); + memset(&ep->re_remote_cma, 0, sizeof(ep->re_remote_cma)); /* Prepare RDMA-CM private message */ pmsg->cp_magic = rpcrdma_cmp_magic; pmsg->cp_version = RPCRDMA_CMP_VERSION; pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK; - pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->rep_inline_send); - pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->rep_inline_recv); - ep->rep_remote_cma.private_data = pmsg; - ep->rep_remote_cma.private_data_len = sizeof(*pmsg); + pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->re_inline_send); + pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->re_inline_recv); + ep->re_remote_cma.private_data = pmsg; + ep->re_remote_cma.private_data_len = sizeof(*pmsg); /* Client offers RDMA Read but does not initiate */ - ep->rep_remote_cma.initiator_depth = 0; - ep->rep_remote_cma.responder_resources = + ep->re_remote_cma.initiator_depth = 0; + ep->re_remote_cma.responder_resources = min_t(int, U8_MAX, id->device->attrs.max_qp_rd_atom); /* Limit transport retries so client can detect server * GID changes quickly. RPC layer handles re-establishing * transport connection and retransmission. */ - ep->rep_remote_cma.retry_count = 6; + ep->re_remote_cma.retry_count = 6; /* RPC-over-RDMA handles its own flow control. In addition, * make all RNR NAKs visible so we know that RPC-over-RDMA * flow control is working correctly (no NAKs should be seen). */ - ep->rep_remote_cma.flow_control = 0; - ep->rep_remote_cma.rnr_retry_count = 0; + ep->re_remote_cma.flow_control = 0; + ep->re_remote_cma.rnr_retry_count = 0; - ia->ri_pd = ib_alloc_pd(id->device, 0); - if (IS_ERR(ia->ri_pd)) { - rc = PTR_ERR(ia->ri_pd); + ep->re_pd = ib_alloc_pd(id->device, 0); + if (IS_ERR(ep->re_pd)) { + rc = PTR_ERR(ep->re_pd); goto out_destroy; } - rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); + rc = rdma_create_qp(id, ep->re_pd, &ep->re_attr); if (rc) goto out_destroy; - ia->ri_id = id; + + ep->re_id = id; return 0; out_destroy: @@ -473,23 +472,22 @@ out_destroy: static void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - if (ia->ri_id && ia->ri_id->qp) { - rdma_destroy_qp(ia->ri_id); - ia->ri_id->qp = NULL; + if (ep->re_id && ep->re_id->qp) { + rdma_destroy_qp(ep->re_id); + ep->re_id->qp = NULL; } - if (ep->rep_attr.recv_cq) - ib_free_cq(ep->rep_attr.recv_cq); - ep->rep_attr.recv_cq = NULL; - if (ep->rep_attr.send_cq) - ib_free_cq(ep->rep_attr.send_cq); - ep->rep_attr.send_cq = NULL; + if (ep->re_attr.recv_cq) + ib_free_cq(ep->re_attr.recv_cq); + ep->re_attr.recv_cq = NULL; + if (ep->re_attr.send_cq) + ib_free_cq(ep->re_attr.send_cq); + ep->re_attr.send_cq = NULL; - if (ia->ri_pd) - ib_dealloc_pd(ia->ri_pd); - ia->ri_pd = NULL; + if (ep->re_pd) + ib_dealloc_pd(ep->re_pd); + ep->re_pd = NULL; } /* @@ -499,7 +497,6 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) { struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; int rc; retry: @@ -508,7 +505,7 @@ retry: if (rc) goto out_noupdate; - ep->rep_connected = 0; + ep->re_connect_status = 0; xprt_clear_connected(xprt); rpcrdma_reset_cwnd(r_xprt); @@ -518,17 +515,18 @@ retry: if (rc) goto out; - rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); + rc = rdma_connect(ep->re_id, &ep->re_remote_cma); if (rc) goto out; if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; - wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); - if (ep->rep_connected <= 0) { - if (ep->rep_connected == -EAGAIN) + wait_event_interruptible(ep->re_connect_wait, + ep->re_connect_status != 0); + if (ep->re_connect_status <= 0) { + if (ep->re_connect_status == -EAGAIN) goto retry; - rc = ep->rep_connected; + rc = ep->re_connect_status; goto out; } @@ -541,7 +539,7 @@ retry: out: if (rc) - ep->rep_connected = rc; + ep->re_connect_status = rc; out_noupdate: trace_xprtrdma_connect(r_xprt, rc); @@ -558,9 +556,8 @@ out_noupdate: void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rdma_cm_id *id = ia->ri_id; - int rc, status = ep->rep_connected; + struct rdma_cm_id *id = ep->re_id; + int rc, status = ep->re_connect_status; might_sleep(); @@ -569,10 +566,10 @@ void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt) rc = rdma_disconnect(id); if (!rc) - wait_event_interruptible(ep->rep_connect_wait, - ep->rep_connected != 1); + wait_event_interruptible(ep->re_connect_wait, + ep->re_connect_status != 1); else - ep->rep_connected = rc; + ep->re_connect_status = rc; trace_xprtrdma_disconnect(r_xprt, rc); if (id->qp) @@ -585,10 +582,10 @@ void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt) rpcrdma_ep_destroy(r_xprt); if (status == -ENODEV) - complete(&ia->ri_remove_done); + complete(&ep->re_remove_done); else rdma_destroy_id(id); - ia->ri_id = NULL; + ep->re_id = NULL; } /* Fixed-size circular FIFO queue. This implementation is wait-free and @@ -625,7 +622,7 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep) { struct rpcrdma_sendctx *sc; - sc = kzalloc(struct_size(sc, sc_sges, ep->rep_attr.cap.max_send_sge), + sc = kzalloc(struct_size(sc, sc_sges, ep->re_attr.cap.max_send_sge), GFP_KERNEL); if (!sc) return NULL; @@ -645,7 +642,7 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) * the ->send_request call to fail temporarily before too many * Sends are posted. */ - i = r_xprt->rx_ep.rep_max_requests + RPCRDMA_MAX_BC_REQUESTS; + i = r_xprt->rx_ep.re_max_requests + RPCRDMA_MAX_BC_REQUESTS; buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL); if (!buf->rb_sc_ctxs) return -ENOMEM; @@ -756,10 +753,10 @@ static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; unsigned int count; - for (count = 0; count < ia->ri_max_rdma_segs; count++) { + for (count = 0; count < ep->re_max_rdma_segs; count++) { struct rpcrdma_mr *mr; int rc; @@ -808,7 +805,7 @@ void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt) /* If there is no underlying connection, it's no use * to wake the refresh worker. */ - if (ep->rep_connected == 1) { + if (ep->re_connect_status == 1) { /* The work is scheduled on a WQ_MEM_RECLAIM * workqueue in order to prevent MR allocation * from recursing into NFS during direct reclaim. @@ -872,7 +869,7 @@ int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Compute maximum header buffer size in bytes */ maxhdrsize = rpcrdma_fixed_maxsz + 3 + - r_xprt->rx_ia.ri_max_rdma_segs * rpcrdma_readchunk_maxsz; + r_xprt->rx_ep.re_max_rdma_segs * rpcrdma_readchunk_maxsz; maxhdrsize *= sizeof(__be32); rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize), DMA_TO_DEVICE, GFP_KERNEL); @@ -950,7 +947,7 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, if (rep == NULL) goto out; - rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep.rep_inline_recv, + rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep.re_inline_recv, DMA_FROM_DEVICE, GFP_KERNEL); if (!rep->rr_rdmabuf) goto out_free; @@ -1175,7 +1172,7 @@ void rpcrdma_mr_put(struct rpcrdma_mr *mr) if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device, + ib_dma_unmap_sg(r_xprt->rx_ep.re_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } @@ -1293,7 +1290,7 @@ bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags) bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_regbuf *rb) { - struct ib_device *device = r_xprt->rx_ia.ri_id->device; + struct ib_device *device = r_xprt->rx_ep.re_id->device; if (rb->rg_direction == DMA_NONE) return false; @@ -1306,7 +1303,7 @@ bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, } rb->rg_device = device; - rb->rg_iov.lkey = r_xprt->rx_ia.ri_pd->local_dma_lkey; + rb->rg_iov.lkey = r_xprt->rx_ep.re_pd->local_dma_lkey; return true; } @@ -1345,12 +1342,12 @@ int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) struct rpcrdma_ep *ep = &r_xprt->rx_ep; int rc; - if (!ep->rep_send_count || kref_read(&req->rl_kref) > 1) { + if (!ep->re_send_count || kref_read(&req->rl_kref) > 1) { send_wr->send_flags |= IB_SEND_SIGNALED; - ep->rep_send_count = ep->rep_send_batch; + ep->re_send_count = ep->re_send_batch; } else { send_wr->send_flags &= ~IB_SEND_SIGNALED; - --ep->rep_send_count; + --ep->re_send_count; } rc = frwr_send(r_xprt, req); @@ -1378,9 +1375,9 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) count = 0; needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); - if (likely(ep->rep_receive_count > needed)) + if (likely(ep->re_receive_count > needed)) goto out; - needed -= ep->rep_receive_count; + needed -= ep->re_receive_count; if (!temp) needed += RPCRDMA_MAX_RECV_BATCH; @@ -1406,7 +1403,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!wr) goto out; - rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr, + rc = ib_post_recv(r_xprt->rx_ep.re_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); out: trace_xprtrdma_post_recvs(r_xprt, count, rc); @@ -1420,6 +1417,6 @@ out: --count; } } - ep->rep_receive_count += count; + ep->re_receive_count += count; return; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 8a3ac9d7ee81..f3c0b826c9ed 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -65,38 +65,32 @@ #define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) /* - * Interface Adapter -- one per transport instance + * RDMA Endpoint -- connection endpoint details */ -struct rpcrdma_ia { - struct rdma_cm_id *ri_id; - struct ib_pd *ri_pd; - int ri_async_rc; - unsigned int ri_max_rdma_segs; - unsigned int ri_max_frwr_depth; - bool ri_implicit_roundup; - enum ib_mr_type ri_mrtype; - struct completion ri_done; - struct completion ri_remove_done; -}; - -/* - * RDMA Endpoint -- one per transport instance - */ - struct rpcrdma_ep { - unsigned int rep_send_count; - unsigned int rep_send_batch; - unsigned int rep_max_inline_send; - unsigned int rep_max_inline_recv; - int rep_connected; - struct ib_qp_init_attr rep_attr; - wait_queue_head_t rep_connect_wait; - struct rpcrdma_connect_private rep_cm_private; - struct rdma_conn_param rep_remote_cma; - unsigned int rep_max_requests; /* depends on device */ - unsigned int rep_inline_send; /* negotiated */ - unsigned int rep_inline_recv; /* negotiated */ - int rep_receive_count; + struct rdma_cm_id *re_id; + struct ib_pd *re_pd; + unsigned int re_max_rdma_segs; + unsigned int re_max_fr_depth; + bool re_implicit_roundup; + enum ib_mr_type re_mrtype; + struct completion re_done; + struct completion re_remove_done; + unsigned int re_send_count; + unsigned int re_send_batch; + unsigned int re_max_inline_send; + unsigned int re_max_inline_recv; + int re_async_rc; + int re_connect_status; + struct ib_qp_init_attr re_attr; + wait_queue_head_t re_connect_wait; + struct rpcrdma_connect_private + re_cm_private; + struct rdma_conn_param re_remote_cma; + int re_receive_count; + unsigned int re_max_requests; /* depends on device */ + unsigned int re_inline_send; /* negotiated */ + unsigned int re_inline_recv; /* negotiated */ }; /* Pre-allocate extra Work Requests for handling backward receives @@ -417,7 +411,6 @@ struct rpcrdma_stats { */ struct rpcrdma_xprt { struct rpc_xprt rx_xprt; - struct rpcrdma_ia rx_ia; struct rpcrdma_ep rx_ep; struct rpcrdma_buffer rx_buf; struct delayed_work rx_connect_worker; @@ -522,8 +515,7 @@ rpcrdma_data_dir(bool writing) /* Memory registration calls xprtrdma/frwr_ops.c */ void frwr_reset(struct rpcrdma_req *req); -int frwr_query_device(struct rpcrdma_xprt *r_xprt, - const struct ib_device *device); +int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device); int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr); void frwr_release_mr(struct rpcrdma_mr *mr); struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, @@ -555,7 +547,7 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, enum rpcrdma_chunktype rtype); void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc); int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); -void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); +void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep); void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt); void rpcrdma_complete_rqst(struct rpcrdma_rep *rep); void rpcrdma_reply_handler(struct rpcrdma_rep *rep); -- cgit v1.2.3 From 745b734c9bb80559b8ca64ae688901afefc1c3fd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Feb 2020 17:01:00 -0500 Subject: xprtrdma: Extract sockaddr from struct rdma_cm_id rpcrdma_cm_event_handler() is always passed an @id pointer that is valid. However, in a subsequent patch, we won't be able to extract an r_xprt in every case. So instead of using the r_xprt's presentation address strings, extract them from struct rdma_cm_id. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 78 ++++++++++++++++++++++++++++-------------- net/sunrpc/xprtrdma/verbs.c | 35 ++++++++----------- 2 files changed, 67 insertions(+), 46 deletions(-) (limited to 'net/sunrpc') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 843269f0e291..295f75b9b796 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -375,47 +375,74 @@ TRACE_EVENT(xprtrdma_cm_event, TRACE_EVENT(xprtrdma_inline_thresh, TP_PROTO( - const struct rpcrdma_xprt *r_xprt + const struct rpcrdma_ep *ep ), - TP_ARGS(r_xprt), + TP_ARGS(ep), TP_STRUCT__entry( - __field(const void *, r_xprt) __field(unsigned int, inline_send) __field(unsigned int, inline_recv) __field(unsigned int, max_send) __field(unsigned int, max_recv) - __string(addr, rpcrdma_addrstr(r_xprt)) - __string(port, rpcrdma_portstr(r_xprt)) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( - const struct rpcrdma_ep *ep = &r_xprt->rx_ep; + const struct rdma_cm_id *id = ep->re_id; - __entry->r_xprt = r_xprt; __entry->inline_send = ep->re_inline_send; __entry->inline_recv = ep->re_inline_recv; __entry->max_send = ep->re_max_inline_send; __entry->max_recv = ep->re_max_inline_recv; - __assign_str(addr, rpcrdma_addrstr(r_xprt)); - __assign_str(port, rpcrdma_portstr(r_xprt)); + memcpy(__entry->srcaddr, &id->route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id->route.addr.dst_addr, + sizeof(struct sockaddr_in6)); ), - TP_printk("peer=[%s]:%s r_xprt=%p neg send/recv=%u/%u, calc send/recv=%u/%u", - __get_str(addr), __get_str(port), __entry->r_xprt, + TP_printk("%pISpc -> %pISpc neg send/recv=%u/%u, calc send/recv=%u/%u", + __entry->srcaddr, __entry->dstaddr, __entry->inline_send, __entry->inline_recv, __entry->max_send, __entry->max_recv ) ); +TRACE_EVENT(xprtrdma_remove, + TP_PROTO( + const struct rpcrdma_ep *ep + ), + + TP_ARGS(ep), + + TP_STRUCT__entry( + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) + __string(name, ep->re_id->device->name) + ), + + TP_fast_assign( + const struct rdma_cm_id *id = ep->re_id; + + memcpy(__entry->srcaddr, &id->route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id->route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + __assign_str(name, id->device->name); + ), + + TP_printk("%pISpc -> %pISpc device=%s", + __entry->srcaddr, __entry->dstaddr, __get_str(name) + ) +); + DEFINE_CONN_EVENT(connect); DEFINE_CONN_EVENT(disconnect); DEFINE_CONN_EVENT(flush_dct); DEFINE_RXPRT_EVENT(xprtrdma_create); DEFINE_RXPRT_EVENT(xprtrdma_op_destroy); -DEFINE_RXPRT_EVENT(xprtrdma_remove); DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc); DEFINE_RXPRT_EVENT(xprtrdma_op_close); DEFINE_RXPRT_EVENT(xprtrdma_op_setport); @@ -482,32 +509,33 @@ TRACE_EVENT(xprtrdma_op_set_cto, TRACE_EVENT(xprtrdma_qp_event, TP_PROTO( - const struct rpcrdma_xprt *r_xprt, + const struct rpcrdma_ep *ep, const struct ib_event *event ), - TP_ARGS(r_xprt, event), + TP_ARGS(ep, event), TP_STRUCT__entry( - __field(const void *, r_xprt) - __field(unsigned int, event) + __field(unsigned long, event) __string(name, event->device->name) - __string(addr, rpcrdma_addrstr(r_xprt)) - __string(port, rpcrdma_portstr(r_xprt)) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( - __entry->r_xprt = r_xprt; + const struct rdma_cm_id *id = ep->re_id; + __entry->event = event->event; __assign_str(name, event->device->name); - __assign_str(addr, rpcrdma_addrstr(r_xprt)); - __assign_str(port, rpcrdma_portstr(r_xprt)); + memcpy(__entry->srcaddr, &id->route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id->route.addr.dst_addr, + sizeof(struct sockaddr_in6)); ), - TP_printk("peer=[%s]:%s r_xprt=%p: dev %s: %s (%u)", - __get_str(addr), __get_str(port), __entry->r_xprt, - __get_str(name), rdma_show_ib_event(__entry->event), - __entry->event + TP_printk("%pISpc -> %pISpc device=%s %s (%lu)", + __entry->srcaddr, __entry->dstaddr, __get_str(name), + rdma_show_ib_event(__entry->event), __entry->event ) ); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 10826982ddf8..37d07072bdbf 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -116,16 +116,14 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) * @context: ep that owns QP where event occurred * * Called from the RDMA provider (device driver) possibly in an interrupt - * context. + * context. The QP is always destroyed before the ID, so the ID will be + * reliably available when this handler is invoked. */ -static void -rpcrdma_qp_event_handler(struct ib_event *event, void *context) +static void rpcrdma_qp_event_handler(struct ib_event *event, void *context) { struct rpcrdma_ep *ep = context; - struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, - rx_ep); - trace_xprtrdma_qp_event(r_xprt, event); + trace_xprtrdma_qp_event(ep, event); } /** @@ -202,11 +200,10 @@ out_flushed: rpcrdma_rep_destroy(rep); } -static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt, +static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep, struct rdma_conn_param *param) { const struct rpcrdma_connect_private *pmsg = param->private_data; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; unsigned int rsize, wsize; /* Default settings for RPC-over-RDMA Version One */ @@ -241,6 +238,7 @@ static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt, static int rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { + struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr; struct rpcrdma_xprt *r_xprt = id->context; struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpc_xprt *xprt = &r_xprt->rx_xprt; @@ -263,24 +261,21 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) complete(&ep->re_done); return 0; case RDMA_CM_EVENT_DEVICE_REMOVAL: -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - pr_info("rpcrdma: removing device %s for %s:%s\n", - ep->re_id->device->name, - rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); -#endif + pr_info("rpcrdma: removing device %s for %pISpc\n", + ep->re_id->device->name, sap); init_completion(&ep->re_remove_done); ep->re_connect_status = -ENODEV; xprt_force_disconnect(xprt); wait_for_completion(&ep->re_remove_done); - trace_xprtrdma_remove(r_xprt); + trace_xprtrdma_remove(ep); /* Return 1 to ensure the core destroys the id. */ return 1; case RDMA_CM_EVENT_ESTABLISHED: ++xprt->connect_cookie; ep->re_connect_status = 1; - rpcrdma_update_cm_private(r_xprt, &event->param.conn); - trace_xprtrdma_inline_thresh(r_xprt); + rpcrdma_update_cm_private(ep, &event->param.conn); + trace_xprtrdma_inline_thresh(ep); wake_up_all(&ep->re_connect_wait); break; case RDMA_CM_EVENT_CONNECT_ERROR: @@ -290,9 +285,8 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) ep->re_connect_status = -ENETUNREACH; goto disconnected; case RDMA_CM_EVENT_REJECTED: - dprintk("rpcrdma: connection to %s:%s rejected: %s\n", - rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), - rdma_reject_msg(id, event->status)); + dprintk("rpcrdma: connection to %pISpc rejected: %s\n", + sap, rdma_reject_msg(id, event->status)); ep->re_connect_status = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) ep->re_connect_status = -EAGAIN; @@ -307,8 +301,7 @@ disconnected: break; } - dprintk("RPC: %s: %s:%s on %s/frwr: %s\n", __func__, - rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), + dprintk("RPC: %s: %pISpc on %s/frwr: %s\n", __func__, sap, ep->re_id->device->name, rdma_event_msg(event->event)); return 0; } -- cgit v1.2.3 From e28ce90083f032ca0e8ea03478f5b6a38f5930f7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 21 Feb 2020 17:01:05 -0500 Subject: xprtrdma: kmalloc rpcrdma_ep separate from rpcrdma_xprt Change the rpcrdma_xprt_disconnect() function so that it no longer waits for the DISCONNECTED event. This prevents blocking if the remote is unresponsive. In rpcrdma_xprt_disconnect(), the transport's rpcrdma_ep is detached. Upon return from rpcrdma_xprt_disconnect(), the transport (r_xprt) is ready immediately for a new connection. The RDMA_CM_DEVICE_REMOVAL and RDMA_CM_DISCONNECTED events are now handled almost identically. However, because the lifetimes of rpcrdma_xprt structures and rpcrdma_ep structures are now independent, creating an rpcrdma_ep needs to take a module ref count. The ep now owns most of the hardware resources for a transport. Also, a kref is needed to ensure that rpcrdma_ep sticks around long enough for the cm_event_handler to finish. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 63 +------------ net/sunrpc/xprtrdma/backchannel.c | 4 +- net/sunrpc/xprtrdma/frwr_ops.c | 12 +-- net/sunrpc/xprtrdma/rpc_rdma.c | 17 ++-- net/sunrpc/xprtrdma/transport.c | 37 ++++---- net/sunrpc/xprtrdma/verbs.c | 194 ++++++++++++++++++++------------------ net/sunrpc/xprtrdma/xprt_rdma.h | 7 +- 7 files changed, 143 insertions(+), 191 deletions(-) (limited to 'net/sunrpc') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 295f75b9b796..81b87428f166 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -104,7 +104,7 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class, TP_fast_assign( __entry->r_xprt = r_xprt; __entry->rc = rc; - __entry->connect_status = r_xprt->rx_ep.re_connect_status; + __entry->connect_status = r_xprt->rx_ep->re_connect_status; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), @@ -342,37 +342,6 @@ DECLARE_EVENT_CLASS(xprtrdma_cb_event, ** Connection events **/ -TRACE_EVENT(xprtrdma_cm_event, - TP_PROTO( - const struct rpcrdma_xprt *r_xprt, - struct rdma_cm_event *event - ), - - TP_ARGS(r_xprt, event), - - TP_STRUCT__entry( - __field(const void *, r_xprt) - __field(unsigned int, event) - __field(int, status) - __string(addr, rpcrdma_addrstr(r_xprt)) - __string(port, rpcrdma_portstr(r_xprt)) - ), - - TP_fast_assign( - __entry->r_xprt = r_xprt; - __entry->event = event->event; - __entry->status = event->status; - __assign_str(addr, rpcrdma_addrstr(r_xprt)); - __assign_str(port, rpcrdma_portstr(r_xprt)); - ), - - TP_printk("peer=[%s]:%s r_xprt=%p: %s (%u/%d)", - __get_str(addr), __get_str(port), - __entry->r_xprt, rdma_show_cm_event(__entry->event), - __entry->event, __entry->status - ) -); - TRACE_EVENT(xprtrdma_inline_thresh, TP_PROTO( const struct rpcrdma_ep *ep @@ -409,34 +378,6 @@ TRACE_EVENT(xprtrdma_inline_thresh, ) ); -TRACE_EVENT(xprtrdma_remove, - TP_PROTO( - const struct rpcrdma_ep *ep - ), - - TP_ARGS(ep), - - TP_STRUCT__entry( - __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) - __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) - __string(name, ep->re_id->device->name) - ), - - TP_fast_assign( - const struct rdma_cm_id *id = ep->re_id; - - memcpy(__entry->srcaddr, &id->route.addr.src_addr, - sizeof(struct sockaddr_in6)); - memcpy(__entry->dstaddr, &id->route.addr.dst_addr, - sizeof(struct sockaddr_in6)); - __assign_str(name, id->device->name); - ), - - TP_printk("%pISpc -> %pISpc device=%s", - __entry->srcaddr, __entry->dstaddr, __get_str(name) - ) -); - DEFINE_CONN_EVENT(connect); DEFINE_CONN_EVENT(disconnect); DEFINE_CONN_EVENT(flush_dct); @@ -831,7 +772,7 @@ TRACE_EVENT(xprtrdma_post_recvs, __entry->r_xprt = r_xprt; __entry->count = count; __entry->status = status; - __entry->posted = r_xprt->rx_ep.re_receive_count; + __entry->posted = r_xprt->rx_ep->re_receive_count; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 4b20102cf060..c92c1aac270a 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -44,7 +44,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; size_t maxmsg; maxmsg = min_t(unsigned int, ep->re_inline_send, ep->re_inline_recv); @@ -190,7 +190,7 @@ create_req: if (xprt->bc_alloc_count >= RPCRDMA_BACKWARD_WRS) return NULL; - size = min_t(size_t, r_xprt->rx_ep.re_inline_recv, PAGE_SIZE); + size = min_t(size_t, r_xprt->rx_ep->re_inline_recv, PAGE_SIZE); req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL); if (!req) return NULL; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 19bf422f010b..ef997880e17a 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -74,7 +74,7 @@ static void frwr_mr_recycle(struct rpcrdma_mr *mr) if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ep.re_id->device, + ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } @@ -115,7 +115,7 @@ void frwr_reset(struct rpcrdma_req *req) */ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) { - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned int depth = ep->re_max_fr_depth; struct scatterlist *sg; struct ib_mr *frmr; @@ -283,7 +283,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, int nsegs, bool writing, __be32 xid, struct rpcrdma_mr *mr) { - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct ib_reg_wr *reg_wr; int i, n, dma_nents; struct ib_mr *ibmr; @@ -405,7 +405,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) post_wr = &frwr->fr_regwr.wr; } - return ib_post_send(r_xprt->rx_ep.re_id->qp, post_wr, NULL); + return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL); } /** @@ -535,7 +535,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ep.re_id->qp, first, &bad_wr); + rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr); /* The final LOCAL_INV WR in the chain is supposed to * do the wake. If it was never posted, the wake will @@ -640,7 +640,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ep.re_id->qp, first, &bad_wr); + rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr); if (!rc) return; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index ad7e6b0187bd..d1af48e0139c 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -131,9 +131,10 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { struct xdr_buf *xdr = &rqst->rq_snd_buf; + struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned int count, remaining, offset; - if (xdr->len > r_xprt->rx_ep.re_max_inline_send) + if (xdr->len > ep->re_max_inline_send) return false; if (xdr->page_len) { @@ -144,7 +145,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, remaining -= min_t(unsigned int, PAGE_SIZE - offset, remaining); offset = 0; - if (++count > r_xprt->rx_ep.re_attr.cap.max_send_sge) + if (++count > ep->re_attr.cap.max_send_sge) return false; } } @@ -161,7 +162,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { - return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.re_max_inline_recv; + return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep->re_max_inline_recv; } /* The client is required to provide a Reply chunk if the maximum @@ -175,7 +176,7 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt, const struct xdr_buf *buf = &rqst->rq_rcv_buf; return (buf->head[0].iov_len + buf->tail[0].iov_len) < - r_xprt->rx_ep.re_max_inline_recv; + r_xprt->rx_ep->re_max_inline_recv; } /* Split @vec on page boundaries into SGEs. FMR registers pages, not @@ -254,7 +255,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, /* When encoding a Read chunk, the tail iovec contains an * XDR pad and may be omitted. */ - if (type == rpcrdma_readch && r_xprt->rx_ep.re_implicit_roundup) + if (type == rpcrdma_readch && r_xprt->rx_ep->re_implicit_roundup) goto out; /* When encoding a Write chunk, some servers need to see an @@ -262,7 +263,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, * layer provides space in the tail iovec that may be used * for this purpose. */ - if (type == rpcrdma_writech && r_xprt->rx_ep.re_implicit_roundup) + if (type == rpcrdma_writech && r_xprt->rx_ep->re_implicit_roundup) goto out; if (xdrbuf->tail[0].iov_len) @@ -1475,8 +1476,8 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (credits == 0) credits = 1; /* don't deadlock */ - else if (credits > r_xprt->rx_ep.re_max_requests) - credits = r_xprt->rx_ep.re_max_requests; + else if (credits > r_xprt->rx_ep->re_max_requests) + credits = r_xprt->rx_ep->re_max_requests; if (buf->rb_credits != credits) rpcrdma_update_cwnd(r_xprt, credits); rpcrdma_post_recvs(r_xprt, false); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 4352fd6e9817..659da37020a4 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -238,12 +238,12 @@ xprt_rdma_connect_worker(struct work_struct *work) struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt, rx_connect_worker.work); struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; int rc; rc = rpcrdma_xprt_connect(r_xprt); xprt_clear_connecting(xprt); - if (ep->re_connect_status > 0) { + if (r_xprt->rx_ep && r_xprt->rx_ep->re_connect_status > 0) { + xprt->connect_cookie++; xprt->stat.connect_count++; xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; @@ -266,7 +266,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); trace_xprtrdma_op_inject_dsc(r_xprt); - rdma_disconnect(r_xprt->rx_ep.re_id); + rdma_disconnect(r_xprt->rx_ep->re_id); } /** @@ -316,10 +316,15 @@ xprt_setup_rdma(struct xprt_create *args) if (args->addrlen > sizeof(xprt->addr)) return ERR_PTR(-EBADF); + if (!try_module_get(THIS_MODULE)) + return ERR_PTR(-EIO); + xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0, xprt_rdma_slot_table_entries); - if (!xprt) + if (!xprt) { + module_put(THIS_MODULE); return ERR_PTR(-ENOMEM); + } xprt->timeout = &xprt_rdma_default_timeout; xprt->connect_timeout = xprt->timeout->to_initval; @@ -348,11 +353,12 @@ xprt_setup_rdma(struct xprt_create *args) new_xprt = rpcx_to_rdmax(xprt); rc = rpcrdma_buffer_create(new_xprt); - if (rc) - goto out2; - - if (!try_module_get(THIS_MODULE)) - goto out4; + if (rc) { + xprt_rdma_free_addresses(xprt); + xprt_free(xprt); + module_put(THIS_MODULE); + return ERR_PTR(rc); + } INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, xprt_rdma_connect_worker); @@ -364,15 +370,6 @@ xprt_setup_rdma(struct xprt_create *args) xprt->address_strings[RPC_DISPLAY_PORT]); trace_xprtrdma_create(new_xprt); return xprt; - -out4: - rpcrdma_buffer_destroy(&new_xprt->rx_buf); - rc = -ENODEV; -out2: - trace_xprtrdma_op_destroy(new_xprt); - xprt_rdma_free_addresses(xprt); - xprt_free(xprt); - return ERR_PTR(rc); } /** @@ -491,11 +488,11 @@ static void xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned long delay; delay = 0; - if (ep->re_connect_status != 0) { + if (ep && ep->re_connect_status != 0) { delay = xprt_reconnect_delay(xprt); xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 37d07072bdbf..cdd84c09df10 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -84,7 +84,7 @@ static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep); static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt); -static void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt); +static int rpcrdma_ep_destroy(struct rpcrdma_ep *ep); static struct rpcrdma_regbuf * rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, gfp_t flags); @@ -97,7 +97,7 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); */ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) { - struct rdma_cm_id *id = r_xprt->rx_ep.re_id; + struct rdma_cm_id *id = r_xprt->rx_ep->re_id; /* Flush Receives, then wait for deferred Reply work * to complete. @@ -139,8 +139,8 @@ void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc) struct rpc_xprt *xprt = &r_xprt->rx_xprt; if (wc->status != IB_WC_SUCCESS && - r_xprt->rx_ep.re_connect_status == 1) { - r_xprt->rx_ep.re_connect_status = -ECONNABORTED; + r_xprt->rx_ep->re_connect_status == 1) { + r_xprt->rx_ep->re_connect_status = -ECONNABORTED; trace_xprtrdma_flush_dct(r_xprt, wc->status); xprt_force_disconnect(xprt); } @@ -179,7 +179,7 @@ static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_receive(wc); - --r_xprt->rx_ep.re_receive_count; + --r_xprt->rx_ep->re_receive_count; if (wc->status != IB_WC_SUCCESS) goto out_flushed; @@ -239,13 +239,11 @@ static int rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr; - struct rpcrdma_xprt *r_xprt = id->context; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpcrdma_ep *ep = id->context; + struct rpc_xprt *xprt = ep->re_xprt; might_sleep(); - trace_xprtrdma_cm_event(r_xprt, event); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: @@ -263,16 +261,13 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DEVICE_REMOVAL: pr_info("rpcrdma: removing device %s for %pISpc\n", ep->re_id->device->name, sap); - init_completion(&ep->re_remove_done); + /* fall through */ + case RDMA_CM_EVENT_ADDR_CHANGE: ep->re_connect_status = -ENODEV; xprt_force_disconnect(xprt); - wait_for_completion(&ep->re_remove_done); - trace_xprtrdma_remove(ep); - - /* Return 1 to ensure the core destroys the id. */ - return 1; + goto disconnected; case RDMA_CM_EVENT_ESTABLISHED: - ++xprt->connect_cookie; + kref_get(&ep->re_kref); ep->re_connect_status = 1; rpcrdma_update_cm_private(ep, &event->param.conn); trace_xprtrdma_inline_thresh(ep); @@ -294,9 +289,7 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DISCONNECTED: ep->re_connect_status = -ECONNABORTED; disconnected: - xprt_force_disconnect(xprt); - wake_up_all(&ep->re_connect_wait); - break; + return rpcrdma_ep_destroy(ep); default: break; } @@ -316,7 +309,7 @@ static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, init_completion(&ep->re_done); - id = rdma_create_id(xprt->xprt_net, rpcrdma_cm_event_handler, r_xprt, + id = rdma_create_id(xprt->xprt_net, rpcrdma_cm_event_handler, ep, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(id)) return id; @@ -352,25 +345,66 @@ out: return ERR_PTR(rc); } -/* - * Exported functions. +static void rpcrdma_ep_put(struct kref *kref) +{ + struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref); + + if (ep->re_id->qp) { + rdma_destroy_qp(ep->re_id); + ep->re_id->qp = NULL; + } + + if (ep->re_attr.recv_cq) + ib_free_cq(ep->re_attr.recv_cq); + ep->re_attr.recv_cq = NULL; + if (ep->re_attr.send_cq) + ib_free_cq(ep->re_attr.send_cq); + ep->re_attr.send_cq = NULL; + + if (ep->re_pd) + ib_dealloc_pd(ep->re_pd); + ep->re_pd = NULL; + + kfree(ep); + module_put(THIS_MODULE); +} + +/* Returns: + * %0 if @ep still has a positive kref count, or + * %1 if @ep was destroyed successfully. */ +static int rpcrdma_ep_destroy(struct rpcrdma_ep *ep) +{ + return kref_put(&ep->re_kref, rpcrdma_ep_put); +} static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_connect_private *pmsg = &ep->re_cm_private; + struct rpcrdma_connect_private *pmsg; + struct ib_device *device; struct rdma_cm_id *id; + struct rpcrdma_ep *ep; int rc; + ep = kzalloc(sizeof(*ep), GFP_NOFS); + if (!ep) + return -EAGAIN; + ep->re_xprt = &r_xprt->rx_xprt; + kref_init(&ep->re_kref); + id = rpcrdma_create_id(r_xprt, ep); - if (IS_ERR(id)) - return PTR_ERR(id); + if (IS_ERR(id)) { + rc = PTR_ERR(id); + goto out_free; + } + __module_get(THIS_MODULE); + device = id->device; + ep->re_id = id; ep->re_max_requests = r_xprt->rx_xprt.max_reqs; ep->re_inline_send = xprt_rdma_max_inline_write; ep->re_inline_recv = xprt_rdma_max_inline_read; - rc = frwr_query_device(ep, id->device); + rc = frwr_query_device(ep, device); if (rc) goto out_destroy; @@ -396,7 +430,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) ep->re_send_count = ep->re_send_batch; init_waitqueue_head(&ep->re_connect_wait); - ep->re_attr.send_cq = ib_alloc_cq_any(id->device, r_xprt, + ep->re_attr.send_cq = ib_alloc_cq_any(device, r_xprt, ep->re_attr.cap.max_send_wr, IB_POLL_WORKQUEUE); if (IS_ERR(ep->re_attr.send_cq)) { @@ -404,7 +438,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) goto out_destroy; } - ep->re_attr.recv_cq = ib_alloc_cq_any(id->device, r_xprt, + ep->re_attr.recv_cq = ib_alloc_cq_any(device, r_xprt, ep->re_attr.cap.max_recv_wr, IB_POLL_WORKQUEUE); if (IS_ERR(ep->re_attr.recv_cq)) { @@ -417,6 +451,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) memset(&ep->re_remote_cma, 0, sizeof(ep->re_remote_cma)); /* Prepare RDMA-CM private message */ + pmsg = &ep->re_cm_private; pmsg->cp_magic = rpcrdma_cmp_magic; pmsg->cp_version = RPCRDMA_CMP_VERSION; pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK; @@ -428,7 +463,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) /* Client offers RDMA Read but does not initiate */ ep->re_remote_cma.initiator_depth = 0; ep->re_remote_cma.responder_resources = - min_t(int, U8_MAX, id->device->attrs.max_qp_rd_atom); + min_t(int, U8_MAX, device->attrs.max_qp_rd_atom); /* Limit transport retries so client can detect server * GID changes quickly. RPC layer handles re-establishing @@ -443,7 +478,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) ep->re_remote_cma.flow_control = 0; ep->re_remote_cma.rnr_retry_count = 0; - ep->re_pd = ib_alloc_pd(id->device, 0); + ep->re_pd = ib_alloc_pd(device, 0); if (IS_ERR(ep->re_pd)) { rc = PTR_ERR(ep->re_pd); goto out_destroy; @@ -453,50 +488,36 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) if (rc) goto out_destroy; - ep->re_id = id; + r_xprt->rx_ep = ep; return 0; out_destroy: - rpcrdma_ep_destroy(r_xprt); + rpcrdma_ep_destroy(ep); rdma_destroy_id(id); +out_free: + kfree(ep); + r_xprt->rx_ep = NULL; return rc; } -static void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - - if (ep->re_id && ep->re_id->qp) { - rdma_destroy_qp(ep->re_id); - ep->re_id->qp = NULL; - } - - if (ep->re_attr.recv_cq) - ib_free_cq(ep->re_attr.recv_cq); - ep->re_attr.recv_cq = NULL; - if (ep->re_attr.send_cq) - ib_free_cq(ep->re_attr.send_cq); - ep->re_attr.send_cq = NULL; - - if (ep->re_pd) - ib_dealloc_pd(ep->re_pd); - ep->re_pd = NULL; -} - -/* - * Connect unconnected endpoint. +/** + * rpcrdma_xprt_connect - Connect an unconnected transport + * @r_xprt: controlling transport instance + * + * Returns 0 on success or a negative errno. */ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) { struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep; int rc; retry: rpcrdma_xprt_disconnect(r_xprt); rc = rpcrdma_ep_create(r_xprt); if (rc) - goto out_noupdate; + return rc; + ep = r_xprt->rx_ep; ep->re_connect_status = 0; xprt_clear_connected(xprt); @@ -533,8 +554,6 @@ retry: out: if (rc) ep->re_connect_status = rc; - -out_noupdate: trace_xprtrdma_connect(r_xprt, rc); return rc; } @@ -545,40 +564,33 @@ out_noupdate: * * Caller serializes. Either the transport send lock is held, * or we're being called to destroy the transport. + * + * On return, @r_xprt is completely divested of all hardware + * resources and prepared for the next ->connect operation. */ void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rdma_cm_id *id = ep->re_id; - int rc, status = ep->re_connect_status; - - might_sleep(); + struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct rdma_cm_id *id; + int rc; - if (!id) + if (!ep) return; + id = ep->re_id; rc = rdma_disconnect(id); - if (!rc) - wait_event_interruptible(ep->re_connect_wait, - ep->re_connect_status != 1); - else - ep->re_connect_status = rc; trace_xprtrdma_disconnect(r_xprt, rc); - if (id->qp) - rpcrdma_xprt_drain(r_xprt); + rpcrdma_xprt_drain(r_xprt); rpcrdma_reps_unmap(r_xprt); rpcrdma_reqs_reset(r_xprt); rpcrdma_mrs_destroy(r_xprt); rpcrdma_sendctxs_destroy(r_xprt); - rpcrdma_ep_destroy(r_xprt); - - if (status == -ENODEV) - complete(&ep->re_remove_done); - else + if (rpcrdma_ep_destroy(ep)) rdma_destroy_id(id); - ep->re_id = NULL; + + r_xprt->rx_ep = NULL; } /* Fixed-size circular FIFO queue. This implementation is wait-free and @@ -635,14 +647,14 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) * the ->send_request call to fail temporarily before too many * Sends are posted. */ - i = r_xprt->rx_ep.re_max_requests + RPCRDMA_MAX_BC_REQUESTS; + i = r_xprt->rx_ep->re_max_requests + RPCRDMA_MAX_BC_REQUESTS; buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL); if (!buf->rb_sc_ctxs) return -ENOMEM; buf->rb_sc_last = i - 1; for (i = 0; i <= buf->rb_sc_last; i++) { - sc = rpcrdma_sendctx_create(&r_xprt->rx_ep); + sc = rpcrdma_sendctx_create(r_xprt->rx_ep); if (!sc) return -ENOMEM; @@ -746,7 +758,7 @@ static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned int count; for (count = 0; count < ep->re_max_rdma_segs; count++) { @@ -793,7 +805,7 @@ rpcrdma_mr_refresh_worker(struct work_struct *work) void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; /* If there is no underlying connection, it's no use * to wake the refresh worker. @@ -862,7 +874,7 @@ int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Compute maximum header buffer size in bytes */ maxhdrsize = rpcrdma_fixed_maxsz + 3 + - r_xprt->rx_ep.re_max_rdma_segs * rpcrdma_readchunk_maxsz; + r_xprt->rx_ep->re_max_rdma_segs * rpcrdma_readchunk_maxsz; maxhdrsize *= sizeof(__be32); rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize), DMA_TO_DEVICE, GFP_KERNEL); @@ -940,7 +952,7 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, if (rep == NULL) goto out; - rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep.re_inline_recv, + rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv, DMA_FROM_DEVICE, GFP_KERNEL); if (!rep->rr_rdmabuf) goto out_free; @@ -1165,7 +1177,7 @@ void rpcrdma_mr_put(struct rpcrdma_mr *mr) if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ep.re_id->device, + ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } @@ -1283,7 +1295,7 @@ bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags) bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_regbuf *rb) { - struct ib_device *device = r_xprt->rx_ep.re_id->device; + struct ib_device *device = r_xprt->rx_ep->re_id->device; if (rb->rg_direction == DMA_NONE) return false; @@ -1296,7 +1308,7 @@ bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, } rb->rg_device = device; - rb->rg_iov.lkey = r_xprt->rx_ep.re_pd->local_dma_lkey; + rb->rg_iov.lkey = r_xprt->rx_ep->re_pd->local_dma_lkey; return true; } @@ -1332,7 +1344,7 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb) int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *send_wr = &req->rl_wr; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; int rc; if (!ep->re_send_count || kref_read(&req->rl_kref) > 1) { @@ -1359,7 +1371,7 @@ int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct ib_recv_wr *wr, *bad_wr; struct rpcrdma_rep *rep; int needed, count, rc; @@ -1396,7 +1408,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!wr) goto out; - rc = ib_post_recv(r_xprt->rx_ep.re_id->qp, wr, + rc = ib_post_recv(ep->re_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); out: trace_xprtrdma_post_recvs(r_xprt, count, rc); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index f3c0b826c9ed..0a16fdb09b2c 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -68,6 +68,7 @@ * RDMA Endpoint -- connection endpoint details */ struct rpcrdma_ep { + struct kref re_kref; struct rdma_cm_id *re_id; struct ib_pd *re_pd; unsigned int re_max_rdma_segs; @@ -75,7 +76,6 @@ struct rpcrdma_ep { bool re_implicit_roundup; enum ib_mr_type re_mrtype; struct completion re_done; - struct completion re_remove_done; unsigned int re_send_count; unsigned int re_send_batch; unsigned int re_max_inline_send; @@ -83,7 +83,8 @@ struct rpcrdma_ep { int re_async_rc; int re_connect_status; struct ib_qp_init_attr re_attr; - wait_queue_head_t re_connect_wait; + wait_queue_head_t re_connect_wait; + struct rpc_xprt *re_xprt; struct rpcrdma_connect_private re_cm_private; struct rdma_conn_param re_remote_cma; @@ -411,7 +412,7 @@ struct rpcrdma_stats { */ struct rpcrdma_xprt { struct rpc_xprt rx_xprt; - struct rpcrdma_ep rx_ep; + struct rpcrdma_ep *rx_ep; struct rpcrdma_buffer rx_buf; struct delayed_work rx_connect_worker; struct rpc_timeout rx_timeout; -- cgit v1.2.3 From 1fab7dc477241c12f977955aa6baea7938b6f08d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 4 Apr 2020 19:52:21 -0400 Subject: SUNRPC: Don't start a timer on an already queued rpc task Move the test for whether a task is already queued to prevent corruption of the timer list in __rpc_sleep_on_priority_timeout(). Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 6eff14119a88..7eba20a88438 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -204,10 +204,6 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task, unsigned char queue_priority) { - WARN_ON_ONCE(RPC_IS_QUEUED(task)); - if (RPC_IS_QUEUED(task)) - return; - INIT_LIST_HEAD(&task->u.tk_wait.timer_list); if (RPC_IS_PRIORITY(queue)) __rpc_add_wait_queue_priority(queue, task, queue_priority); @@ -382,7 +378,7 @@ static void rpc_make_runnable(struct workqueue_struct *wq, * NB: An RPC task will only receive interrupt-driven events as long * as it's on a wait queue. */ -static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, +static void __rpc_do_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, unsigned char queue_priority) { @@ -395,12 +391,23 @@ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, } +static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, + struct rpc_task *task, + unsigned char queue_priority) +{ + if (WARN_ON_ONCE(RPC_IS_QUEUED(task))) + return; + __rpc_do_sleep_on_priority(q, task, queue_priority); +} + static void __rpc_sleep_on_priority_timeout(struct rpc_wait_queue *q, struct rpc_task *task, unsigned long timeout, unsigned char queue_priority) { + if (WARN_ON_ONCE(RPC_IS_QUEUED(task))) + return; if (time_is_after_jiffies(timeout)) { - __rpc_sleep_on_priority(q, task, queue_priority); + __rpc_do_sleep_on_priority(q, task, queue_priority); __rpc_add_timer(q, task, timeout); } else task->tk_status = -ETIMEDOUT; -- cgit v1.2.3