diff options
author | Linus Torvalds | 2023-06-26 10:48:57 -0700 |
---|---|---|
committer | Linus Torvalds | 2023-06-26 10:48:57 -0700 |
commit | f7976a6493b3f00c4d057a37d9e63c322154ef8c (patch) | |
tree | 0f295581c6993306c76a711807c82dda19dd2df4 /net | |
parent | c0a572d9d32fe1e95672f24e860776dba0750a38 (diff) | |
parent | 75bfb70457a4c4c9f0095e39885382fc5049c5ce (diff) |
Merge tag 'nfsd-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux
Pull nfsd updates from Chuck Lever:
- Clean-ups in the READ path in anticipation of MSG_SPLICE_PAGES
- Better NUMA awareness when allocating pages and other objects
- A number of minor clean-ups to XDR encoding
- Elimination of a race when accepting a TCP socket
- Numerous observability enhancements
* tag 'nfsd-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/cel/linux: (46 commits)
nfsd: remove redundant assignments to variable len
svcrdma: Fix stale comment
NFSD: Distinguish per-net namespace initialization
nfsd: move init of percpu reply_cache_stats counters back to nfsd_init_net
SUNRPC: Address RCU warning in net/sunrpc/svc.c
SUNRPC: Use sysfs_emit in place of strlcpy/sprintf
SUNRPC: Remove transport class dprintk call sites
SUNRPC: Fix comments for transport class registration
svcrdma: Remove an unused argument from __svc_rdma_put_rw_ctxt()
svcrdma: trace cc_release calls
svcrdma: Convert "might sleep" comment into a code annotation
NFSD: Add an nfsd4_encode_nfstime4() helper
SUNRPC: Move initialization of rq_stime
SUNRPC: Optimize page release in svc_rdma_sendto()
svcrdma: Prevent page release when nothing was received
svcrdma: Revert 2a1e4f21d841 ("svcrdma: Normalize Send page handling")
SUNRPC: Revert 579900670ac7 ("svcrdma: Remove unused sc_pages field")
SUNRPC: Revert cc93ce9529a6 ("svcrdma: Retain the page backing rq_res.head[0].iov_base")
NFSD: add encoding of op_recall flag for write delegation
NFSD: Add "official" reviewers for this subsystem
...
Diffstat (limited to 'net')
-rw-r--r-- | net/sunrpc/svc.c | 41 | ||||
-rw-r--r-- | net/sunrpc/svc_xprt.c | 26 | ||||
-rw-r--r-- | net/sunrpc/svcsock.c | 38 | ||||
-rw-r--r-- | net/sunrpc/xdr.c | 26 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 8 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 36 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_rw.c | 24 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_sendto.c | 62 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_transport.c | 18 |
9 files changed, 143 insertions, 136 deletions
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 79967b6925bd..e7c101290425 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -109,15 +109,15 @@ param_get_pool_mode(char *buf, const struct kernel_param *kp) switch (*ip) { case SVC_POOL_AUTO: - return strlcpy(buf, "auto\n", 20); + return sysfs_emit(buf, "auto\n"); case SVC_POOL_GLOBAL: - return strlcpy(buf, "global\n", 20); + return sysfs_emit(buf, "global\n"); case SVC_POOL_PERCPU: - return strlcpy(buf, "percpu\n", 20); + return sysfs_emit(buf, "percpu\n"); case SVC_POOL_PERNODE: - return strlcpy(buf, "pernode\n", 20); + return sysfs_emit(buf, "pernode\n"); default: - return sprintf(buf, "%d\n", *ip); + return sysfs_emit(buf, "%d\n", *ip); } } @@ -597,34 +597,25 @@ svc_destroy(struct kref *ref) } EXPORT_SYMBOL_GPL(svc_destroy); -/* - * Allocate an RPC server's buffer space. - * We allocate pages and place them in rq_pages. - */ -static int +static bool svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) { - unsigned int pages, arghi; + unsigned long pages, ret; /* bc_xprt uses fore channel allocated buffers */ if (svc_is_backchannel(rqstp)) - return 1; + return true; pages = size / PAGE_SIZE + 1; /* extra page as we hold both request and reply. * We assume one is at most one page */ - arghi = 0; WARN_ON_ONCE(pages > RPCSVC_MAXPAGES); if (pages > RPCSVC_MAXPAGES) pages = RPCSVC_MAXPAGES; - while (pages) { - struct page *p = alloc_pages_node(node, GFP_KERNEL, 0); - if (!p) - break; - rqstp->rq_pages[arghi++] = p; - pages--; - } - return pages == 0; + + ret = alloc_pages_bulk_array_node(GFP_KERNEL, node, pages, + rqstp->rq_pages); + return ret == pages; } /* @@ -1173,6 +1164,7 @@ static void __svc_unregister(struct net *net, const u32 program, const u32 versi */ static void svc_unregister(const struct svc_serv *serv, struct net *net) { + struct sighand_struct *sighand; struct svc_program *progp; unsigned long flags; unsigned int i; @@ -1189,9 +1181,12 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net) } } - spin_lock_irqsave(¤t->sighand->siglock, flags); + rcu_read_lock(); + sighand = rcu_dereference(current->sighand); + spin_lock_irqsave(&sighand->siglock, flags); recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); + spin_unlock_irqrestore(&sighand->siglock, flags); + rcu_read_unlock(); } /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 13a14897bc17..62c7919ea610 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -74,13 +74,18 @@ static LIST_HEAD(svc_xprt_class_list); * that no other thread will be using the transport or will * try to set XPT_DEAD. */ + +/** + * svc_reg_xprt_class - Register a server-side RPC transport class + * @xcl: New transport class to be registered + * + * Returns zero on success; otherwise a negative errno is returned. + */ int svc_reg_xprt_class(struct svc_xprt_class *xcl) { struct svc_xprt_class *cl; int res = -EEXIST; - dprintk("svc: Adding svc transport class '%s'\n", xcl->xcl_name); - INIT_LIST_HEAD(&xcl->xcl_list); spin_lock(&svc_xprt_class_lock); /* Make sure there isn't already a class with the same name */ @@ -96,9 +101,13 @@ out: } EXPORT_SYMBOL_GPL(svc_reg_xprt_class); +/** + * svc_unreg_xprt_class - Unregister a server-side RPC transport class + * @xcl: Transport class to be unregistered + * + */ void svc_unreg_xprt_class(struct svc_xprt_class *xcl) { - dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name); spin_lock(&svc_xprt_class_lock); list_del_init(&xcl->xcl_list); spin_unlock(&svc_xprt_class_lock); @@ -685,8 +694,9 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) } for (filled = 0; filled < pages; filled = ret) { - ret = alloc_pages_bulk_array(GFP_KERNEL, pages, - rqstp->rq_pages); + ret = alloc_pages_bulk_array_node(GFP_KERNEL, + rqstp->rq_pool->sp_id, + pages, rqstp->rq_pages); if (ret > filled) /* Made progress, don't sleep yet */ continue; @@ -843,15 +853,11 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) svc_xprt_received(xprt); } else if (svc_xprt_reserve_slot(rqstp, xprt)) { /* XPT_DATA|XPT_DEFERRED case: */ - dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", - rqstp, rqstp->rq_pool->sp_id, xprt, - kref_read(&xprt->xpt_ref)); rqstp->rq_deferred = svc_deferred_dequeue(xprt); if (rqstp->rq_deferred) len = svc_deferred_recv(rqstp); else len = xprt->xpt_ops->xpo_recvfrom(rqstp); - rqstp->rq_stime = ktime_get(); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); } else @@ -894,6 +900,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) err = -EAGAIN; if (len <= 0) goto out_release; + trace_svc_xdr_recvfrom(&rqstp->rq_arg); clear_bit(XPT_OLD, &xprt->xpt_flags); @@ -902,6 +909,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) if (serv->sv_stats) serv->sv_stats->netcnt++; + rqstp->rq_stime = ktime_get(); return len; out_release: rqstp->rq_res.len = 0; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index f77cebe2c071..5f519fc0541b 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -826,12 +826,6 @@ static void svc_tcp_listen_data_ready(struct sock *sk) trace_sk_data_ready(sk); - if (svsk) { - /* Refer to svc_setup_socket() for details. */ - rmb(); - svsk->sk_odata(sk); - } - /* * This callback may called twice when a new connection * is established as a child socket inherits everything @@ -840,13 +834,18 @@ static void svc_tcp_listen_data_ready(struct sock *sk) * when one of child sockets become ESTABLISHED. * 2) data_ready method of the child socket may be called * when it receives data before the socket is accepted. - * In case of 2, we should ignore it silently. + * In case of 2, we should ignore it silently and DO NOT + * dereference svsk. */ - if (sk->sk_state == TCP_LISTEN) { - if (svsk) { - set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); - } + if (sk->sk_state != TCP_LISTEN) + return; + + if (svsk) { + /* Refer to svc_setup_socket() for details. */ + rmb(); + svsk->sk_odata(sk); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); + svc_xprt_enqueue(&svsk->sk_xprt); } } @@ -887,13 +886,8 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); err = kernel_accept(sock, &newsock, O_NONBLOCK); if (err < 0) { - if (err == -ENOMEM) - printk(KERN_WARNING "%s: no more sockets!\n", - serv->sv_name); - else if (err != -EAGAIN) - net_warn_ratelimited("%s: accept failed (err %d)!\n", - serv->sv_name, -err); - trace_svcsock_accept_err(xprt, serv->sv_name, err); + if (err != -EAGAIN) + trace_svcsock_accept_err(xprt, serv->sv_name, err); return NULL; } if (IS_ERR(sock_alloc_file(newsock, O_NONBLOCK, NULL))) @@ -1464,7 +1458,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, svsk->sk_owspace = inet->sk_write_space; /* * This barrier is necessary in order to prevent race condition - * with svc_data_ready(), svc_listen_data_ready() and others + * with svc_data_ready(), svc_tcp_listen_data_ready(), and others * when calling callbacks above. */ wmb(); @@ -1476,7 +1470,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, else svc_tcp_init(svsk, serv); - trace_svcsock_new_socket(sock); + trace_svcsock_new(svsk, sock); return svsk; } @@ -1657,6 +1651,8 @@ static void svc_sock_free(struct svc_xprt *xprt) struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); struct socket *sock = svsk->sk_sock; + trace_svcsock_free(svsk, sock); + tls_handshake_cancel(sock->sk); if (sock->file) sockfd_put(sock); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 36835b2f5446..2a22e78af116 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1070,22 +1070,22 @@ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) } EXPORT_SYMBOL_GPL(xdr_reserve_space); - /** * xdr_reserve_space_vec - Reserves a large amount of buffer space for sending * @xdr: pointer to xdr_stream - * @vec: pointer to a kvec array * @nbytes: number of bytes to reserve * - * Reserves enough buffer space to encode 'nbytes' of data and stores the - * pointers in 'vec'. The size argument passed to xdr_reserve_space() is - * determined based on the number of bytes remaining in the current page to - * avoid invalidating iov_base pointers when xdr_commit_encode() is called. + * The size argument passed to xdr_reserve_space() is determined based + * on the number of bytes remaining in the current page to avoid + * invalidating iov_base pointers when xdr_commit_encode() is called. + * + * Return values: + * %0: success + * %-EMSGSIZE: not enough space is available in @xdr */ -int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbytes) +int xdr_reserve_space_vec(struct xdr_stream *xdr, size_t nbytes) { - int thislen; - int v = 0; + size_t thislen; __be32 *p; /* @@ -1097,21 +1097,19 @@ int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec, size_t nbyte xdr->end = xdr->p; } + /* XXX: Let's find a way to make this more efficient */ while (nbytes) { thislen = xdr->buf->page_len % PAGE_SIZE; thislen = min_t(size_t, nbytes, PAGE_SIZE - thislen); p = xdr_reserve_space(xdr, thislen); if (!p) - return -EIO; + return -EMSGSIZE; - vec[v].iov_base = p; - vec[v].iov_len = thislen; - v++; nbytes -= thislen; } - return v; + return 0; } EXPORT_SYMBOL_GPL(xdr_reserve_space_vec); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index aa2227a7e552..7420a2c990c7 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -93,13 +93,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, */ get_page(virt_to_page(rqst->rq_buffer)); sctxt->sc_send_wr.opcode = IB_WR_SEND; - ret = svc_rdma_send(rdma, sctxt); - if (ret < 0) - return ret; - - ret = wait_for_completion_killable(&sctxt->sc_done); - svc_rdma_send_ctxt_put(rdma, sctxt); - return ret; + return svc_rdma_send(rdma, sctxt); } /* Server-side transport endpoint wants a whole page for its send diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index a22fe7587fa6..85c8bcaebb80 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -125,14 +125,15 @@ static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma, static struct svc_rdma_recv_ctxt * svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) { + int node = ibdev_to_node(rdma->sc_cm_id->device); struct svc_rdma_recv_ctxt *ctxt; dma_addr_t addr; void *buffer; - ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); + ctxt = kmalloc_node(sizeof(*ctxt), GFP_KERNEL, node); if (!ctxt) goto fail0; - buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL); + buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node); if (!buffer) goto fail1; addr = ib_dma_map_single(rdma->sc_pd->device, buffer, @@ -155,7 +156,6 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->rc_recv_sge.length = rdma->sc_max_req_size; ctxt->rc_recv_sge.lkey = rdma->sc_pd->local_dma_lkey; ctxt->rc_recv_buf = buffer; - ctxt->rc_temp = false; return ctxt; fail2: @@ -232,10 +232,7 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, pcl_free(&ctxt->rc_write_pcl); pcl_free(&ctxt->rc_reply_pcl); - if (!ctxt->rc_temp) - llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts); - else - svc_rdma_recv_ctxt_destroy(rdma, ctxt); + llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts); } /** @@ -258,7 +255,7 @@ void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *vctxt) } static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma, - unsigned int wanted, bool temp) + unsigned int wanted) { const struct ib_recv_wr *bad_wr = NULL; struct svc_rdma_recv_ctxt *ctxt; @@ -275,7 +272,6 @@ static bool svc_rdma_refresh_recvs(struct svcxprt_rdma *rdma, break; trace_svcrdma_post_recv(ctxt); - ctxt->rc_temp = temp; ctxt->rc_recv_wr.next = recv_chain; recv_chain = &ctxt->rc_recv_wr; rdma->sc_pending_recvs++; @@ -309,7 +305,7 @@ err_free: */ bool svc_rdma_post_recvs(struct svcxprt_rdma *rdma) { - return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests, true); + return svc_rdma_refresh_recvs(rdma, rdma->sc_max_requests); } /** @@ -343,7 +339,7 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) * client reconnects. */ if (rdma->sc_pending_recvs < rdma->sc_max_requests) - if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch, false)) + if (!svc_rdma_refresh_recvs(rdma, rdma->sc_recv_batch)) goto dropped; /* All wc fields are now known to be valid */ @@ -775,9 +771,6 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt, * * The next ctxt is removed from the "receive" lists. * - * - If the ctxt completes a Read, then finish assembling the Call - * message and return the number of bytes in the message. - * * - If the ctxt completes a Receive, then construct the Call * message from the contents of the Receive buffer. * @@ -786,7 +779,8 @@ static bool svc_rdma_is_reverse_direction_reply(struct svc_xprt *xprt, * in the message. * * - If there are Read chunks in this message, post Read WRs to - * pull that payload and return 0. + * pull that payload. When the Read WRs complete, build the + * full message and return the number of bytes in it. */ int svc_rdma_recvfrom(struct svc_rqst *rqstp) { @@ -796,6 +790,12 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svc_rdma_recv_ctxt *ctxt; int ret; + /* Prevent svc_xprt_release() from releasing pages in rq_pages + * when returning 0 or an error. + */ + rqstp->rq_respages = rqstp->rq_pages; + rqstp->rq_next_page = rqstp->rq_respages; + rqstp->rq_xprt_ctxt = NULL; ctxt = NULL; @@ -819,12 +819,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) DMA_FROM_DEVICE); svc_rdma_build_arg_xdr(rqstp, ctxt); - /* Prevent svc_xprt_release from releasing pages in rq_pages - * if we return 0 or an error. - */ - rqstp->rq_respages = rqstp->rq_pages; - rqstp->rq_next_page = rqstp->rq_respages; - ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt); if (ret < 0) goto out_err; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 11cf7c646644..e460e25a1d6d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -62,8 +62,8 @@ svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) if (node) { ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); } else { - ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE), - GFP_KERNEL); + ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE), + GFP_KERNEL, ibdev_to_node(rdma->sc_cm_id->device)); if (!ctxt) goto out_noctx; @@ -84,8 +84,7 @@ out_noctx: return NULL; } -static void __svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, - struct svc_rdma_rw_ctxt *ctxt, +static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt, struct llist_head *list) { sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE); @@ -95,7 +94,7 @@ static void __svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, struct svc_rdma_rw_ctxt *ctxt) { - __svc_rdma_put_rw_ctxt(rdma, ctxt, &rdma->sc_rw_ctxts); + __svc_rdma_put_rw_ctxt(ctxt, &rdma->sc_rw_ctxts); } /** @@ -191,6 +190,8 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, struct svc_rdma_rw_ctxt *ctxt; LLIST_HEAD(free); + trace_svcrdma_cc_release(&cc->cc_cid, cc->cc_sqecount); + first = last = NULL; while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { list_del(&ctxt->rw_list); @@ -198,7 +199,7 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, ctxt->rw_sg_table.sgl, ctxt->rw_nents, dir); - __svc_rdma_put_rw_ctxt(rdma, ctxt, &free); + __svc_rdma_put_rw_ctxt(ctxt, &free); ctxt->rw_node.next = first; first = &ctxt->rw_node; @@ -234,7 +235,8 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, { struct svc_rdma_write_info *info; - info = kmalloc(sizeof(*info), GFP_KERNEL); + info = kmalloc_node(sizeof(*info), GFP_KERNEL, + ibdev_to_node(rdma->sc_cm_id->device)); if (!info) return info; @@ -304,7 +306,8 @@ svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma) { struct svc_rdma_read_info *info; - info = kmalloc(sizeof(*info), GFP_KERNEL); + info = kmalloc_node(sizeof(*info), GFP_KERNEL, + ibdev_to_node(rdma->sc_cm_id->device)); if (!info) return info; @@ -351,8 +354,7 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) return; } -/* This function sleeps when the transport's Send Queue is congested. - * +/* * Assumptions: * - If ib_post_send() succeeds, only one completion is expected, * even if one or more WRs are flushed. This is true when posting @@ -367,6 +369,8 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) struct ib_cqe *cqe; int ret; + might_sleep(); + if (cc->cc_sqecount > rdma->sc_sq_depth) return -EINVAL; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 22a871e6fe4d..c6644cca52c5 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -123,18 +123,17 @@ static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma, static struct svc_rdma_send_ctxt * svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) { + int node = ibdev_to_node(rdma->sc_cm_id->device); struct svc_rdma_send_ctxt *ctxt; dma_addr_t addr; void *buffer; - size_t size; int i; - size = sizeof(*ctxt); - size += rdma->sc_max_send_sges * sizeof(struct ib_sge); - ctxt = kmalloc(size, GFP_KERNEL); + ctxt = kmalloc_node(struct_size(ctxt, sc_sges, rdma->sc_max_send_sges), + GFP_KERNEL, node); if (!ctxt) goto fail0; - buffer = kmalloc(rdma->sc_max_req_size, GFP_KERNEL); + buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node); if (!buffer) goto fail1; addr = ib_dma_map_single(rdma->sc_pd->device, buffer, @@ -148,7 +147,6 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe; ctxt->sc_send_wr.sg_list = ctxt->sc_sges; ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; - init_completion(&ctxt->sc_done); ctxt->sc_cqe.done = svc_rdma_wc_send; ctxt->sc_xprt_buf = buffer; xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, @@ -214,6 +212,7 @@ out: ctxt->sc_send_wr.num_sge = 0; ctxt->sc_cur_sge_no = 0; + ctxt->sc_page_count = 0; return ctxt; out_empty: @@ -228,6 +227,8 @@ out_empty: * svc_rdma_send_ctxt_put - Return send_ctxt to free list * @rdma: controlling svcxprt_rdma * @ctxt: object to return to the free list + * + * Pages left in sc_pages are DMA unmapped and released. */ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) @@ -235,6 +236,9 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, struct ib_device *device = rdma->sc_cm_id->device; unsigned int i; + if (ctxt->sc_page_count) + release_pages(ctxt->sc_pages, ctxt->sc_page_count); + /* The first SGE contains the transport header, which * remains mapped until @ctxt is destroyed. */ @@ -281,12 +285,12 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); svc_rdma_wake_send_waiters(rdma, 1); - complete(&ctxt->sc_done); if (unlikely(wc->status != IB_WC_SUCCESS)) goto flushed; trace_svcrdma_wc_send(wc, &ctxt->sc_cid); + svc_rdma_send_ctxt_put(rdma, ctxt); return; flushed: @@ -294,6 +298,7 @@ flushed: trace_svcrdma_wc_send_err(wc, &ctxt->sc_cid); else trace_svcrdma_wc_send_flush(wc, &ctxt->sc_cid); + svc_rdma_send_ctxt_put(rdma, ctxt); svc_xprt_deferred_close(&rdma->sc_xprt); } @@ -310,7 +315,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) struct ib_send_wr *wr = &ctxt->sc_send_wr; int ret; - reinit_completion(&ctxt->sc_done); + might_sleep(); /* Sync the transport header buffer */ ib_dma_sync_single_for_device(rdma->sc_pd->device, @@ -799,6 +804,25 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, svc_rdma_xb_dma_map, &args); } +/* The svc_rqst and all resources it owns are released as soon as + * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt + * so they are released by the Send completion handler. + */ +static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, + struct svc_rdma_send_ctxt *ctxt) +{ + int i, pages = rqstp->rq_next_page - rqstp->rq_respages; + + ctxt->sc_page_count += pages; + for (i = 0; i < pages; i++) { + ctxt->sc_pages[i] = rqstp->rq_respages[i]; + rqstp->rq_respages[i] = NULL; + } + + /* Prevent svc_xprt_release from releasing pages in rq_pages */ + rqstp->rq_next_page = rqstp->rq_respages; +} + /* Prepare the portion of the RPC Reply that will be transmitted * via RDMA Send. The RPC-over-RDMA transport header is prepared * in sc_sges[0], and the RPC xdr_buf is prepared in following sges. @@ -828,6 +852,8 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, if (ret < 0) return ret; + svc_rdma_save_io_pages(rqstp, sctxt); + if (rctxt->rc_inv_rkey) { sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV; sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey; @@ -835,13 +861,7 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, sctxt->sc_send_wr.opcode = IB_WR_SEND; } - ret = svc_rdma_send(rdma, sctxt); - if (ret < 0) - return ret; - - ret = wait_for_completion_killable(&sctxt->sc_done); - svc_rdma_send_ctxt_put(rdma, sctxt); - return ret; + return svc_rdma_send(rdma, sctxt); } /** @@ -907,8 +927,7 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; if (svc_rdma_send(rdma, sctxt)) goto put_ctxt; - - wait_for_completion_killable(&sctxt->sc_done); + return; put_ctxt: svc_rdma_send_ctxt_put(rdma, sctxt); @@ -976,17 +995,16 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp); if (ret < 0) goto put_ctxt; - - /* Prevent svc_xprt_release() from releasing the page backing - * rq_res.head[0].iov_base. It's no longer being accessed by - * the I/O device. */ - rqstp->rq_respages++; return 0; reply_chunk: if (ret != -E2BIG && ret != -EINVAL) goto put_ctxt; + /* Send completion releases payload pages that were part + * of previously posted RDMA Writes. + */ + svc_rdma_save_io_pages(rqstp, sctxt); svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret); return 0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index ca04f7a6a085..2abd895046ee 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -64,7 +64,7 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, - struct net *net); + struct net *net, int node); static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, struct net *net, struct sockaddr *sa, int salen, @@ -123,14 +123,14 @@ static void qp_event_handler(struct ib_event *event, void *context) } static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, - struct net *net) + struct net *net, int node) { - struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL); + struct svcxprt_rdma *cma_xprt; - if (!cma_xprt) { - dprintk("svcrdma: failed to create new transport\n"); + cma_xprt = kzalloc_node(sizeof(*cma_xprt), GFP_KERNEL, node); + if (!cma_xprt) return NULL; - } + svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); INIT_LIST_HEAD(&cma_xprt->sc_accept_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); @@ -193,9 +193,9 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, struct svcxprt_rdma *newxprt; struct sockaddr *sa; - /* Create a new transport */ newxprt = svc_rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, - listen_xprt->sc_xprt.xpt_net); + listen_xprt->sc_xprt.xpt_net, + ibdev_to_node(new_cma_id->device)); if (!newxprt) return; newxprt->sc_cm_id = new_cma_id; @@ -304,7 +304,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, if (sa->sa_family != AF_INET && sa->sa_family != AF_INET6) return ERR_PTR(-EAFNOSUPPORT); - cma_xprt = svc_rdma_create_xprt(serv, net); + cma_xprt = svc_rdma_create_xprt(serv, net, NUMA_NO_NODE); if (!cma_xprt) return ERR_PTR(-ENOMEM); set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); |