From 52900d22288e7d45846037e1db277c665bbc40db Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 30 Nov 2018 15:32:40 -0500 Subject: udp: elide zerocopy operation in hot path With MSG_ZEROCOPY, each skb holds a reference to a struct ubuf_info. Release of its last reference triggers a completion notification. The TCP stack in tcp_sendmsg_locked holds an extra ref independent of the skbs, because it can build, send and free skbs within its loop, possibly reaching refcount zero and freeing the ubuf_info too soon. The UDP stack currently also takes this extra ref, but does not need it as all skbs are sent after return from __ip(6)_append_data. Avoid the extra refcount_inc and refcount_dec_and_test, and generally the sock_zerocopy_put in the common path, by passing the initial reference to the first skb. This approach is taken instead of initializing the refcount to 0, as that would generate error "refcount_t: increment on 0" on the next skb_zcopy_set. Changes v3 -> v4 - Move skb_zcopy_set below the only kfree_skb that might cause a premature uarg destroy before skb_zerocopy_put_abort - Move the entire skb_shinfo assignment block, to keep that cacheline access in one place Signed-off-by: Willem de Bruijn Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net/ipv6') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7df04d20a91f..ec8c235ea891 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1258,7 +1258,7 @@ static int __ip6_append_data(struct sock *sk, int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; unsigned int wmem_alloc_delta = 0; - bool paged; + bool paged, extra_uref; skb = skb_peek_tail(queue); if (!skb) { @@ -1327,12 +1327,13 @@ emsgsize: uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); if (!uarg) return -ENOBUFS; + extra_uref = true; if (rt->dst.dev->features & NETIF_F_SG && csummode == CHECKSUM_PARTIAL) { paged = true; } else { uarg->zerocopy = 0; - skb_zcopy_set(skb, uarg); + skb_zcopy_set(skb, uarg, &extra_uref); } } @@ -1454,13 +1455,6 @@ alloc_new_skb: skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + dst_exthdrlen); - /* Only the initial fragment is time stamped */ - skb_shinfo(skb)->tx_flags = cork->tx_flags; - cork->tx_flags = 0; - skb_shinfo(skb)->tskey = tskey; - tskey = 0; - skb_zcopy_set(skb, uarg); - /* * Find where to start putting bytes */ @@ -1492,6 +1486,13 @@ alloc_new_skb: exthdrlen = 0; dst_exthdrlen = 0; + /* Only the initial fragment is time stamped */ + skb_shinfo(skb)->tx_flags = cork->tx_flags; + cork->tx_flags = 0; + skb_shinfo(skb)->tskey = tskey; + tskey = 0; + skb_zcopy_set(skb, uarg, &extra_uref); + if ((flags & MSG_CONFIRM) && !skb_prev) skb_set_dst_pending_confirm(skb, 1); @@ -1562,13 +1563,12 @@ alloc_new_skb: if (wmem_alloc_delta) refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); - sock_zerocopy_put(uarg); return 0; error_efault: err = -EFAULT; error: - sock_zerocopy_put_abort(uarg); + sock_zerocopy_put_abort(uarg, extra_uref); cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); -- cgit v1.2.3