From c8628155ece363487b57d33441ea0359018c0fa7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 18 Mar 2012 11:07:47 +0000 Subject: tcp: reduce out_of_order memory use With increasing receive window sizes, but speed of light not improved that much, out of order queue can contain a huge number of skbs, waiting to be moved to receive_queue when missing packets can fill the holes. Some devices happen to use fat skbs (truesize of 4096 + sizeof(struct sk_buff)) to store regular (MTU <= 1500) frames. This makes highly probable sk_rmem_alloc hits sk_rcvbuf limit, which can be 4Mbytes in many cases. When limit is hit, tcp stack calls tcp_collapse_ofo_queue(), a true latency killer and cpu cache blower. Doing the coalescing attempt each time we add a frame in ofo queue permits to keep memory use tight and in many cases avoid the tcp_collapse() thing later. Tested on various wireless setups (b43, ath9k, ...) known to use big skb truesize, this patch removed the "packets collapsed in receive queue due to low socket buffer" I had before. This also reduced average memory used by tcp sockets. With help from Neal Cardwell. Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Cc: H.K. Jerry Chu Cc: Tom Herbert Cc: Ilpo Järvinen Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_input.c') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fa7de12c4a52..e886e2f7fa8d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4484,7 +4484,24 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) end_seq = TCP_SKB_CB(skb)->end_seq; if (seq == TCP_SKB_CB(skb1)->end_seq) { - __skb_queue_after(&tp->out_of_order_queue, skb1, skb); + /* Packets in ofo can stay in queue a long time. + * Better try to coalesce them right now + * to avoid future tcp_collapse_ofo_queue(), + * probably the most expensive function in tcp stack. + */ + if (skb->len <= skb_tailroom(skb1) && !tcp_hdr(skb)->fin) { + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPRCVCOALESCE); + BUG_ON(skb_copy_bits(skb, 0, + skb_put(skb1, skb->len), + skb->len)); + TCP_SKB_CB(skb1)->end_seq = end_seq; + TCP_SKB_CB(skb1)->ack_seq = TCP_SKB_CB(skb)->ack_seq; + __kfree_skb(skb); + skb = NULL; + } else { + __skb_queue_after(&tp->out_of_order_queue, skb1, skb); + } if (!tp->rx_opt.num_sacks || tp->selective_acks[0].end_seq != seq) -- cgit v1.2.3