From ff76def3bd7e816fe0ca7f0840065c566a42ba2f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 29 Mar 2016 11:05:16 +0200 Subject: netfilter: arp_tables: register table in initns arptables is broken since we didn't register the table anymore -- even 'arptables -L' fails. Fixes: b9e69e127397187b ("netfilter: xtables: don't hook tables by default") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arptable_filter.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index dd8c80dc32a2..8f8713b4388f 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -81,6 +81,12 @@ static int __init arptable_filter_init(void) return ret; } + ret = arptable_filter_table_init(&init_net); + if (ret) { + unregister_pernet_subsys(&arptable_filter_net_ops); + kfree(arpfilter_ops); + } + return ret; } -- cgit v1.2.3 From 9ab179d83b4e31ea277a123492e419067c2f129a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 7 Apr 2016 11:10:06 -0700 Subject: net: vrf: Fix dst reference counting Vivek reported a kernel exception deleting a VRF with an active connection through it. The root cause is that the socket has a cached reference to a dst that is destroyed. Converting the dst_destroy to dst_release and letting proper reference counting kick in does not work as the dst has a reference to the device which needs to be released as well. I talked to Hannes about this at netdev and he pointed out the ipv4 and ipv6 dst handling has dst_ifdown for just this scenario. Rather than continuing with the reinvented dst wheel in VRF just remove it and leverage the ipv4 and ipv6 versions. Fixes: 193125dbd8eb2 ("net: Introduce VRF device driver") Fixes: 35402e3136634 ("net: Add IPv6 support to VRF device") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 177 +++++------------------------------------------- include/net/ip6_route.h | 3 + include/net/route.h | 3 + net/ipv4/route.c | 7 +- net/ipv6/route.c | 7 +- 5 files changed, 30 insertions(+), 167 deletions(-) (limited to 'net/ipv4') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 9a9fabb900c1..8a8f1e58b415 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -60,41 +60,6 @@ struct pcpu_dstats { struct u64_stats_sync syncp; }; -static struct dst_entry *vrf_ip_check(struct dst_entry *dst, u32 cookie) -{ - return dst; -} - -static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - return ip_local_out(net, sk, skb); -} - -static unsigned int vrf_v4_mtu(const struct dst_entry *dst) -{ - /* TO-DO: return max ethernet size? */ - return dst->dev->mtu; -} - -static void vrf_dst_destroy(struct dst_entry *dst) -{ - /* our dst lives forever - or until the device is closed */ -} - -static unsigned int vrf_default_advmss(const struct dst_entry *dst) -{ - return 65535 - 40; -} - -static struct dst_ops vrf_dst_ops = { - .family = AF_INET, - .local_out = vrf_ip_local_out, - .check = vrf_ip_check, - .mtu = vrf_v4_mtu, - .destroy = vrf_dst_destroy, - .default_advmss = vrf_default_advmss, -}; - /* neighbor handling is done with actual device; do not want * to flip skb->dev for those ndisc packets. This really fails * for multiple next protocols (e.g., NEXTHDR_HOP). But it is @@ -349,46 +314,6 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) } #if IS_ENABLED(CONFIG_IPV6) -static struct dst_entry *vrf_ip6_check(struct dst_entry *dst, u32 cookie) -{ - return dst; -} - -static struct dst_ops vrf_dst_ops6 = { - .family = AF_INET6, - .local_out = ip6_local_out, - .check = vrf_ip6_check, - .mtu = vrf_v4_mtu, - .destroy = vrf_dst_destroy, - .default_advmss = vrf_default_advmss, -}; - -static int init_dst_ops6_kmem_cachep(void) -{ - vrf_dst_ops6.kmem_cachep = kmem_cache_create("vrf_ip6_dst_cache", - sizeof(struct rt6_info), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - - if (!vrf_dst_ops6.kmem_cachep) - return -ENOMEM; - - return 0; -} - -static void free_dst_ops6_kmem_cachep(void) -{ - kmem_cache_destroy(vrf_dst_ops6.kmem_cachep); -} - -static int vrf_input6(struct sk_buff *skb) -{ - skb->dev->stats.rx_errors++; - kfree_skb(skb); - return 0; -} - /* modelled after ip6_finish_output2 */ static int vrf_finish_output6(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -429,67 +354,34 @@ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } -static void vrf_rt6_destroy(struct net_vrf *vrf) +static void vrf_rt6_release(struct net_vrf *vrf) { - dst_destroy(&vrf->rt6->dst); - free_percpu(vrf->rt6->rt6i_pcpu); + dst_release(&vrf->rt6->dst); vrf->rt6 = NULL; } static int vrf_rt6_create(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); - struct dst_entry *dst; + struct net *net = dev_net(dev); struct rt6_info *rt6; - int cpu; int rc = -ENOMEM; - rt6 = dst_alloc(&vrf_dst_ops6, dev, 0, - DST_OBSOLETE_NONE, - (DST_HOST | DST_NOPOLICY | DST_NOXFRM)); + rt6 = ip6_dst_alloc(net, dev, + DST_HOST | DST_NOPOLICY | DST_NOXFRM | DST_NOCACHE); if (!rt6) goto out; - dst = &rt6->dst; - - rt6->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_KERNEL); - if (!rt6->rt6i_pcpu) { - dst_destroy(dst); - goto out; - } - for_each_possible_cpu(cpu) { - struct rt6_info **p = per_cpu_ptr(rt6->rt6i_pcpu, cpu); - *p = NULL; - } - - memset(dst + 1, 0, sizeof(*rt6) - sizeof(*dst)); - - INIT_LIST_HEAD(&rt6->rt6i_siblings); - INIT_LIST_HEAD(&rt6->rt6i_uncached); - - rt6->dst.input = vrf_input6; rt6->dst.output = vrf_output6; - - rt6->rt6i_table = fib6_get_table(dev_net(dev), vrf->tb_id); - - atomic_set(&rt6->dst.__refcnt, 2); - + rt6->rt6i_table = fib6_get_table(net, vrf->tb_id); + dst_hold(&rt6->dst); vrf->rt6 = rt6; rc = 0; out: return rc; } #else -static int init_dst_ops6_kmem_cachep(void) -{ - return 0; -} - -static void free_dst_ops6_kmem_cachep(void) -{ -} - -static void vrf_rt6_destroy(struct net_vrf *vrf) +static void vrf_rt6_release(struct net_vrf *vrf) { } @@ -557,11 +449,11 @@ static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) !(IPCB(skb)->flags & IPSKB_REROUTED)); } -static void vrf_rtable_destroy(struct net_vrf *vrf) +static void vrf_rtable_release(struct net_vrf *vrf) { struct dst_entry *dst = (struct dst_entry *)vrf->rth; - dst_destroy(dst); + dst_release(dst); vrf->rth = NULL; } @@ -570,22 +462,10 @@ static struct rtable *vrf_rtable_create(struct net_device *dev) struct net_vrf *vrf = netdev_priv(dev); struct rtable *rth; - rth = dst_alloc(&vrf_dst_ops, dev, 2, - DST_OBSOLETE_NONE, - (DST_HOST | DST_NOPOLICY | DST_NOXFRM)); + rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1, 1, 0); if (rth) { rth->dst.output = vrf_output; - rth->rt_genid = rt_genid_ipv4(dev_net(dev)); - rth->rt_flags = 0; - rth->rt_type = RTN_UNICAST; - rth->rt_is_input = 0; - rth->rt_iif = 0; - rth->rt_pmtu = 0; - rth->rt_gateway = 0; - rth->rt_uses_gateway = 0; rth->rt_table_id = vrf->tb_id; - INIT_LIST_HEAD(&rth->rt_uncached); - rth->rt_uncached_list = NULL; } return rth; @@ -673,8 +553,8 @@ static void vrf_dev_uninit(struct net_device *dev) struct net_device *port_dev; struct list_head *iter; - vrf_rtable_destroy(vrf); - vrf_rt6_destroy(vrf); + vrf_rtable_release(vrf); + vrf_rt6_release(vrf); netdev_for_each_lower_dev(dev, port_dev, iter) vrf_del_slave(dev, port_dev); @@ -704,7 +584,7 @@ static int vrf_dev_init(struct net_device *dev) return 0; out_rth: - vrf_rtable_destroy(vrf); + vrf_rtable_release(vrf); out_stats: free_percpu(dev->dstats); dev->dstats = NULL; @@ -737,7 +617,7 @@ static struct rtable *vrf_get_rtable(const struct net_device *dev, struct net_vrf *vrf = netdev_priv(dev); rth = vrf->rth; - atomic_inc(&rth->dst.__refcnt); + dst_hold(&rth->dst); } return rth; @@ -788,7 +668,7 @@ static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, struct net_vrf *vrf = netdev_priv(dev); rt = vrf->rt6; - atomic_inc(&rt->dst.__refcnt); + dst_hold(&rt->dst); } return (struct dst_entry *)rt; @@ -946,19 +826,6 @@ static int __init vrf_init_module(void) { int rc; - vrf_dst_ops.kmem_cachep = - kmem_cache_create("vrf_ip_dst_cache", - sizeof(struct rtable), 0, - SLAB_HWCACHE_ALIGN, - NULL); - - if (!vrf_dst_ops.kmem_cachep) - return -ENOMEM; - - rc = init_dst_ops6_kmem_cachep(); - if (rc != 0) - goto error2; - register_netdevice_notifier(&vrf_notifier_block); rc = rtnl_link_register(&vrf_link_ops); @@ -969,22 +836,10 @@ static int __init vrf_init_module(void) error: unregister_netdevice_notifier(&vrf_notifier_block); - free_dst_ops6_kmem_cachep(); -error2: - kmem_cache_destroy(vrf_dst_ops.kmem_cachep); return rc; } -static void __exit vrf_cleanup_module(void) -{ - rtnl_link_unregister(&vrf_link_ops); - unregister_netdevice_notifier(&vrf_notifier_block); - kmem_cache_destroy(vrf_dst_ops.kmem_cachep); - free_dst_ops6_kmem_cachep(); -} - module_init(vrf_init_module); -module_exit(vrf_cleanup_module); MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern"); MODULE_DESCRIPTION("Device driver to instantiate VRF domains"); MODULE_LICENSE("GPL"); diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 295d291269e2..54c779416eec 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -101,6 +101,9 @@ void fib6_force_start_gc(struct net *net); struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, const struct in6_addr *addr, bool anycast); +struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, + int flags); + /* * support functions for ND * diff --git a/include/net/route.h b/include/net/route.h index 9b0a523bb428..6de665bf1750 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -209,6 +209,9 @@ unsigned int inet_addr_type_dev_table(struct net *net, void ip_rt_multicast_event(struct in_device *); int ip_rt_ioctl(struct net *, unsigned int cmd, void __user *arg); void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt); +struct rtable *rt_dst_alloc(struct net_device *dev, + unsigned int flags, u16 type, + bool nopolicy, bool noxfrm, bool will_cache); struct in_ifaddr; void fib_add_ifaddr(struct in_ifaddr *); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 02c62299d717..2852bdf73540 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1438,9 +1438,9 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #endif } -static struct rtable *rt_dst_alloc(struct net_device *dev, - unsigned int flags, u16 type, - bool nopolicy, bool noxfrm, bool will_cache) +struct rtable *rt_dst_alloc(struct net_device *dev, + unsigned int flags, u16 type, + bool nopolicy, bool noxfrm, bool will_cache) { struct rtable *rt; @@ -1468,6 +1468,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev, return rt; } +EXPORT_SYMBOL(rt_dst_alloc); /* called in rcu_read_lock() section */ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ed446639219c..1d8871a5ed20 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -338,9 +338,9 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, return rt; } -static struct rt6_info *ip6_dst_alloc(struct net *net, - struct net_device *dev, - int flags) +struct rt6_info *ip6_dst_alloc(struct net *net, + struct net_device *dev, + int flags) { struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags); @@ -364,6 +364,7 @@ static struct rt6_info *ip6_dst_alloc(struct net *net, return rt; } +EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { -- cgit v1.2.3 From d6d5e999e5df67f8ec20b6be45e2229455ee3699 Mon Sep 17 00:00:00 2001 From: Chris Friesen Date: Fri, 8 Apr 2016 15:21:30 -0600 Subject: route: do not cache fib route info on local routes with oif For local routes that require a particular output interface we do not want to cache the result. Caching the result causes incorrect behaviour when there are multiple source addresses on the interface. The end result being that if the intended recipient is waiting on that interface for the packet he won't receive it because it will be delivered on the loopback interface and the IP_PKTINFO ipi_ifindex will be set to the loopback interface as well. This can be tested by running a program such as "dhcp_release" which attempts to inject a packet on a particular interface so that it is received by another program on the same board. The receiving process should see an IP_PKTINFO ipi_ifndex value of the source interface (e.g., eth1) instead of the loopback interface (e.g., lo). The packet will still appear on the loopback interface in tcpdump but the important aspect is that the CMSG info is correct. Sample dhcp_release command line: dhcp_release eth1 192.168.204.222 02:11:33:22:44:66 Signed-off-by: Allain Legacy Signed off-by: Chris Friesen Reviewed-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/route.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2852bdf73540..60398a9370e7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2046,6 +2046,18 @@ static struct rtable *__mkroute_output(const struct fib_result *res, */ if (fi && res->prefixlen < 4) fi = NULL; + } else if ((type == RTN_LOCAL) && (orig_oif != 0) && + (orig_oif != dev_out->ifindex)) { + /* For local routes that require a particular output interface + * we do not want to cache the result. Caching the result + * causes incorrect behaviour when there are multiple source + * addresses on the interface, the end result being that if the + * intended recipient is waiting on that interface for the + * packet he won't receive it because it will be delivered on + * the loopback interface and the IP_PKTINFO ipi_ifindex will + * be set to the loopback interface as well. + */ + fi = NULL; } fnhe = NULL; -- cgit v1.2.3 From d894ba18d4e449b3a7f6eb491f16c9e02933736e Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Tue, 12 Apr 2016 13:11:25 -0400 Subject: soreuseport: fix ordering for mixed v4/v6 sockets With the SO_REUSEPORT socket option, it is possible to create sockets in the AF_INET and AF_INET6 domains which are bound to the same IPv4 address. This is only possible with SO_REUSEPORT and when not using IPV6_V6ONLY on the AF_INET6 sockets. Prior to the commits referenced below, an incoming IPv4 packet would always be routed to a socket of type AF_INET when this mixed-mode was used. After those changes, the same packet would be routed to the most recently bound socket (if this happened to be an AF_INET6 socket, it would have an IPv4 mapped IPv6 address). The change in behavior occurred because the recent SO_REUSEPORT optimizations short-circuit the socket scoring logic as soon as they find a match. They did not take into account the scoring logic that favors AF_INET sockets over AF_INET6 sockets in the event of a tie. To fix this problem, this patch changes the insertion order of AF_INET and AF_INET6 addresses in the TCP and UDP socket lists when the sockets have SO_REUSEPORT set. AF_INET sockets will be inserted at the head of the list and AF_INET6 sockets with SO_REUSEPORT set will always be inserted at the tail of the list. This will force AF_INET sockets to always be considered first. Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection") Fixes: 125e80b88687 ("soreuseport: fast reuseport TCP socket selection") Reported-by: Maciej Żenczykowski Signed-off-by: Craig Gallek Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/rculist_nulls.h | 39 +++++++++++++++++++++++++++++++++++++++ include/net/sock.h | 6 +++++- net/ipv4/udp.c | 9 +++++++-- 3 files changed, 51 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index 1c33dd7da4a7..4ae95f7e8597 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -98,6 +98,45 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, if (!is_a_nulls(first)) first->pprev = &n->next; } + +/** + * hlist_nulls_add_tail_rcu + * @n: the element to add to the hash list. + * @h: the list to add to. + * + * Description: + * Adds the specified element to the end of the specified hlist_nulls, + * while permitting racing traversals. NOTE: tail insertion requires + * list traversal. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() + * or hlist_nulls_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency + * problems on Alpha CPUs. Regardless of the type of CPU, the + * list-traversal primitive must be guarded by rcu_read_lock(). + */ +static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, + struct hlist_nulls_head *h) +{ + struct hlist_nulls_node *i, *last = NULL; + + for (i = hlist_nulls_first_rcu(h); !is_a_nulls(i); + i = hlist_nulls_next_rcu(i)) + last = i; + + if (last) { + n->next = last->next; + n->pprev = &last->next; + rcu_assign_pointer(hlist_nulls_next_rcu(last), n); + } else { + hlist_nulls_add_head_rcu(n, h); + } +} + /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. diff --git a/include/net/sock.h b/include/net/sock.h index 255d3e03727b..121ffc115c4f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -630,7 +630,11 @@ static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { - hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); + if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && + sk->sk_family == AF_INET6) + hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); + else + hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); } static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 08eed5e16df0..a2e7f55a1f61 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -339,8 +339,13 @@ found: hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock(&hslot2->lock); - hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, - &hslot2->head); + if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && + sk->sk_family == AF_INET6) + hlist_nulls_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); + else + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); hslot2->count++; spin_unlock(&hslot2->lock); } -- cgit v1.2.3 From 479f85c36688f5c855ad463b71902ef5992628b7 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 18 Apr 2016 15:39:53 -0700 Subject: tcp: Fix SOF_TIMESTAMPING_TX_ACK when handling dup acks Assuming SOF_TIMESTAMPING_TX_ACK is on. When dup acks are received, it could incorrectly think that a skb has already been acked and queue a SCM_TSTAMP_ACK cmsg to the sk->sk_error_queue. In tcp_ack_tstamp(), it checks 'between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1)'. If prior_snd_una == tcp_sk(sk)->snd_una like the following packetdrill script, between() returns true but the tskey is actually not acked. e.g. try between(3, 2, 1). The fix is to replace between() with one before() and one !before(). By doing this, the -1 offset on the tcp_sk(sk)->snd_una can also be removed. A packetdrill script is used to reproduce the dup ack scenario. Due to the lacking cmsg support in packetdrill (may be I cannot find it), a BPF prog is used to kprobe to sock_queue_err_skb() and print out the value of serr->ee.ee_data. Both the packetdrill and the bcc BPF script is attached at the end of this commit message. BPF Output Before Fix: ~~~~~~ <...>-2056 [001] d.s. 433.927987: : ee_data:1459 #incorrect packetdrill-2056 [001] d.s. 433.929563: : ee_data:1459 #incorrect packetdrill-2056 [001] d.s. 433.930765: : ee_data:1459 #incorrect packetdrill-2056 [001] d.s. 434.028177: : ee_data:1459 packetdrill-2056 [001] d.s. 434.029686: : ee_data:14599 BPF Output After Fix: ~~~~~~ <...>-2049 [000] d.s. 113.517039: : ee_data:1459 <...>-2049 [000] d.s. 113.517253: : ee_data:14599 BCC BPF Script: ~~~~~~ #!/usr/bin/env python from __future__ import print_function from bcc import BPF bpf_text = """ #include #include #include #include #ifdef memset #undef memset #endif int trace_err_skb(struct pt_regs *ctx) { struct sk_buff *skb = (struct sk_buff *)ctx->si; struct sock *sk = (struct sock *)ctx->di; struct sock_exterr_skb *serr; u32 ee_data = 0; if (!sk || !skb) return 0; serr = SKB_EXT_ERR(skb); bpf_probe_read(&ee_data, sizeof(ee_data), &serr->ee.ee_data); bpf_trace_printk("ee_data:%u\\n", ee_data); return 0; }; """ b = BPF(text=bpf_text) b.attach_kprobe(event="sock_queue_err_skb", fn_name="trace_err_skb") print("Attached to kprobe") b.trace_print() Packetdrill Script: ~~~~~~ +0 `sysctl -q -w net.ipv4.tcp_min_tso_segs=10` +0 `sysctl -q -w net.ipv4.tcp_no_metrics_save=1` +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 0.100 < S 0:0(0) win 32792 0.100 > S. 0:0(0) ack 1 0.200 < . 1:1(0) ack 1 win 257 0.200 accept(3, ..., ...) = 4 +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0 +0 setsockopt(4, SOL_SOCKET, 37, [2688], 4) = 0 0.200 write(4, ..., 1460) = 1460 0.200 write(4, ..., 13140) = 13140 0.200 > P. 1:1461(1460) ack 1 0.200 > . 1461:8761(7300) ack 1 0.200 > P. 8761:14601(5840) ack 1 0.300 < . 1:1(0) ack 1 win 257 0.300 < . 1:1(0) ack 1 win 257 0.300 < . 1:1(0) ack 1 win 257 0.300 > P. 1:1461(1460) ack 1 0.400 < . 1:1(0) ack 14601 win 257 0.400 close(4) = 0 0.400 > F. 14601:14601(0) ack 1 0.500 < F. 1:1(0) ack 14602 win 257 0.500 > . 14602:14602(0) ack 2 Signed-off-by: Martin KaFai Lau Cc: Eric Dumazet Cc: Neal Cardwell Cc: Soheil Hassas Yeganeh Cc: Willem de Bruijn Cc: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Tested-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e6e65f79ade8..0edb07185c25 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3098,7 +3098,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, shinfo = skb_shinfo(skb); if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) && - between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1)) + !before(shinfo->tskey, prior_snd_una) && + before(shinfo->tskey, tcp_sk(sk)->snd_una)) __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); } -- cgit v1.2.3 From 082ac2d51d9f19ec1c29bdaaaf7fb49889e4fade Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 19 Apr 2016 22:39:28 -0700 Subject: tcp: Merge tx_flags and tskey in tcp_collapse_retrans If two skbs are merged/collapsed during retransmission, the current logic does not merge the tx_flags and tskey. The end result is the SCM_TSTAMP_ACK timestamp could be missing for a packet. The patch: 1. Merge the tx_flags 2. Overwrite the prev_skb's tskey with the next_skb's tskey BPF Output Before: ~~~~~~ BPF Output After: ~~~~~~ packetdrill-2092 [001] d.s. 453.998486: : ee_data:1459 Packetdrill Script: ~~~~~~ +0 `sysctl -q -w net.ipv4.tcp_min_tso_segs=10` +0 `sysctl -q -w net.ipv4.tcp_no_metrics_save=1` +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 0.100 < S 0:0(0) win 32792 0.100 > S. 0:0(0) ack 1 0.200 < . 1:1(0) ack 1 win 257 0.200 accept(3, ..., ...) = 4 +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0 0.200 write(4, ..., 730) = 730 +0 setsockopt(4, SOL_SOCKET, 37, [2688], 4) = 0 0.200 write(4, ..., 730) = 730 +0 setsockopt(4, SOL_SOCKET, 37, [2176], 4) = 0 0.200 write(4, ..., 11680) = 11680 +0 setsockopt(4, SOL_SOCKET, 37, [2688], 4) = 0 0.200 > P. 1:731(730) ack 1 0.200 > P. 731:1461(730) ack 1 0.200 > . 1461:8761(7300) ack 1 0.200 > P. 8761:13141(4380) ack 1 0.300 < . 1:1(0) ack 1 win 257 0.300 < . 1:1(0) ack 1 win 257 0.300 < . 1:1(0) ack 1 win 257 0.300 > P. 1:1461(1460) ack 1 0.400 < . 1:1(0) ack 13141 win 257 0.400 close(4) = 0 0.400 > F. 13141:13141(0) ack 1 0.500 < F. 1:1(0) ack 13142 win 257 0.500 > . 13142:13142(0) ack 2 Signed-off-by: Martin KaFai Lau Cc: Eric Dumazet Cc: Neal Cardwell Cc: Soheil Hassas Yeganeh Cc: Willem de Bruijn Cc: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Tested-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7d2dc015cd19..5bc3c3062672 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2441,6 +2441,20 @@ u32 __tcp_select_window(struct sock *sk) return window; } +static void tcp_skb_collapse_tstamp(struct sk_buff *skb, + const struct sk_buff *next_skb) +{ + const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb); + u8 tsflags = next_shinfo->tx_flags & SKBTX_ANY_TSTAMP; + + if (unlikely(tsflags)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + + shinfo->tx_flags |= tsflags; + shinfo->tskey = next_shinfo->tskey; + } +} + /* Collapses two adjacent SKB's during retransmission. */ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { @@ -2484,6 +2498,8 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb)); + tcp_skb_collapse_tstamp(skb, next_skb); + sk_wmem_free_skb(sk, next_skb); } -- cgit v1.2.3 From cfea5a688eb37bcd1081255df9f9f777f4e61999 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 19 Apr 2016 22:39:29 -0700 Subject: tcp: Merge tx_flags and tskey in tcp_shifted_skb After receiving sacks, tcp_shifted_skb() will collapse skbs if possible. tx_flags and tskey also have to be merged. This patch reuses the tcp_skb_collapse_tstamp() to handle them. BPF Output Before: ~~~~~ BPF Output After: ~~~~~ <...>-2024 [007] d.s. 88.644374: : ee_data:14599 Packetdrill Script: ~~~~~ +0 `sysctl -q -w net.ipv4.tcp_min_tso_segs=10` +0 `sysctl -q -w net.ipv4.tcp_no_metrics_save=1` +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 0.100 < S 0:0(0) win 32792 0.100 > S. 0:0(0) ack 1 0.200 < . 1:1(0) ack 1 win 257 0.200 accept(3, ..., ...) = 4 +0 setsockopt(4, SOL_TCP, TCP_NODELAY, [1], 4) = 0 0.200 write(4, ..., 1460) = 1460 +0 setsockopt(4, SOL_SOCKET, 37, [2688], 4) = 0 0.200 write(4, ..., 13140) = 13140 0.200 > P. 1:1461(1460) ack 1 0.200 > . 1461:8761(7300) ack 1 0.200 > P. 8761:14601(5840) ack 1 0.300 < . 1:1(0) ack 1 win 257 0.300 > P. 1:1461(1460) ack 1 0.400 < . 1:1(0) ack 14601 win 257 0.400 close(4) = 0 0.400 > F. 14601:14601(0) ack 1 0.500 < F. 1:1(0) ack 14602 win 257 0.500 > . 14602:14602(0) ack 2 Signed-off-by: Martin KaFai Lau Cc: Eric Dumazet Cc: Neal Cardwell Cc: Soheil Hassas Yeganeh Cc: Willem de Bruijn Cc: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Tested-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp_input.c | 1 + net/ipv4/tcp_output.c | 4 ++-- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/include/net/tcp.h b/include/net/tcp.h index b91370f61be6..6db10228113f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -552,6 +552,8 @@ void tcp_send_ack(struct sock *sk); void tcp_send_delayed_ack(struct sock *sk); void tcp_send_loss_probe(struct sock *sk); bool tcp_schedule_loss_probe(struct sock *sk); +void tcp_skb_collapse_tstamp(struct sk_buff *skb, + const struct sk_buff *next_skb); /* tcp_input.c */ void tcp_resume_early_retransmit(struct sock *sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0edb07185c25..c124c3c12f7c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1309,6 +1309,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, if (skb == tcp_highest_sack(sk)) tcp_advance_highest_sack(sk, skb); + tcp_skb_collapse_tstamp(prev, skb); tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5bc3c3062672..441ae9da3a23 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2441,8 +2441,8 @@ u32 __tcp_select_window(struct sock *sk) return window; } -static void tcp_skb_collapse_tstamp(struct sk_buff *skb, - const struct sk_buff *next_skb) +void tcp_skb_collapse_tstamp(struct sk_buff *skb, + const struct sk_buff *next_skb) { const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb); u8 tsflags = next_shinfo->tx_flags & SKBTX_ANY_TSTAMP; -- cgit v1.2.3