From 9cb429d692b341e972b12e6cd097364050ebbb26 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Jul 2012 01:19:31 +0000 Subject: tcp: early_demux fixes 1) Remove a non needed pskb_may_pull() in tcp_v4_early_demux() and fix a potential bug if skb->head was reallocated (iph & th pointers were not reloaded) TCP stack will pull/check headers anyway. 2) must reload iph in ip_rcv_finish() after early_demux() call since skb->head might have changed. 3) skb->dev->ifindex can be now replaced by skb->skb_iif Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 5 ++++- net/ipv4/tcp_ipv4.c | 9 ++------- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 4ebc6feee250..93134b0eab0c 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -326,8 +326,11 @@ static int ip_rcv_finish(struct sk_buff *skb) rcu_read_lock(); ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot && ipprot->early_demux) + if (ipprot && ipprot->early_demux) { ipprot->early_demux(skb); + /* must reload iph, skb->head might have changed */ + iph = ip_hdr(skb); + } rcu_read_unlock(); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3e30548ac32a..b6b07c93924c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1686,7 +1686,6 @@ void tcp_v4_early_demux(struct sk_buff *skb) struct net *net = dev_net(skb->dev); const struct iphdr *iph; const struct tcphdr *th; - struct net_device *dev; struct sock *sk; if (skb->pkt_type != PACKET_HOST) @@ -1701,14 +1700,10 @@ void tcp_v4_early_demux(struct sk_buff *skb) if (th->doff < sizeof(struct tcphdr) / 4) return; - if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4)) - return; - - dev = skb->dev; sk = __inet_lookup_established(net, &tcp_hashinfo, iph->saddr, th->source, iph->daddr, ntohs(th->dest), - dev->ifindex); + skb->skb_iif); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; @@ -1718,7 +1713,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) if (dst) dst = dst_check(dst, 0); if (dst && - icsk->rx_dst_ifindex == dev->ifindex) + icsk->rx_dst_ifindex == skb->skb_iif) skb_dst_set_noref(skb, dst); } } -- cgit v1.2.3 From 8b72ff6484fe303e01498b58621810a114f3cf09 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 24 Jul 2012 08:16:25 +0000 Subject: wanmain: comparing array with NULL gcc really should warn about these ! Signed-off-by: Alan Cox Signed-off-by: David S. Miller --- net/wanrouter/wanmain.c | 51 ++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c index 788a12c1eb5d..2ab785064b7e 100644 --- a/net/wanrouter/wanmain.c +++ b/net/wanrouter/wanmain.c @@ -602,36 +602,31 @@ static int wanrouter_device_new_if(struct wan_device *wandev, * successfully, add it to the interface list. */ - if (dev->name == NULL) { - err = -EINVAL; - } else { +#ifdef WANDEBUG + printk(KERN_INFO "%s: registering interface %s...\n", + wanrouter_modname, dev->name); +#endif - #ifdef WANDEBUG - printk(KERN_INFO "%s: registering interface %s...\n", - wanrouter_modname, dev->name); - #endif - - err = register_netdev(dev); - if (!err) { - struct net_device *slave = NULL; - unsigned long smp_flags=0; - - lock_adapter_irq(&wandev->lock, &smp_flags); - - if (wandev->dev == NULL) { - wandev->dev = dev; - } else { - for (slave=wandev->dev; - DEV_TO_SLAVE(slave); - slave = DEV_TO_SLAVE(slave)) - DEV_TO_SLAVE(slave) = dev; - } - ++wandev->ndev; - - unlock_adapter_irq(&wandev->lock, &smp_flags); - err = 0; /* done !!! */ - goto out; + err = register_netdev(dev); + if (!err) { + struct net_device *slave = NULL; + unsigned long smp_flags=0; + + lock_adapter_irq(&wandev->lock, &smp_flags); + + if (wandev->dev == NULL) { + wandev->dev = dev; + } else { + for (slave=wandev->dev; + DEV_TO_SLAVE(slave); + slave = DEV_TO_SLAVE(slave)) + DEV_TO_SLAVE(slave) = dev; } + ++wandev->ndev; + + unlock_adapter_irq(&wandev->lock, &smp_flags); + err = 0; /* done !!! */ + goto out; } if (wandev->del_if) wandev->del_if(wandev, dev); -- cgit v1.2.3 From 4331debc51ee1ce319f4a389484e0e8e05de2aca Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 25 Jul 2012 05:11:23 +0000 Subject: ipv4: rt_cache_valid must check expired routes commit d2d68ba9fe8 (ipv4: Cache input routes in fib_info nexthops.) introduced rt_cache_valid() helper. It unfortunately doesn't check if route is expired before caching it. I noticed sk_setup_caps() was constantly called on a tcp workload. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6bcb8fc71cbc..3f7bb7185c50 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -444,7 +444,7 @@ static inline int ip_rt_proc_init(void) } #endif /* CONFIG_PROC_FS */ -static inline int rt_is_expired(struct rtable *rth) +static inline bool rt_is_expired(const struct rtable *rth) { return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); } @@ -1222,9 +1222,11 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) } } -static bool rt_cache_valid(struct rtable *rt) +static bool rt_cache_valid(const struct rtable *rt) { - return (rt && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK); + return rt && + rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && + !rt_is_expired(rt); } static void rt_set_nexthop(struct rtable *rt, __be32 daddr, -- cgit v1.2.3 From c6cffba4ffa26a8ffacd0bb9f3144e34f20da7de Mon Sep 17 00:00:00 2001 From: David S. Miller Date: Thu, 26 Jul 2012 11:14:38 +0000 Subject: ipv4: Fix input route performance regression. With the routing cache removal we lost the "noref" code paths on input, and this can kill some routing workloads. Reinstate the noref path when we hit a cached route in the FIB nexthops. With help from Eric Dumazet. Reported-by: Alexander Duyck Signed-off-by: David S. Miller Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/route.h | 19 +++++++++++++++++-- net/ipv4/arp.c | 2 +- net/ipv4/fib_semantics.c | 4 ++-- net/ipv4/ip_fragment.c | 4 ++-- net/ipv4/ip_input.c | 4 ++-- net/ipv4/route.c | 48 ++++++++++++++++++++++-------------------------- net/ipv4/xfrm4_input.c | 4 ++-- 7 files changed, 48 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/include/net/route.h b/include/net/route.h index c29ef2733f2d..8c52bc6f1c90 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -157,8 +158,22 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4 return ip_route_output_key(net, fl4); } -extern int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, - u8 tos, struct net_device *devin); +extern int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src, + u8 tos, struct net_device *devin); + +static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src, + u8 tos, struct net_device *devin) +{ + int err; + + rcu_read_lock(); + err = ip_route_input_noref(skb, dst, src, tos, devin); + if (!err) + skb_dst_force(skb); + rcu_read_unlock(); + + return err; +} extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u32 mark, u8 protocol, int flow_flags); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index a0124eb7dbea..77e87aff419a 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -827,7 +827,7 @@ static int arp_process(struct sk_buff *skb) } if (arp->ar_op == htons(ARPOP_REQUEST) && - ip_route_input(skb, tip, sip, 0, dev) == 0) { + ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { rt = skb_rtable(skb); addr_type = rt->rt_type; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e55171f184f9..da0cc2e6b250 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -172,9 +172,9 @@ static void free_fib_info_rcu(struct rcu_head *head) if (nexthop_nh->nh_exceptions) free_nh_exceptions(nexthop_nh); if (nexthop_nh->nh_rth_output) - dst_release(&nexthop_nh->nh_rth_output->dst); + dst_free(&nexthop_nh->nh_rth_output->dst); if (nexthop_nh->nh_rth_input) - dst_release(&nexthop_nh->nh_rth_input->dst); + dst_free(&nexthop_nh->nh_rth_input->dst); } endfor_nexthops(fi); release_net(fi->fib_net); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 7ad88e5e7110..8d07c973409c 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -258,8 +258,8 @@ static void ip_expire(unsigned long arg) /* skb dst is stale, drop it, and perform route lookup again */ skb_dst_drop(head); iph = ip_hdr(head); - err = ip_route_input(head, iph->daddr, iph->saddr, - iph->tos, head->dev); + err = ip_route_input_noref(head, iph->daddr, iph->saddr, + iph->tos, head->dev); if (err) goto out_rcu_unlock; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 93134b0eab0c..bda8cac2ae91 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -339,8 +339,8 @@ static int ip_rcv_finish(struct sk_buff *skb) * how the packet travels inside Linux networking. */ if (!skb_dst(skb)) { - int err = ip_route_input(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev); + int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev); if (unlikely(err)) { if (err == -EXDEV) NET_INC_STATS_BH(dev_net(skb->dev), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3f7bb7185c50..fc1a81ca79a7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1199,10 +1199,9 @@ restart: fnhe->fnhe_stamp = jiffies; } -static inline void rt_release_rcu(struct rcu_head *head) +static inline void rt_free(struct rtable *rt) { - struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - dst_release(dst); + call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free); } static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) @@ -1216,9 +1215,15 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) prev = cmpxchg(p, orig, rt); if (prev == orig) { - dst_clone(&rt->dst); if (orig) - call_rcu_bh(&orig->dst.rcu_head, rt_release_rcu); + rt_free(orig); + } else { + /* Routes we intend to cache in the FIB nexthop have + * the DST_NOCACHE bit clear. However, if we are + * unsuccessful at storing this route into the cache + * we really need to set it. + */ + rt->dst.flags |= DST_NOCACHE; } } @@ -1245,7 +1250,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif - if (!(rt->dst.flags & DST_HOST)) + if (!(rt->dst.flags & DST_NOCACHE)) rt_cache_route(nh, rt); } @@ -1261,7 +1266,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev, bool nopolicy, bool noxfrm, bool will_cache) { return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, - (will_cache ? 0 : DST_HOST) | DST_NOCACHE | + (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? DST_NOXFRM : 0)); } @@ -1366,8 +1371,7 @@ static void ip_handle_martian_source(struct net_device *dev, static int __mkroute_input(struct sk_buff *skb, const struct fib_result *res, struct in_device *in_dev, - __be32 daddr, __be32 saddr, u32 tos, - struct rtable **result) + __be32 daddr, __be32 saddr, u32 tos) { struct rtable *rth; int err; @@ -1418,7 +1422,7 @@ static int __mkroute_input(struct sk_buff *skb, if (!itag) { rth = FIB_RES_NH(*res).nh_rth_input; if (rt_cache_valid(rth)) { - dst_hold(&rth->dst); + skb_dst_set_noref(skb, &rth->dst); goto out; } do_cache = true; @@ -1445,8 +1449,8 @@ static int __mkroute_input(struct sk_buff *skb, rth->dst.output = ip_output; rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag); + skb_dst_set(skb, &rth->dst); out: - *result = rth; err = 0; cleanup: return err; @@ -1458,21 +1462,13 @@ static int ip_mkroute_input(struct sk_buff *skb, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos) { - struct rtable *rth = NULL; - int err; - #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && res->fi->fib_nhs > 1) fib_select_multipath(res); #endif /* create a routing cache entry */ - err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); - if (err) - return err; - - skb_dst_set(skb, &rth->dst); - return 0; + return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); } /* @@ -1588,8 +1584,9 @@ local_input: if (!itag) { rth = FIB_RES_NH(res).nh_rth_input; if (rt_cache_valid(rth)) { - dst_hold(&rth->dst); - goto set_and_out; + skb_dst_set_noref(skb, &rth->dst); + err = 0; + goto out; } do_cache = true; } @@ -1620,7 +1617,6 @@ local_input: } if (do_cache) rt_cache_route(&FIB_RES_NH(res), rth); -set_and_out: skb_dst_set(skb, &rth->dst); err = 0; goto out; @@ -1658,8 +1654,8 @@ martian_source_keep_err: goto out; } -int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev) +int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, + u8 tos, struct net_device *dev) { int res; @@ -1702,7 +1698,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, rcu_read_unlock(); return res; } -EXPORT_SYMBOL(ip_route_input); +EXPORT_SYMBOL(ip_route_input_noref); /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 58d23a572509..06814b6216dc 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -27,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb) if (skb_dst(skb) == NULL) { const struct iphdr *iph = ip_hdr(skb); - if (ip_route_input(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev)) + if (ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev)) goto drop; } return dst_input(skb); -- cgit v1.2.3 From c7109986db3c945f50ceed884a30e0fd8af3b89b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Jul 2012 12:18:11 +0000 Subject: ipv6: Early TCP socket demux This is the IPv6 missing bits for infrastructure added in commit 41063e9dd1195 (ipv4: Early TCP socket demux.) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet6_hashtables.h | 13 +++++++------ include/net/protocol.h | 2 ++ net/ipv4/ip_input.c | 1 + net/ipv6/ip6_input.c | 13 +++++++++++-- net/ipv6/tcp_ipv6.c | 38 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 59 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 00cbb4384c79..9e34c877a770 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -96,14 +96,15 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, const __be16 sport, const __be16 dport) { - struct sock *sk; + struct sock *sk = skb_steal_sock(skb); - if (unlikely(sk = skb_steal_sock(skb))) + if (sk) return sk; - else return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, - &ipv6_hdr(skb)->saddr, sport, - &ipv6_hdr(skb)->daddr, ntohs(dport), - inet6_iif(skb)); + + return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, + &ipv6_hdr(skb)->saddr, sport, + &ipv6_hdr(skb)->daddr, ntohs(dport), + inet6_iif(skb)); } extern struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, diff --git a/include/net/protocol.h b/include/net/protocol.h index 057f2d315567..929528c73fe8 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -52,6 +52,8 @@ struct net_protocol { #if IS_ENABLED(CONFIG_IPV6) struct inet6_protocol { + void (*early_demux)(struct sk_buff *skb); + int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index bda8cac2ae91..981ff1eef28c 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -314,6 +314,7 @@ drop: } int sysctl_ip_early_demux __read_mostly = 1; +EXPORT_SYMBOL(sysctl_ip_early_demux); static int ip_rcv_finish(struct sk_buff *skb) { diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 5ab923e51af3..47975e363fcd 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -47,9 +47,18 @@ -inline int ip6_rcv_finish( struct sk_buff *skb) +int ip6_rcv_finish(struct sk_buff *skb) { - if (skb_dst(skb) == NULL) + if (sysctl_ip_early_demux && !skb_dst(skb)) { + const struct inet6_protocol *ipprot; + + rcu_read_lock(); + ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); + if (ipprot && ipprot->early_demux) + ipprot->early_demux(skb); + rcu_read_unlock(); + } + if (!skb_dst(skb)) ip6_route_input(skb); return dst_input(skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f49476e2d884..221224e72507 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1674,6 +1674,43 @@ do_time_wait: goto discard_it; } +static void tcp_v6_early_demux(struct sk_buff *skb) +{ + const struct ipv6hdr *hdr; + const struct tcphdr *th; + struct sock *sk; + + if (skb->pkt_type != PACKET_HOST) + return; + + if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) + return; + + hdr = ipv6_hdr(skb); + th = tcp_hdr(skb); + + if (th->doff < sizeof(struct tcphdr) / 4) + return; + + sk = __inet6_lookup_established(dev_net(skb->dev), &tcp_hashinfo, + &hdr->saddr, th->source, + &hdr->daddr, ntohs(th->dest), + inet6_iif(skb)); + if (sk) { + skb->sk = sk; + skb->destructor = sock_edemux; + if (sk->sk_state != TCP_TIME_WAIT) { + struct dst_entry *dst = sk->sk_rx_dst; + struct inet_sock *icsk = inet_sk(sk); + if (dst) + dst = dst_check(dst, 0); + if (dst && + icsk->rx_dst_ifindex == inet6_iif(skb)) + skb_dst_set_noref(skb, dst); + } + } +} + static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), .twsk_unique = tcp_twsk_unique, @@ -1984,6 +2021,7 @@ struct proto tcpv6_prot = { }; static const struct inet6_protocol tcpv6_protocol = { + .early_demux = tcp_v6_early_demux, .handler = tcp_v6_rcv, .err_handler = tcp_v6_err, .gso_send_check = tcp_v6_gso_send_check, -- cgit v1.2.3