From d079535d5e1bf5e2e7c856bae2483414ea21e137 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 23 Mar 2015 16:31:09 -0700 Subject: net: use for_each_netdev_safe() in rtnl_group_changelink() In case we move the whole dev group to another netns, we should call for_each_netdev_safe(), otherwise we get a soft lockup: NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [ip:798] irq event stamp: 255424 hardirqs last enabled at (255423): [] restore_args+0x0/0x30 hardirqs last disabled at (255424): [] apic_timer_interrupt+0x6a/0x80 softirqs last enabled at (255422): [] __do_softirq+0x2c1/0x3a9 softirqs last disabled at (255417): [] irq_exit+0x41/0x95 CPU: 0 PID: 798 Comm: ip Not tainted 4.0.0-rc4+ #881 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff8800d1b88000 ti: ffff880119530000 task.ti: ffff880119530000 RIP: 0010:[] [] debug_lockdep_rcu_enabled+0x28/0x30 RSP: 0018:ffff880119533778 EFLAGS: 00000246 RAX: ffff8800d1b88000 RBX: 0000000000000002 RCX: 0000000000000038 RDX: 0000000000000000 RSI: ffff8800d1b888c8 RDI: ffff8800d1b888c8 RBP: ffff880119533778 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 000000000000b5c2 R12: 0000000000000246 R13: ffff880119533708 R14: 00000000001d5a40 R15: ffff88011a7d5a40 FS: 00007fc01315f740(0000) GS:ffff88011a600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f367a120988 CR3: 000000011849c000 CR4: 00000000000007f0 Stack: ffff880119533798 ffffffff811ac868 ffffffff811ac831 ffffffff811ac828 ffff8801195337c8 ffffffff811ac8c9 ffff8801195339b0 ffff8801197633e0 0000000000000000 ffff8801195339b0 ffff8801195337d8 ffffffff811ad2d7 Call Trace: [] rcu_read_lock+0x37/0x6e [] ? rcu_read_unlock+0x5f/0x5f [] ? rcu_read_unlock+0x56/0x5f [] __fget+0x2a/0x7a [] fget+0x13/0x15 [] proc_ns_fget+0xe/0x38 [] get_net_ns_by_fd+0x11/0x59 [] rtnl_link_get_net+0x33/0x3e [] do_setlink+0x73/0x87b [] ? trace_hardirqs_off+0xd/0xf [] ? retint_restore_args+0xe/0xe [] rtnl_newlink+0x40c/0x699 [] ? rtnl_newlink+0xeb/0x699 [] ? _raw_spin_unlock+0x28/0x33 [] ? security_capable+0x18/0x1a [] ? ns_capable+0x4d/0x65 [] rtnetlink_rcv_msg+0x181/0x194 [] ? rtnl_lock+0x17/0x19 [] ? rtnl_lock+0x17/0x19 [] ? __rtnl_unlock+0x17/0x17 [] netlink_rcv_skb+0x4d/0x93 [] rtnetlink_rcv+0x26/0x2d [] netlink_unicast+0xcb/0x150 [] netlink_sendmsg+0x501/0x523 [] ? might_fault+0x59/0xa9 [] ? copy_from_user+0x2a/0x2c [] sock_sendmsg+0x34/0x3c [] ___sys_sendmsg+0x1b8/0x255 [] ? handle_pte_fault+0xbd5/0xd4a [] ? native_sched_clock+0x35/0x37 [] ? sched_clock_local+0x12/0x72 [] ? sched_clock_cpu+0x9e/0xb7 [] ? rcu_read_lock_held+0x3b/0x3d [] ? __fcheck_files+0x4c/0x58 [] ? __fget_light+0x2d/0x52 [] __sys_sendmsg+0x42/0x60 [] SyS_sendmsg+0x12/0x1c [] system_call_fastpath+0x12/0x17 Fixes: e7ed828f10bd8 ("netlink: support setting devgroup parameters") Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ee0608bb3bc0..7ebed55b5f7d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1932,10 +1932,10 @@ static int rtnl_group_changelink(const struct sk_buff *skb, struct ifinfomsg *ifm, struct nlattr **tb) { - struct net_device *dev; + struct net_device *dev, *aux; int err; - for_each_netdev(net, dev) { + for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { err = do_setlink(skb, dev, ifm, tb, NULL, 0); if (err < 0) -- cgit v1.2.3 From 4217291e592da0e4258b652e82e5428639d29acc Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 26 Mar 2015 17:56:38 +0100 Subject: netns: don't clear nsid too early on removal With the current code, ids are removed too early. Suppose you have an ipip interface that stands in the netns foo and its link part in the netns bar (so the netns bar has an nsid into the netns foo). Now, you remove the netns bar: - the bar nsid into the netns foo is removed - the netns exit method of ipip is called, thus our ipip iface is removed: => a netlink message is sent in the netns foo to advertise this deletion => this netlink message requests an nsid for bar, thus a new nsid is allocated for bar and never removed. We must remove nsids when we are sure that nobody will refer to netns currently cleaned. Fixes: 0c7aecd4bde4 ("netns: add rtnl cmd to add and get peer netns ids") Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/net_namespace.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index cb5290b8c428..5221f975a4cc 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -349,7 +349,7 @@ static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ static void cleanup_net(struct work_struct *work) { const struct pernet_operations *ops; - struct net *net, *tmp; + struct net *net, *tmp, *peer; struct list_head net_kill_list; LIST_HEAD(net_exit_list); @@ -365,14 +365,6 @@ static void cleanup_net(struct work_struct *work) list_for_each_entry(net, &net_kill_list, cleanup_list) { list_del_rcu(&net->list); list_add_tail(&net->exit_list, &net_exit_list); - for_each_net(tmp) { - int id = __peernet2id(tmp, net, false); - - if (id >= 0) - idr_remove(&tmp->netns_ids, id); - } - idr_destroy(&net->netns_ids); - } rtnl_unlock(); @@ -398,12 +390,26 @@ static void cleanup_net(struct work_struct *work) */ rcu_barrier(); + rtnl_lock(); /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { + /* Unreference net from all peers (no need to loop over + * net_exit_list because idr_destroy() will be called for each + * element of this list. + */ + for_each_net(peer) { + int id = __peernet2id(peer, net, false); + + if (id >= 0) + idr_remove(&peer->netns_ids, id); + } + idr_destroy(&net->netns_ids); + list_del_init(&net->exit_list); put_user_ns(net->user_ns); net_drop_ns(net); } + rtnl_unlock(); } static DECLARE_WORK(net_cleanup_work, cleanup_net); -- cgit v1.2.3 From 419df12fb5fa558451319276838c1842f2b11f8f Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 31 Mar 2015 11:01:46 -0700 Subject: net: move fib_rules_unregister() under rtnl lock We have to hold rtnl lock for fib_rules_unregister() otherwise the following race could happen: fib_rules_unregister(): fib_nl_delrule(): ... ... ... ops = lookup_rules_ops(); list_del_rcu(&ops->list); list_for_each_entry(ops->rules) { fib_rules_cleanup_ops(ops); ... list_del_rcu(); list_del_rcu(); } Note, net->rules_mod_lock is actually not needed at all, either upper layer netns code or rtnl lock guarantees we are safe. Cc: Alexander Duyck Cc: Thomas Graf Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/fib_rules.c | 2 +- net/decnet/dn_rules.c | 2 ++ net/ipv4/fib_frontend.c | 3 +-- net/ipv4/ipmr.c | 2 +- net/ipv6/fib6_rules.c | 2 ++ net/ipv6/ip6mr.c | 2 +- 6 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 44706e81b2e0..e4fdc9dfb2c7 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -175,9 +175,9 @@ void fib_rules_unregister(struct fib_rules_ops *ops) spin_lock(&net->rules_mod_lock); list_del_rcu(&ops->list); - fib_rules_cleanup_ops(ops); spin_unlock(&net->rules_mod_lock); + fib_rules_cleanup_ops(ops); call_rcu(&ops->rcu, fib_rules_put_rcu); } EXPORT_SYMBOL_GPL(fib_rules_unregister); diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c index faf7cc3483fe..9d66a0f72f90 100644 --- a/net/decnet/dn_rules.c +++ b/net/decnet/dn_rules.c @@ -248,7 +248,9 @@ void __init dn_fib_rules_init(void) void __exit dn_fib_rules_cleanup(void) { + rtnl_lock(); fib_rules_unregister(dn_fib_rules_ops); + rtnl_unlock(); rcu_barrier(); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 57be71dd6a9e..23b9b3e86f4c 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1111,11 +1111,10 @@ static void ip_fib_net_exit(struct net *net) { unsigned int i; + rtnl_lock(); #ifdef CONFIG_IP_MULTIPLE_TABLES fib4_rules_exit(net); #endif - - rtnl_lock(); for (i = 0; i < FIB_TABLE_HASHSZ; i++) { struct fib_table *tb; struct hlist_head *head; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index bc40115bc394..fe54eba6d00d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -283,8 +283,8 @@ static void __net_exit ipmr_rules_exit(struct net *net) list_del(&mrt->list); ipmr_free_table(mrt); } - rtnl_unlock(); fib_rules_unregister(net->ipv4.mr_rules_ops); + rtnl_unlock(); } #else #define ipmr_for_each_table(mrt, net) \ diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 27ca79682efb..70bc6abc0639 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -322,7 +322,9 @@ out_fib6_rules_ops: static void __net_exit fib6_rules_net_exit(struct net *net) { + rtnl_lock(); fib_rules_unregister(net->ipv6.fib6_rules_ops); + rtnl_unlock(); } static struct pernet_operations fib6_rules_net_ops = { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 52028f449a89..2f1fd9ffcb34 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -267,8 +267,8 @@ static void __net_exit ip6mr_rules_exit(struct net *net) list_del(&mrt->list); ip6mr_free_table(mrt); } - rtnl_unlock(); fib_rules_unregister(net->ipv6.mr6_rules_ops); + rtnl_unlock(); } #else #define ip6mr_for_each_table(mrt, net) \ -- cgit v1.2.3 From 6d458f5b4ece8542a5c2281e40008823fec91814 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 3 Apr 2015 12:02:36 +0200 Subject: Revert "netns: don't clear nsid too early on removal" This reverts commit 4217291e592d ("netns: don't clear nsid too early on removal"). This is not the right fix, it introduces races. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/net_namespace.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 5221f975a4cc..cb5290b8c428 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -349,7 +349,7 @@ static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ static void cleanup_net(struct work_struct *work) { const struct pernet_operations *ops; - struct net *net, *tmp, *peer; + struct net *net, *tmp; struct list_head net_kill_list; LIST_HEAD(net_exit_list); @@ -365,6 +365,14 @@ static void cleanup_net(struct work_struct *work) list_for_each_entry(net, &net_kill_list, cleanup_list) { list_del_rcu(&net->list); list_add_tail(&net->exit_list, &net_exit_list); + for_each_net(tmp) { + int id = __peernet2id(tmp, net, false); + + if (id >= 0) + idr_remove(&tmp->netns_ids, id); + } + idr_destroy(&net->netns_ids); + } rtnl_unlock(); @@ -390,26 +398,12 @@ static void cleanup_net(struct work_struct *work) */ rcu_barrier(); - rtnl_lock(); /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { - /* Unreference net from all peers (no need to loop over - * net_exit_list because idr_destroy() will be called for each - * element of this list. - */ - for_each_net(peer) { - int id = __peernet2id(peer, net, false); - - if (id >= 0) - idr_remove(&peer->netns_ids, id); - } - idr_destroy(&net->netns_ids); - list_del_init(&net->exit_list); put_user_ns(net->user_ns); net_drop_ns(net); } - rtnl_unlock(); } static DECLARE_WORK(net_cleanup_work, cleanup_net); -- cgit v1.2.3 From 576b7cd2f6ff1e90b3fc0a000d2fe74f8a50a4bb Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 3 Apr 2015 12:02:37 +0200 Subject: netns: don't allocate an id for dead netns First, let's explain the problem. Suppose you have an ipip interface that stands in the netns foo and its link part in the netns bar (so the netns bar has an nsid into the netns foo). Now, you remove the netns bar: - the bar nsid into the netns foo is removed - the netns exit method of ipip is called, thus our ipip iface is removed: => a netlink message is built in the netns foo to advertise this deletion => this netlink message requests an nsid for bar, thus a new nsid is allocated for bar and never removed. This patch adds a check in peernet2id() so that an id cannot be allocated for a netns which is currently destroyed. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/net_namespace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index cb5290b8c428..70d3450588b2 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -198,8 +198,10 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc) */ int peernet2id(struct net *net, struct net *peer) { - int id = __peernet2id(net, peer, true); + bool alloc = atomic_read(&peer->count) == 0 ? false : true; + int id; + id = __peernet2id(net, peer, alloc); return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; } EXPORT_SYMBOL(peernet2id); -- cgit v1.2.3 From f60e5990d9c1424af9dbca60a23ba2a1c7c1ce90 Mon Sep 17 00:00:00 2001 From: hannes@stressinduktion.org Date: Wed, 1 Apr 2015 17:07:44 +0200 Subject: ipv6: protect skb->sk accesses from recursive dereference inside the stack We should not consult skb->sk for output decisions in xmit recursion levels > 0 in the stack. Otherwise local socket settings could influence the result of e.g. tunnel encapsulation process. ipv6 does not conform with this in three places: 1) ip6_fragment: we do consult ipv6_npinfo for frag_size 2) sk_mc_loop in ipv6 uses skb->sk and checks if we should loop the packet back to the local socket 3) ip6_skb_dst_mtu could query the settings from the user socket and force a wrong MTU Furthermore: In sk_mc_loop we could potentially land in WARN_ON(1) if we use a PF_PACKET socket ontop of an IPv6-backed vxlan device. Reuse xmit_recursion as we are currently only interested in protecting tunnel devices. Cc: Jiri Pirko Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++++ include/net/ip.h | 16 ---------------- include/net/ip6_route.h | 3 ++- include/net/sock.h | 2 ++ net/core/dev.c | 4 +++- net/core/sock.c | 19 +++++++++++++++++++ net/ipv6/ip6_output.c | 3 ++- 7 files changed, 34 insertions(+), 19 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dcf6ec27739b..278738873703 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2185,6 +2185,12 @@ void netdev_freemem(struct net_device *dev); void synchronize_net(void); int init_dummy_netdev(struct net_device *dev); +DECLARE_PER_CPU(int, xmit_recursion); +static inline int dev_recursion_level(void) +{ + return this_cpu_read(xmit_recursion); +} + struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); diff --git a/include/net/ip.h b/include/net/ip.h index 025c61c0dffb..6cc1eafb153a 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -453,22 +453,6 @@ static __inline__ void inet_reset_saddr(struct sock *sk) #endif -static inline int sk_mc_loop(struct sock *sk) -{ - if (!sk) - return 1; - switch (sk->sk_family) { - case AF_INET: - return inet_sk(sk)->mc_loop; -#if IS_ENABLED(CONFIG_IPV6) - case AF_INET6: - return inet6_sk(sk)->mc_loop; -#endif - } - WARN_ON(1); - return 1; -} - bool ip_call_ra_chain(struct sk_buff *skb); /* diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 1d09b46c1e48..eda131d179d9 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -174,7 +174,8 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); static inline int ip6_skb_dst_mtu(struct sk_buff *skb) { - struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; + struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? + inet6_sk(skb->sk) : NULL; return (np && np->pmtudisc >= IPV6_PMTUDISC_PROBE) ? skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); diff --git a/include/net/sock.h b/include/net/sock.h index ab186b1d31ff..e4079c28e6b8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1762,6 +1762,8 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); +bool sk_mc_loop(struct sock *sk); + static inline bool sk_can_gso(const struct sock *sk) { return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type); diff --git a/net/core/dev.c b/net/core/dev.c index 962ee9d71964..45109b70664e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2848,7 +2848,9 @@ static void skb_update_prio(struct sk_buff *skb) #define skb_update_prio(skb) #endif -static DEFINE_PER_CPU(int, xmit_recursion); +DEFINE_PER_CPU(int, xmit_recursion); +EXPORT_SYMBOL(xmit_recursion); + #define RECURSION_LIMIT 10 /** diff --git a/net/core/sock.c b/net/core/sock.c index 78e89eb7eb70..71e3e5f1eaa0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -653,6 +653,25 @@ static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) sock_reset_flag(sk, bit); } +bool sk_mc_loop(struct sock *sk) +{ + if (dev_recursion_level()) + return false; + if (!sk) + return true; + switch (sk->sk_family) { + case AF_INET: + return inet_sk(sk)->mc_loop; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return inet6_sk(sk)->mc_loop; +#endif + } + WARN_ON(1); + return true; +} +EXPORT_SYMBOL(sk_mc_loop); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7e80b61b51ff..36cf0ab685a0 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -542,7 +542,8 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct sk_buff *frag; struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); - struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; + struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? + inet6_sk(skb->sk) : NULL; struct ipv6hdr *tmp_hdr; struct frag_hdr *fh; unsigned int mtu, hlen, left, len; -- cgit v1.2.3