From b5685d2687d6612adf5eac519eb7008f74dfd1ec Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 2 Jun 2018 17:26:34 +0200 Subject: batman-adv: Fix bat_ogm_iv best gw refcnt after netlink dump A reference for the best gateway is taken when the list of gateways in the mesh is sent via netlink. This is necessary to check whether the currently dumped entry is the currently selected gateway or not. This information is then transferred as flag BATADV_ATTR_FLAG_BEST. After the comparison of the current entry is done, batadv_iv_gw_dump_entry() has to decrease the reference counter again. Otherwise the reference will be held and thus prevents a proper shutdown of the batman-adv interfaces (and some of the interfaces enslaved in it). Fixes: efb766af06e3 ("batman-adv: add B.A.T.M.A.N. IV bat_gw_dump implementations") Reported-by: Andreas Ziegler Tested-by: Andreas Ziegler Signed-off-by: Sven Eckelmann Acked-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_iv_ogm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index be09a9883825..73bf6a93a3cf 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -2732,7 +2732,7 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, { struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *router; - struct batadv_gw_node *curr_gw; + struct batadv_gw_node *curr_gw = NULL; int ret = 0; void *hdr; @@ -2780,6 +2780,8 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, ret = 0; out: + if (curr_gw) + batadv_gw_node_put(curr_gw); if (router_ifinfo) batadv_neigh_ifinfo_put(router_ifinfo); if (router) -- cgit v1.2.3 From 9713cb0cf19f1cec6c007e3b37be0697042b6720 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 2 Jun 2018 17:26:35 +0200 Subject: batman-adv: Fix bat_v best gw refcnt after netlink dump A reference for the best gateway is taken when the list of gateways in the mesh is sent via netlink. This is necessary to check whether the currently dumped entry is the currently selected gateway or not. This information is then transferred as flag BATADV_ATTR_FLAG_BEST. After the comparison of the current entry is done, batadv_v_gw_dump_entry() has to decrease the reference counter again. Otherwise the reference will be held and thus prevents a proper shutdown of the batman-adv interfaces (and some of the interfaces enslaved in it). Fixes: b71bb6f924fe ("batman-adv: add B.A.T.M.A.N. V bat_gw_dump implementations") Signed-off-by: Sven Eckelmann Acked-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index ec93337ee259..6baec4e68898 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -927,7 +927,7 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, { struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *router; - struct batadv_gw_node *curr_gw; + struct batadv_gw_node *curr_gw = NULL; int ret = 0; void *hdr; @@ -995,6 +995,8 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, ret = 0; out: + if (curr_gw) + batadv_gw_node_put(curr_gw); if (router_ifinfo) batadv_neigh_ifinfo_put(router_ifinfo); if (router) -- cgit v1.2.3 From 36dc621ceca1be3ec885aeade5fdafbbcc452a6d Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 1 Jun 2018 19:24:23 +0200 Subject: batman-adv: Fix debugfs path for renamed hardif batman-adv is creating special debugfs directories in the init net_namespace for each valid hard-interface (net_device). But it is possible to rename a net_device to a completely different name then the original one. It can therefore happen that a user registers a new net_device which gets the name "wlan0" assigned by default. batman-adv is also adding a new directory under $debugfs/batman-adv/ with the name "wlan0". The user then decides to rename this device to "wl_pri" and registers a different device. The kernel may now decide to use the name "wlan0" again for this new device. batman-adv will detect it as a valid net_device and tries to create a directory with the name "wlan0" under $debugfs/batman-adv/. But there already exists one with this name under this path and thus this fails. batman-adv will detect a problem and rollback the registering of this device. batman-adv must therefore take care of renaming the debugfs directories for hard-interfaces whenever it detects such a net_device rename. Fixes: 5bc7c1eb44f2 ("batman-adv: add debugfs structure for information per interface") Reported-by: John Soros Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/debugfs.c | 20 ++++++++++++++++++++ net/batman-adv/debugfs.h | 6 ++++++ net/batman-adv/hard-interface.c | 3 +++ 3 files changed, 29 insertions(+) (limited to 'net') diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 4229b01ac7b5..7e5de7b9f6d5 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -19,6 +19,7 @@ #include "debugfs.h" #include "main.h" +#include #include #include #include @@ -343,6 +344,25 @@ out: return -ENOMEM; } +/** + * batadv_debugfs_rename_hardif() - Fix debugfs path for renamed hardif + * @hard_iface: hard interface which was renamed + */ +void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface) +{ + const char *name = hard_iface->net_dev->name; + struct dentry *dir; + struct dentry *d; + + dir = hard_iface->debug_dir; + if (!dir) + return; + + d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name); + if (!d) + pr_err("Can't rename debugfs dir to %s\n", name); +} + /** * batadv_debugfs_del_hardif() - delete the base directory for a hard interface * in debugfs. diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 37b069698b04..8538a7a75e93 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -32,6 +32,7 @@ void batadv_debugfs_destroy(void); int batadv_debugfs_add_meshif(struct net_device *dev); void batadv_debugfs_del_meshif(struct net_device *dev); int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface); +void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface); void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface); #else @@ -59,6 +60,11 @@ int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface) return 0; } +static inline +void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface) +{ +} + static inline void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface) { diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index c405d15befd6..dc2763b11107 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1051,6 +1051,9 @@ static int batadv_hard_if_event(struct notifier_block *this, if (batadv_is_wifi_hardif(hard_iface)) hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS; break; + case NETDEV_CHANGENAME: + batadv_debugfs_rename_hardif(hard_iface); + break; default: break; } -- cgit v1.2.3 From 6da7be7d24b2921f8215473ba7552796dff05fe1 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 1 Jun 2018 19:24:24 +0200 Subject: batman-adv: Fix debugfs path for renamed softif batman-adv is creating special debugfs directories in the init net_namespace for each created soft-interface (batadv net_device). But it is possible to rename a net_device to a completely different name then the original one. It can therefore happen that a user registers a new batadv net_device with the name "bat0". batman-adv is then also adding a new directory under $debugfs/batman-adv/ with the name "wlan0". The user then decides to rename this device to "bat1" and registers a different batadv device with the name "bat0". batman-adv will then try to create a directory with the name "bat0" under $debugfs/batman-adv/ again. But there already exists one with this name under this path and thus this fails. batman-adv will detect a problem and rollback the registering of this device. batman-adv must therefore take care of renaming the debugfs directories for soft-interfaces whenever it detects such a net_device rename. Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/debugfs.c | 20 ++++++++++++++++++++ net/batman-adv/debugfs.h | 5 +++++ net/batman-adv/hard-interface.c | 34 ++++++++++++++++++++++++++++------ 3 files changed, 53 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 7e5de7b9f6d5..87479c60670e 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -433,6 +433,26 @@ out: return -ENOMEM; } +/** + * batadv_debugfs_rename_meshif() - Fix debugfs path for renamed softif + * @dev: net_device which was renamed + */ +void batadv_debugfs_rename_meshif(struct net_device *dev) +{ + struct batadv_priv *bat_priv = netdev_priv(dev); + const char *name = dev->name; + struct dentry *dir; + struct dentry *d; + + dir = bat_priv->debug_dir; + if (!dir) + return; + + d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name); + if (!d) + pr_err("Can't rename debugfs dir to %s\n", name); +} + /** * batadv_debugfs_del_meshif() - Remove interface dependent debugfs entries * @dev: netdev struct of the soft interface diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 8538a7a75e93..08a592ffbee5 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -30,6 +30,7 @@ struct net_device; void batadv_debugfs_init(void); void batadv_debugfs_destroy(void); int batadv_debugfs_add_meshif(struct net_device *dev); +void batadv_debugfs_rename_meshif(struct net_device *dev); void batadv_debugfs_del_meshif(struct net_device *dev); int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface); void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface); @@ -50,6 +51,10 @@ static inline int batadv_debugfs_add_meshif(struct net_device *dev) return 0; } +static inline void batadv_debugfs_rename_meshif(struct net_device *dev) +{ +} + static inline void batadv_debugfs_del_meshif(struct net_device *dev) { } diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index dc2763b11107..2f0d42f2f913 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -989,6 +989,32 @@ void batadv_hardif_remove_interfaces(void) rtnl_unlock(); } +/** + * batadv_hard_if_event_softif() - Handle events for soft interfaces + * @event: NETDEV_* event to handle + * @net_dev: net_device which generated an event + * + * Return: NOTIFY_* result + */ +static int batadv_hard_if_event_softif(unsigned long event, + struct net_device *net_dev) +{ + struct batadv_priv *bat_priv; + + switch (event) { + case NETDEV_REGISTER: + batadv_sysfs_add_meshif(net_dev); + bat_priv = netdev_priv(net_dev); + batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS); + break; + case NETDEV_CHANGENAME: + batadv_debugfs_rename_meshif(net_dev); + break; + } + + return NOTIFY_DONE; +} + static int batadv_hard_if_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -997,12 +1023,8 @@ static int batadv_hard_if_event(struct notifier_block *this, struct batadv_hard_iface *primary_if = NULL; struct batadv_priv *bat_priv; - if (batadv_softif_is_valid(net_dev) && event == NETDEV_REGISTER) { - batadv_sysfs_add_meshif(net_dev); - bat_priv = netdev_priv(net_dev); - batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS); - return NOTIFY_DONE; - } + if (batadv_softif_is_valid(net_dev)) + return batadv_hard_if_event_softif(event, net_dev); hard_iface = batadv_hardif_get_by_netdev(net_dev); if (!hard_iface && (event == NETDEV_REGISTER || -- cgit v1.2.3 From 4a519b83da16927fb98fd32b0f598e639d1f1859 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Thu, 7 Jun 2018 00:46:23 +0200 Subject: batman-adv: Avoid storing non-TT-sync flags on singular entries too Since commit 54e22f265e87 ("batman-adv: fix TT sync flag inconsistencies") TT sync flags and TT non-sync'd flags are supposed to be stored separately. The previous patch missed to apply this separation on a TT entry with only a single TT orig entry. This is a minor fix because with only a single TT orig entry the DDoS issue the former patch solves does not apply. Fixes: 54e22f265e87 ("batman-adv: fix TT sync flag inconsistencies") Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/translation-table.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 3986551397ca..61ce300091f3 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1705,7 +1705,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, ether_addr_copy(common->addr, tt_addr); common->vid = vid; - common->flags = flags; + common->flags = flags & (~BATADV_TT_SYNC_MASK); + tt_global_entry->roam_at = 0; /* node must store current time in case of roaming. This is * needed to purge this entry out on timeout (if nobody claims -- cgit v1.2.3 From a44ebeff6bbd6ef50db41b4195fca87b21aefd20 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Thu, 7 Jun 2018 00:46:24 +0200 Subject: batman-adv: Fix multicast TT issues with bogus ROAM flags When a (broken) node wrongly sends multicast TT entries with a ROAM flag then this causes any receiving node to drop all entries for the same multicast MAC address announced by other nodes, leading to packet loss. Fix this DoS vector by only storing TT sync flags. For multicast TT non-sync'ing flag bits like ROAM are unused so far anyway. Fixes: 1d8ab8d3c176 ("batman-adv: Modified forwarding behaviour for multicast packets") Reported-by: Leonardo Mörlein Signed-off-by: Linus Lüssing Signed-off-by: Simon Wunderlich --- net/batman-adv/translation-table.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 61ce300091f3..12a2b7d21376 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1705,7 +1705,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, ether_addr_copy(common->addr, tt_addr); common->vid = vid; - common->flags = flags & (~BATADV_TT_SYNC_MASK); + if (!is_multicast_ether_addr(common->addr)) + common->flags = flags & (~BATADV_TT_SYNC_MASK); tt_global_entry->roam_at = 0; /* node must store current time in case of roaming. This is @@ -1769,7 +1770,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, * TT_CLIENT_TEMP, therefore they have to be copied in the * client entry */ - common->flags |= flags & (~BATADV_TT_SYNC_MASK); + if (!is_multicast_ether_addr(common->addr)) + common->flags |= flags & (~BATADV_TT_SYNC_MASK); /* If there is the BATADV_TT_CLIENT_ROAM flag set, there is only * one originator left in the list and we previously received a -- cgit v1.2.3 From 20b52a75166086a40d838397ef3db28a4f2c5998 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 29 Jun 2018 09:48:17 +0200 Subject: xsk: fix potential lost completion message in SKB path The code in xskq_produce_addr erroneously checked if there was up to LAZY_UPDATE_THRESHOLD amount of space in the completion queue. It only needs to check if there is one slot left in the queue. This bug could under some circumstances lead to a WARN_ON_ONCE being triggered and the completion message to user space being lost. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Magnus Karlsson Reported-by: Pavel Odintsov Signed-off-by: Alexei Starovoitov --- net/xdp/xsk_queue.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index ef6a6f0ec949..52ecaf770642 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -62,14 +62,9 @@ static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) return (entries > dcnt) ? dcnt : entries; } -static inline u32 xskq_nb_free_lazy(struct xsk_queue *q, u32 producer) -{ - return q->nentries - (producer - q->cons_tail); -} - static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) { - u32 free_entries = xskq_nb_free_lazy(q, producer); + u32 free_entries = q->nentries - (producer - q->cons_tail); if (free_entries >= dcnt) return free_entries; @@ -129,7 +124,7 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - if (xskq_nb_free(q, q->prod_tail, LAZY_UPDATE_THRESHOLD) == 0) + if (xskq_nb_free(q, q->prod_tail, 1) == 0) return -ENOSPC; ring->desc[q->prod_tail++ & q->ring_mask] = addr; -- cgit v1.2.3 From fe5886852601fb2593cbc5a7549ef9fd2ef481ba Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 29 Jun 2018 09:48:18 +0200 Subject: xsk: frame could be completed more than once in SKB path Fixed a bug in which a frame could be completed more than once when an error was returned from dev_direct_xmit(). The code erroneously retried sending the message leading to multiple calls to the SKB destructor and therefore multiple completions of the same buffer to user space. The error code in this case has been changed from EAGAIN to EBUSY in order to tell user space that the sending of the packet failed and the buffer has been return to user space through the completion queue. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Magnus Karlsson Reported-by: Pavel Odintsov Signed-off-by: Alexei Starovoitov --- net/xdp/xsk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 59fb7d3c36a3..15aca73805fc 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -268,15 +268,15 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, skb->destructor = xsk_destruct_skb; err = dev_direct_xmit(skb, xs->queue_id); + xskq_discard_desc(xs->tx); /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) { - err = -EAGAIN; - /* SKB consumed by dev_direct_xmit() */ + /* SKB completed but not sent */ + err = -EBUSY; goto out; } sent_frame = true; - xskq_discard_desc(xs->tx); } out: -- cgit v1.2.3 From a9744f7ca200c756e6f8c65b633770a2da711651 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 29 Jun 2018 09:48:20 +0200 Subject: xsk: fix potential race in SKB TX completion code There is a potential race in the TX completion code for the SKB case. One process enters the sendmsg code of an AF_XDP socket in order to send a frame. The execution eventually trickles down to the driver that is told to send the packet. However, it decides to drop the packet due to some error condition (e.g., rings full) and frees the SKB. This will trigger the SKB destructor and a completion will be sent to the AF_XDP user space through its single-producer/single-consumer queues. At the same time a TX interrupt has fired on another core and it dispatches the TX completion code in the driver. It does its HW specific things and ends up freeing the SKB associated with the transmitted packet. This will trigger the SKB destructor and a completion will be sent to the AF_XDP user space through its single-producer/single-consumer queues. With a pseudo call stack, it would look like this: Core 1: sendmsg() being called in the application netdev_start_xmit() Driver entered through ndo_start_xmit Driver decides to free the SKB for some reason (e.g., rings full) Destructor of SKB called xskq_produce_addr() is called to signal completion to user space Core 2: TX completion irq NAPI loop Driver irq handler for TX completions Frees the SKB Destructor of SKB called xskq_produce_addr() is called to signal completion to user space We now have a violation of the single-producer/single-consumer principle for our queues as there are two threads trying to produce at the same time on the same queue. Fixed by introducing a spin_lock in the destructor. In regards to the performance, I get around 1.74 Mpps for txonly before and after the introduction of the spinlock. There is of course some impact due to the spin lock but it is in the less significant digits that are too noisy for me to measure. But let us say that the version without the spin lock got 1.745 Mpps in the best case and the version with 1.735 Mpps in the worst case, then that would mean a maximum drop in performance of 0.5%. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 4 ++++ net/xdp/xsk.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 9fe472f2ac95..7161856bcf9c 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -60,6 +60,10 @@ struct xdp_sock { bool zc; /* Protects multiple processes in the control path */ struct mutex mutex; + /* Mutual exclusion of NAPI TX thread and sendmsg error paths + * in the SKB destructor callback. + */ + spinlock_t tx_completion_lock; u64 rx_dropped; }; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 15aca73805fc..7d220cbd09b6 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -199,8 +199,11 @@ static void xsk_destruct_skb(struct sk_buff *skb) { u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg; struct xdp_sock *xs = xdp_sk(skb->sk); + unsigned long flags; + spin_lock_irqsave(&xs->tx_completion_lock, flags); WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr)); + spin_unlock_irqrestore(&xs->tx_completion_lock, flags); sock_wfree(skb); } @@ -755,6 +758,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, xs = xdp_sk(sk); mutex_init(&xs->mutex); + spin_lock_init(&xs->tx_completion_lock); local_bh_disable(); sock_prot_inuse_add(net, &xsk_proto, 1); -- cgit v1.2.3 From 52ee6ef36ee10dd493cf2067311e56ca8015eb8d Mon Sep 17 00:00:00 2001 From: Doron Roberts-Kedes Date: Mon, 2 Jul 2018 10:25:05 -0700 Subject: tls: fix skb_to_sgvec returning unhandled error. The current code does not inspect the return value of skb_to_sgvec. This can cause a nullptr kernel panic when the malformed sgvec is passed into the crypto request. Checking the return value of skb_to_sgvec and skipping decryption if it is negative fixes this problem. Fixes: c46234ebb4d1 ("tls: RX path for ktls") Acked-by: Dave Watson Signed-off-by: Doron Roberts-Kedes Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index d2380548f8f6..7818011fd250 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -701,6 +701,10 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb, nsg = skb_to_sgvec(skb, &sgin[1], rxm->offset + tls_ctx->rx.prepend_size, rxm->full_len - tls_ctx->rx.prepend_size); + if (nsg < 0) { + ret = nsg; + goto out; + } tls_make_aad(ctx->rx_aad_ciphertext, rxm->full_len - tls_ctx->rx.overhead_size, @@ -712,6 +716,7 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb, rxm->full_len - tls_ctx->rx.overhead_size, skb, sk->sk_allocation); +out: if (sgin != &sgin_arr[0]) kfree(sgin); -- cgit v1.2.3 From 410da1e12ffed61129d61df5b7adce4d08c7f17c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 3 Jul 2018 09:53:43 -0700 Subject: net/smc: fix up merge error with poll changes My networking merge (commit 4e33d7d47943: "Pull networking fixes from David Miller") got the poll() handling conflict wrong for af_smc. The conflict between my a11e1d432b51 ("Revert changes to convert to ->poll_mask() and aio IOCB_CMD_POLL") and Ursula Braun's 24ac3a08e658 ("net/smc: rebuild nonblocking connect") should have left the call to sock_poll_wait() in place, just without the socket lock release/retake. And I really should have realized that. But happily, I at least asked Ursula to double-check the merge, and she set me right. This also fixes an incidental whitespace issue nearby that annoyed me while looking at this. Pointed-out-by: Ursula Braun Cc: David Miller Signed-off-by: Linus Torvalds --- net/smc/af_smc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e017b6a4452b..3c1405df936c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1345,6 +1345,8 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, if (sk->sk_err) mask |= EPOLLERR; } else { + if (sk->sk_state != SMC_CLOSED) + sock_poll_wait(file, sk_sleep(sk), wait); if (sk->sk_err) mask |= EPOLLERR; if ((sk->sk_shutdown == SHUTDOWN_MASK) || @@ -1370,7 +1372,6 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, } if (smc->conn.urg_state == SMC_URG_VALID) mask |= EPOLLPRI; - } return mask; -- cgit v1.2.3 From d5a672ac9f48f81b20b1cad1d9ed7bbf4e418d4c Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Mon, 2 Jul 2018 22:52:20 +0200 Subject: gen_stats: Fix netlink stats dumping in the presence of padding The gen_stats facility will add a header for the toplevel nlattr of type TCA_STATS2 that contains all stats added by qdisc callbacks. A reference to this header is stored in the gnet_dump struct, and when all the per-qdisc callbacks have finished adding their stats, the length of the containing header will be adjusted to the right value. However, on architectures that need padding (i.e., that don't set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS), the padding nlattr is added before the stats, which means that the stored pointer will point to the padding, and so when the header is fixed up, the result is just a very big padding nlattr. Because most qdiscs also supply the legacy TCA_STATS struct, this problem has been mostly invisible, but we exposed it with the netlink attribute-based statistics in CAKE. Fix the issue by fixing up the stored pointer if it points to a padding nlattr. Tested-by: Pete Heist Tested-by: Kevin Darbyshire-Bryant Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/core/gen_stats.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index b2b2323bdc84..188d693cb251 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -77,8 +77,20 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, d->lock = lock; spin_lock_bh(lock); } - if (d->tail) - return gnet_stats_copy(d, type, NULL, 0, padattr); + if (d->tail) { + int ret = gnet_stats_copy(d, type, NULL, 0, padattr); + + /* The initial attribute added in gnet_stats_copy() may be + * preceded by a padding attribute, in which case d->tail will + * end up pointing at the padding instead of the real attribute. + * Fix this so gnet_stats_finish_copy() adjusts the length of + * the right attribute. + */ + if (ret == 0 && d->tail->nla_type == padattr) + d->tail = (struct nlattr *)((char *)d->tail + + NLA_ALIGN(d->tail->nla_len)); + return ret; + } return 0; } -- cgit v1.2.3 From 33bd5ac54dc47e002da4a395aaf9bf158dd17709 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 3 Jul 2018 14:36:21 -0700 Subject: net/ipv6: Revert attempt to simplify route replace and append NetworkManager likes to manage linklocal prefix routes and does so with the NLM_F_APPEND flag, breaking attempts to simplify the IPv6 route code and by extension enable multipath routes with device only nexthops. Revert f34436a43092 and these followup patches: 6eba08c3626b ("ipv6: Only emit append events for appended routes"). ce45bded6435 ("mlxsw: spectrum_router: Align with new route replace logic") 53b562df8c20 ("mlxsw: spectrum_router: Allow appending to dev-only routes") Update the fib_tests cases to reflect the old behavior. Fixes: f34436a43092 ("net/ipv6: Simplify route replace and appending into multipath route") Signed-off-by: David Ahern --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 48 +++---- include/net/ip6_route.h | 6 + net/ipv6/ip6_fib.c | 156 ++++++++++++--------- net/ipv6/route.c | 3 +- tools/testing/selftests/net/fib_tests.sh | 41 ------ 5 files changed, 117 insertions(+), 137 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 6aaaf3d9ba31..77b2adb29341 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -4756,6 +4756,12 @@ static void mlxsw_sp_rt6_destroy(struct mlxsw_sp_rt6 *mlxsw_sp_rt6) kfree(mlxsw_sp_rt6); } +static bool mlxsw_sp_fib6_rt_can_mp(const struct fib6_info *rt) +{ + /* RTF_CACHE routes are ignored */ + return (rt->fib6_flags & (RTF_GATEWAY | RTF_ADDRCONF)) == RTF_GATEWAY; +} + static struct fib6_info * mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry) { @@ -4765,11 +4771,11 @@ mlxsw_sp_fib6_entry_rt(const struct mlxsw_sp_fib6_entry *fib6_entry) static struct mlxsw_sp_fib6_entry * mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, - const struct fib6_info *nrt, bool append) + const struct fib6_info *nrt, bool replace) { struct mlxsw_sp_fib6_entry *fib6_entry; - if (!append) + if (!mlxsw_sp_fib6_rt_can_mp(nrt) || replace) return NULL; list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { @@ -4784,7 +4790,8 @@ mlxsw_sp_fib6_node_mp_entry_find(const struct mlxsw_sp_fib_node *fib_node, break; if (rt->fib6_metric < nrt->fib6_metric) continue; - if (rt->fib6_metric == nrt->fib6_metric) + if (rt->fib6_metric == nrt->fib6_metric && + mlxsw_sp_fib6_rt_can_mp(rt)) return fib6_entry; if (rt->fib6_metric > nrt->fib6_metric) break; @@ -5163,7 +5170,7 @@ static struct mlxsw_sp_fib6_entry * mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node, const struct fib6_info *nrt, bool replace) { - struct mlxsw_sp_fib6_entry *fib6_entry; + struct mlxsw_sp_fib6_entry *fib6_entry, *fallback = NULL; list_for_each_entry(fib6_entry, &fib_node->entry_list, common.list) { struct fib6_info *rt = mlxsw_sp_fib6_entry_rt(fib6_entry); @@ -5172,13 +5179,18 @@ mlxsw_sp_fib6_node_entry_find(const struct mlxsw_sp_fib_node *fib_node, continue; if (rt->fib6_table->tb6_id != nrt->fib6_table->tb6_id) break; - if (replace && rt->fib6_metric == nrt->fib6_metric) - return fib6_entry; + if (replace && rt->fib6_metric == nrt->fib6_metric) { + if (mlxsw_sp_fib6_rt_can_mp(rt) == + mlxsw_sp_fib6_rt_can_mp(nrt)) + return fib6_entry; + if (mlxsw_sp_fib6_rt_can_mp(nrt)) + fallback = fallback ?: fib6_entry; + } if (rt->fib6_metric > nrt->fib6_metric) - return fib6_entry; + return fallback ?: fib6_entry; } - return NULL; + return fallback; } static int @@ -5304,8 +5316,7 @@ static void mlxsw_sp_fib6_entry_replace(struct mlxsw_sp *mlxsw_sp, } static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, - struct fib6_info *rt, bool replace, - bool append) + struct fib6_info *rt, bool replace) { struct mlxsw_sp_fib6_entry *fib6_entry; struct mlxsw_sp_fib_node *fib_node; @@ -5331,7 +5342,7 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, /* Before creating a new entry, try to append route to an existing * multipath entry. */ - fib6_entry = mlxsw_sp_fib6_node_mp_entry_find(fib_node, rt, append); + fib6_entry = mlxsw_sp_fib6_node_mp_entry_find(fib_node, rt, replace); if (fib6_entry) { err = mlxsw_sp_fib6_entry_nexthop_add(mlxsw_sp, fib6_entry, rt); if (err) @@ -5339,14 +5350,6 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, return 0; } - /* We received an append event, yet did not find any route to - * append to. - */ - if (WARN_ON(append)) { - err = -EINVAL; - goto err_fib6_entry_append; - } - fib6_entry = mlxsw_sp_fib6_entry_create(mlxsw_sp, fib_node, rt); if (IS_ERR(fib6_entry)) { err = PTR_ERR(fib6_entry); @@ -5364,7 +5367,6 @@ static int mlxsw_sp_router_fib6_add(struct mlxsw_sp *mlxsw_sp, err_fib6_node_entry_link: mlxsw_sp_fib6_entry_destroy(mlxsw_sp, fib6_entry); err_fib6_entry_create: -err_fib6_entry_append: err_fib6_entry_nexthop_add: mlxsw_sp_fib_node_put(mlxsw_sp, fib_node); return err; @@ -5715,7 +5717,7 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work) struct mlxsw_sp_fib_event_work *fib_work = container_of(work, struct mlxsw_sp_fib_event_work, work); struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp; - bool replace, append; + bool replace; int err; rtnl_lock(); @@ -5726,10 +5728,8 @@ static void mlxsw_sp_router_fib6_event_work(struct work_struct *work) case FIB_EVENT_ENTRY_APPEND: /* fall through */ case FIB_EVENT_ENTRY_ADD: replace = fib_work->event == FIB_EVENT_ENTRY_REPLACE; - append = fib_work->event == FIB_EVENT_ENTRY_APPEND; err = mlxsw_sp_router_fib6_add(mlxsw_sp, - fib_work->fen6_info.rt, replace, - append); + fib_work->fen6_info.rt, replace); if (err) mlxsw_sp_router_fib_abort(mlxsw_sp); mlxsw_sp_rt6_release(fib_work->fen6_info.rt); diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 59656fc580df..7b9c82de11cc 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -66,6 +66,12 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } +static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) +{ + return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == + RTF_GATEWAY; +} + void ip6_route_input(struct sk_buff *skb); struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 1fb2f3118d60..d212738e9d10 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -935,20 +935,19 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); - enum fib_event_type event = FIB_EVENT_ENTRY_ADD; - struct fib6_info *iter = NULL, *match = NULL; + struct fib6_info *iter = NULL; struct fib6_info __rcu **ins; + struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); - int append = (info->nlh && - (info->nlh->nlmsg_flags & NLM_F_APPEND)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; + bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); u16 nlflags = NLM_F_EXCL; int err; - if (append) + if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) nlflags |= NLM_F_APPEND; ins = &fn->leaf; @@ -970,8 +969,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, nlflags &= ~NLM_F_EXCL; if (replace) { - found++; - break; + if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { + found++; + break; + } + if (rt_can_ecmp) + fallback_ins = fallback_ins ?: ins; + goto next_iter; } if (rt6_duplicate_nexthop(iter, rt)) { @@ -986,51 +990,71 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } - - /* first route that matches */ - if (!match) - match = iter; + /* If we have the same destination and the same metric, + * but not the same gateway, then the route we try to + * add is sibling to this route, increment our counter + * of siblings, and later we will add our route to the + * list. + * Only static routes (which don't have flag + * RTF_EXPIRES) are used for ECMPv6. + * + * To avoid long list, we only had siblings if the + * route have a gateway. + */ + if (rt_can_ecmp && + rt6_qualify_for_ecmp(iter)) + rt->fib6_nsiblings++; } if (iter->fib6_metric > rt->fib6_metric) break; +next_iter: ins = &iter->fib6_next; } + if (fallback_ins && !found) { + /* No ECMP-able route found, replace first non-ECMP one */ + ins = fallback_ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + found++; + } + /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to others same route. */ - if (append && match) { + if (rt->fib6_nsiblings) { + unsigned int fib6_nsiblings; struct fib6_info *sibling, *temp_sibling; - if (rt->fib6_flags & RTF_REJECT) { - NL_SET_ERR_MSG(extack, - "Can not append a REJECT route"); - return -EINVAL; - } else if (match->fib6_flags & RTF_REJECT) { - NL_SET_ERR_MSG(extack, - "Can not append to a REJECT route"); - return -EINVAL; + /* Find the first route that have the same metric */ + sibling = leaf; + while (sibling) { + if (sibling->fib6_metric == rt->fib6_metric && + rt6_qualify_for_ecmp(sibling)) { + list_add_tail(&rt->fib6_siblings, + &sibling->fib6_siblings); + break; + } + sibling = rcu_dereference_protected(sibling->fib6_next, + lockdep_is_held(&rt->fib6_table->tb6_lock)); } - event = FIB_EVENT_ENTRY_APPEND; - rt->fib6_nsiblings = match->fib6_nsiblings; - list_add_tail(&rt->fib6_siblings, &match->fib6_siblings); - match->fib6_nsiblings++; - /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ + fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, - &match->fib6_siblings, fib6_siblings) { + &rt->fib6_siblings, fib6_siblings) { sibling->fib6_nsiblings++; - BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings); + BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); + fib6_nsiblings++; } - - rt6_multipath_rebalance(match); + BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); + rt6_multipath_rebalance(temp_sibling); } /* @@ -1043,8 +1067,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, add: nlflags |= NLM_F_CREATE; - err = call_fib6_entry_notifiers(info->nl_net, event, rt, - extack); + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_ADD, + rt, extack); if (err) return err; @@ -1062,7 +1087,7 @@ add: } } else { - struct fib6_info *tmp; + int nsiblings; if (!found) { if (add) @@ -1077,57 +1102,48 @@ add: if (err) return err; - /* if route being replaced has siblings, set tmp to - * last one, otherwise tmp is current route. this is - * used to set fib6_next for new route - */ - if (iter->fib6_nsiblings) - tmp = list_last_entry(&iter->fib6_siblings, - struct fib6_info, - fib6_siblings); - else - tmp = iter; - - /* insert new route */ atomic_inc(&rt->fib6_ref); rcu_assign_pointer(rt->fib6_node, fn); - rt->fib6_next = tmp->fib6_next; + rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); - if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } + nsiblings = iter->fib6_nsiblings; + iter->fib6_node = NULL; + fib6_purge_rt(iter, fn, info->nl_net); + if (rcu_access_pointer(fn->rr_ptr) == iter) + fn->rr_ptr = NULL; + fib6_info_release(iter); - /* delete old route */ - rt = iter; - - if (rt->fib6_nsiblings) { - struct fib6_info *tmp; - + if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ - list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings, - fib6_siblings) { - iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); - if (rcu_access_pointer(fn->rr_ptr) == iter) - fn->rr_ptr = NULL; - fib6_info_release(iter); - - rt->fib6_nsiblings--; - info->nl_net->ipv6.rt6_stats->fib_rt_entries--; + ins = &rt->fib6_next; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + while (iter) { + if (iter->fib6_metric > rt->fib6_metric) + break; + if (rt6_qualify_for_ecmp(iter)) { + *ins = iter->fib6_next; + iter->fib6_node = NULL; + fib6_purge_rt(iter, fn, info->nl_net); + if (rcu_access_pointer(fn->rr_ptr) == iter) + fn->rr_ptr = NULL; + fib6_info_release(iter); + nsiblings--; + info->nl_net->ipv6.rt6_stats->fib_rt_entries--; + } else { + ins = &iter->fib6_next; + } + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->fib6_table->tb6_lock)); } + WARN_ON(nsiblings != 0); } - - WARN_ON(rt->fib6_nsiblings != 0); - - rt->fib6_node = NULL; - fib6_purge_rt(rt, fn, info->nl_net); - if (rcu_access_pointer(fn->rr_ptr) == rt) - fn->rr_ptr = NULL; - fib6_info_release(rt); } return 0; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 86a0e4333d42..63f99411f0de 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3842,7 +3842,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric == rt->fib6_metric && - iter->fib6_nsiblings) + rt6_qualify_for_ecmp(iter)) return iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); @@ -4439,7 +4439,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, */ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_REPLACE); - cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND; nhn++; } diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 78245d60d8bc..0f45633bd634 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -740,13 +740,6 @@ ipv6_rt_add() run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64" log_test $? 2 "Attempt to add duplicate route - reject route" - # iproute2 prepend only sets NLM_F_CREATE - # - adds a new route; does NOT convert existing route to ECMP - add_route6 "2001:db8:104::/64" "via 2001:db8:101::2" - run_cmd "$IP -6 ro prepend 2001:db8:104::/64 via 2001:db8:103::2" - check_route6 "2001:db8:104::/64 via 2001:db8:101::2 dev veth1 metric 1024 2001:db8:104::/64 via 2001:db8:103::2 dev veth3 metric 1024" - log_test $? 0 "Add new route for existing prefix (w/o NLM_F_EXCL)" - # route append with same prefix adds a new route # - iproute2 sets NLM_F_CREATE | NLM_F_APPEND add_route6 "2001:db8:104::/64" "via 2001:db8:101::2" @@ -754,27 +747,6 @@ ipv6_rt_add() check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1" log_test $? 0 "Append nexthop to existing route - gw" - add_route6 "2001:db8:104::/64" "via 2001:db8:101::2" - run_cmd "$IP -6 ro append 2001:db8:104::/64 dev veth3" - check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop dev veth3 weight 1" - log_test $? 0 "Append nexthop to existing route - dev only" - - # multipath route can not have a nexthop that is a reject route - add_route6 "2001:db8:104::/64" "via 2001:db8:101::2" - run_cmd "$IP -6 ro append unreachable 2001:db8:104::/64" - log_test $? 2 "Append nexthop to existing route - reject route" - - # reject route can not be converted to multipath route - run_cmd "$IP -6 ro flush 2001:db8:104::/64" - run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64" - run_cmd "$IP -6 ro append 2001:db8:104::/64 via 2001:db8:103::2" - log_test $? 2 "Append nexthop to existing reject route - gw" - - run_cmd "$IP -6 ro flush 2001:db8:104::/64" - run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64" - run_cmd "$IP -6 ro append 2001:db8:104::/64 dev veth3" - log_test $? 2 "Append nexthop to existing reject route - dev only" - # insert mpath directly add_route6 "2001:db8:104::/64" "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1" @@ -819,13 +791,6 @@ ipv6_rt_replace_single() check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::3 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1" log_test $? 0 "Single path with multipath" - # single path with reject - # - add_initial_route6 "nexthop via 2001:db8:101::2" - run_cmd "$IP -6 ro replace unreachable 2001:db8:104::/64" - check_route6 "unreachable 2001:db8:104::/64 dev lo metric 1024" - log_test $? 0 "Single path with reject route" - # single path with single path using MULTIPATH attribute # add_initial_route6 "via 2001:db8:101::2" @@ -873,12 +838,6 @@ ipv6_rt_replace_mpath() check_route6 "2001:db8:104::/64 via 2001:db8:101::3 dev veth1 metric 1024" log_test $? 0 "Multipath with single path via multipath attribute" - # multipath with reject - add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" - run_cmd "$IP -6 ro replace unreachable 2001:db8:104::/64" - check_route6 "unreachable 2001:db8:104::/64 dev lo metric 1024" - log_test $? 0 "Multipath with reject route" - # route replace fails - invalid nexthop 1 add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:111::3 nexthop via 2001:db8:103::3" -- cgit v1.2.3 From a65925475571953da12a9bc2082aec29d4e2c0e7 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 3 Jul 2018 16:30:47 +0800 Subject: sctp: fix the issue that pathmtu may be set lower than MINSEGMENT After commit b6c5734db070 ("sctp: fix the handling of ICMP Frag Needed for too small MTUs"), sctp_transport_update_pmtu would refetch pathmtu from the dst and set it to transport's pathmtu without any check. The new pathmtu may be lower than MINSEGMENT if the dst is obsolete and updated by .get_dst() in sctp_transport_update_pmtu. In this case, it could have a smaller MTU as well, and thus we should validate it against MINSEGMENT instead. Syzbot reported a warning in sctp_mtu_payload caused by this. This patch refetches the pathmtu by calling sctp_dst_mtu where it does the check against MINSEGMENT. v1->v2: - refetch the pathmtu by calling sctp_dst_mtu instead as Marcelo's suggestion. Fixes: b6c5734db070 ("sctp: fix the handling of ICMP Frag Needed for too small MTUs") Reported-by: syzbot+f0d9d7cba052f9344b03@syzkaller.appspotmail.com Suggested-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 445b7ef61677..12cac85da994 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -282,7 +282,7 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) if (dst) { /* Re-fetch, as under layers may have a higher minimum size */ - pmtu = SCTP_TRUNC4(dst_mtu(dst)); + pmtu = sctp_dst_mtu(dst); change = t->pathmtu != pmtu; } t->pathmtu = pmtu; -- cgit v1.2.3 From b30c122c0bbb0a1dc413085e177ea09467e65fdb Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Mon, 2 Jul 2018 11:21:47 +0200 Subject: ieee802154: 6lowpan: set IFLA_LINK Otherwise NetworkManager (and iproute alike) is not able to identify the parent IEEE 802.15.4 interface of a 6LoWPAN link. Signed-off-by: Lubomir Rintel Acked-by: Alexander Aring Signed-off-by: Stefan Schmidt --- net/ieee802154/6lowpan/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 275449b0d633..3297e7fa9945 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -90,12 +90,18 @@ static int lowpan_neigh_construct(struct net_device *dev, struct neighbour *n) return 0; } +static int lowpan_get_iflink(const struct net_device *dev) +{ + return lowpan_802154_dev(dev)->wdev->ifindex; +} + static const struct net_device_ops lowpan_netdev_ops = { .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, .ndo_neigh_construct = lowpan_neigh_construct, + .ndo_get_iflink = lowpan_get_iflink, }; static void lowpan_setup(struct net_device *ldev) -- cgit v1.2.3 From d376bef9c29b3c65aeee4e785fffcd97ef0a9a81 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 4 Jul 2018 20:25:32 +0200 Subject: netfilter: x_tables: set module owner for icmp(6) matches nft_compat relies on xt_request_find_match to increment refcount of the module that provides the match/target. The (builtin) icmp matches did't set the module owner so it was possible to rmmod ip(6)tables while icmp extensions were still in use. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/ip_tables.c | 1 + net/ipv6/netfilter/ip6_tables.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index ca0dad90803a..e77872c93c20 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1898,6 +1898,7 @@ static struct xt_match ipt_builtin_mt[] __read_mostly = { .checkentry = icmp_checkentry, .proto = IPPROTO_ICMP, .family = NFPROTO_IPV4, + .me = THIS_MODULE, }, }; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 7eab959734bc..daf2e9e9193d 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1909,6 +1909,7 @@ static struct xt_match ip6t_builtin_mt[] __read_mostly = { .checkentry = icmp6_checkentry, .proto = IPPROTO_ICMPV6, .family = NFPROTO_IPV6, + .me = THIS_MODULE, }, }; -- cgit v1.2.3 From a9ba23d48dbc6ffd08426bb10f05720e0b9f5c14 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 4 Jul 2018 09:58:05 -0400 Subject: ipv6: make ipv6_renew_options() interrupt/kernel safe At present the ipv6_renew_options_kern() function ends up calling into access_ok() which is problematic if done from inside an interrupt as access_ok() calls WARN_ON_IN_IRQ() on some (all?) architectures (x86-64 is affected). Example warning/backtrace is shown below: WARNING: CPU: 1 PID: 3144 at lib/usercopy.c:11 _copy_from_user+0x85/0x90 ... Call Trace: ipv6_renew_option+0xb2/0xf0 ipv6_renew_options+0x26a/0x340 ipv6_renew_options_kern+0x2c/0x40 calipso_req_setattr+0x72/0xe0 netlbl_req_setattr+0x126/0x1b0 selinux_netlbl_inet_conn_request+0x80/0x100 selinux_inet_conn_request+0x6d/0xb0 security_inet_conn_request+0x32/0x50 tcp_conn_request+0x35f/0xe00 ? __lock_acquire+0x250/0x16c0 ? selinux_socket_sock_rcv_skb+0x1ae/0x210 ? tcp_rcv_state_process+0x289/0x106b tcp_rcv_state_process+0x289/0x106b ? tcp_v6_do_rcv+0x1a7/0x3c0 tcp_v6_do_rcv+0x1a7/0x3c0 tcp_v6_rcv+0xc82/0xcf0 ip6_input_finish+0x10d/0x690 ip6_input+0x45/0x1e0 ? ip6_rcv_finish+0x1d0/0x1d0 ipv6_rcv+0x32b/0x880 ? ip6_make_skb+0x1e0/0x1e0 __netif_receive_skb_core+0x6f2/0xdf0 ? process_backlog+0x85/0x250 ? process_backlog+0x85/0x250 ? process_backlog+0xec/0x250 process_backlog+0xec/0x250 net_rx_action+0x153/0x480 __do_softirq+0xd9/0x4f7 do_softirq_own_stack+0x2a/0x40 ... While not present in the backtrace, ipv6_renew_option() ends up calling access_ok() via the following chain: access_ok() _copy_from_user() copy_from_user() ipv6_renew_option() The fix presented in this patch is to perform the userspace copy earlier in the call chain such that it is only called when the option data is actually coming from userspace; that place is do_ipv6_setsockopt(). Not only does this solve the problem seen in the backtrace above, it also allows us to simplify the code quite a bit by removing ipv6_renew_options_kern() completely. We also take this opportunity to cleanup ipv6_renew_options()/ipv6_renew_option() a small amount as well. This patch is heavily based on a rough patch by Al Viro. I've taken his original patch, converted a kmemdup() call in do_ipv6_setsockopt() to a memdup_user() call, made better use of the e_inval jump target in the same function, and cleaned up the use ipv6_renew_option() by ipv6_renew_options(). CC: Al Viro Signed-off-by: Paul Moore Signed-off-by: David S. Miller --- include/net/ipv6.h | 9 +--- net/ipv6/calipso.c | 9 ++-- net/ipv6/exthdrs.c | 111 +++++++++++++---------------------------------- net/ipv6/ipv6_sockglue.c | 27 ++++++++---- 4 files changed, 53 insertions(+), 103 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 16475c269749..d02881e4ad1f 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -355,14 +355,7 @@ struct ipv6_txoptions *ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, - struct ipv6_opt_hdr __user *newopt, - int newoptlen); -struct ipv6_txoptions * -ipv6_renew_options_kern(struct sock *sk, - struct ipv6_txoptions *opt, - int newtype, - struct ipv6_opt_hdr *newopt, - int newoptlen); + struct ipv6_opt_hdr *newopt); struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt); diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index 1323b9679cf7..1c0bb9fb76e6 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -799,8 +799,7 @@ static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop) { struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts; - txopts = ipv6_renew_options_kern(sk, old, IPV6_HOPOPTS, - hop, hop ? ipv6_optlen(hop) : 0); + txopts = ipv6_renew_options(sk, old, IPV6_HOPOPTS, hop); txopt_put(old); if (IS_ERR(txopts)) return PTR_ERR(txopts); @@ -1222,8 +1221,7 @@ static int calipso_req_setattr(struct request_sock *req, if (IS_ERR(new)) return PTR_ERR(new); - txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, - new, new ? ipv6_optlen(new) : 0); + txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new); kfree(new); @@ -1260,8 +1258,7 @@ static void calipso_req_delattr(struct request_sock *req) if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new)) return; /* Nothing to do */ - txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, - new, new ? ipv6_optlen(new) : 0); + txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new); if (!IS_ERR(txopts)) { txopts = xchg(&req_inet->ipv6_opt, txopts); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 5bc2bf3733ab..20291c2036fc 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -1015,29 +1015,21 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) } EXPORT_SYMBOL_GPL(ipv6_dup_options); -static int ipv6_renew_option(void *ohdr, - struct ipv6_opt_hdr __user *newopt, int newoptlen, - int inherit, - struct ipv6_opt_hdr **hdr, - char **p) +static void ipv6_renew_option(int renewtype, + struct ipv6_opt_hdr **dest, + struct ipv6_opt_hdr *old, + struct ipv6_opt_hdr *new, + int newtype, char **p) { - if (inherit) { - if (ohdr) { - memcpy(*p, ohdr, ipv6_optlen((struct ipv6_opt_hdr *)ohdr)); - *hdr = (struct ipv6_opt_hdr *)*p; - *p += CMSG_ALIGN(ipv6_optlen(*hdr)); - } - } else { - if (newopt) { - if (copy_from_user(*p, newopt, newoptlen)) - return -EFAULT; - *hdr = (struct ipv6_opt_hdr *)*p; - if (ipv6_optlen(*hdr) > newoptlen) - return -EINVAL; - *p += CMSG_ALIGN(newoptlen); - } - } - return 0; + struct ipv6_opt_hdr *src; + + src = (renewtype == newtype ? new : old); + if (!src) + return; + + memcpy(*p, src, ipv6_optlen(src)); + *dest = (struct ipv6_opt_hdr *)*p; + *p += CMSG_ALIGN(ipv6_optlen(*dest)); } /** @@ -1063,13 +1055,11 @@ static int ipv6_renew_option(void *ohdr, */ struct ipv6_txoptions * ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, - int newtype, - struct ipv6_opt_hdr __user *newopt, int newoptlen) + int newtype, struct ipv6_opt_hdr *newopt) { int tot_len = 0; char *p; struct ipv6_txoptions *opt2; - int err; if (opt) { if (newtype != IPV6_HOPOPTS && opt->hopopt) @@ -1082,8 +1072,8 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt)); } - if (newopt && newoptlen) - tot_len += CMSG_ALIGN(newoptlen); + if (newopt) + tot_len += CMSG_ALIGN(ipv6_optlen(newopt)); if (!tot_len) return NULL; @@ -1098,29 +1088,19 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, opt2->tot_len = tot_len; p = (char *)(opt2 + 1); - err = ipv6_renew_option(opt ? opt->hopopt : NULL, newopt, newoptlen, - newtype != IPV6_HOPOPTS, - &opt2->hopopt, &p); - if (err) - goto out; - - err = ipv6_renew_option(opt ? opt->dst0opt : NULL, newopt, newoptlen, - newtype != IPV6_RTHDRDSTOPTS, - &opt2->dst0opt, &p); - if (err) - goto out; - - err = ipv6_renew_option(opt ? opt->srcrt : NULL, newopt, newoptlen, - newtype != IPV6_RTHDR, - (struct ipv6_opt_hdr **)&opt2->srcrt, &p); - if (err) - goto out; - - err = ipv6_renew_option(opt ? opt->dst1opt : NULL, newopt, newoptlen, - newtype != IPV6_DSTOPTS, - &opt2->dst1opt, &p); - if (err) - goto out; + ipv6_renew_option(IPV6_HOPOPTS, &opt2->hopopt, + (opt ? opt->hopopt : NULL), + newopt, newtype, &p); + ipv6_renew_option(IPV6_RTHDRDSTOPTS, &opt2->dst0opt, + (opt ? opt->dst0opt : NULL), + newopt, newtype, &p); + ipv6_renew_option(IPV6_RTHDR, + (struct ipv6_opt_hdr **)&opt2->srcrt, + (opt ? (struct ipv6_opt_hdr *)opt->srcrt : NULL), + newopt, newtype, &p); + ipv6_renew_option(IPV6_DSTOPTS, &opt2->dst1opt, + (opt ? opt->dst1opt : NULL), + newopt, newtype, &p); opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) + (opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) + @@ -1128,37 +1108,6 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0); return opt2; -out: - sock_kfree_s(sk, opt2, opt2->tot_len); - return ERR_PTR(err); -} - -/** - * ipv6_renew_options_kern - replace a specific ext hdr with a new one. - * - * @sk: sock from which to allocate memory - * @opt: original options - * @newtype: option type to replace in @opt - * @newopt: new option of type @newtype to replace (kernel-mem) - * @newoptlen: length of @newopt - * - * See ipv6_renew_options(). The difference is that @newopt is - * kernel memory, rather than user memory. - */ -struct ipv6_txoptions * -ipv6_renew_options_kern(struct sock *sk, struct ipv6_txoptions *opt, - int newtype, struct ipv6_opt_hdr *newopt, - int newoptlen) -{ - struct ipv6_txoptions *ret_val; - const mm_segment_t old_fs = get_fs(); - - set_fs(KERNEL_DS); - ret_val = ipv6_renew_options(sk, opt, newtype, - (struct ipv6_opt_hdr __user *)newopt, - newoptlen); - set_fs(old_fs); - return ret_val; } struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 4d780c7f0130..c95c3486d904 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -398,6 +398,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, case IPV6_DSTOPTS: { struct ipv6_txoptions *opt; + struct ipv6_opt_hdr *new = NULL; + + /* hop-by-hop / destination options are privileged option */ + retv = -EPERM; + if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) + break; /* remove any sticky options header with a zero option * length, per RFC3542. @@ -409,17 +415,22 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, else if (optlen < sizeof(struct ipv6_opt_hdr) || optlen & 0x7 || optlen > 8 * 255) goto e_inval; - - /* hop-by-hop / destination options are privileged option */ - retv = -EPERM; - if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) - break; + else { + new = memdup_user(optval, optlen); + if (IS_ERR(new)) { + retv = PTR_ERR(new); + break; + } + if (unlikely(ipv6_optlen(new) > optlen)) { + kfree(new); + goto e_inval; + } + } opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); - opt = ipv6_renew_options(sk, opt, optname, - (struct ipv6_opt_hdr __user *)optval, - optlen); + opt = ipv6_renew_options(sk, opt, optname, new); + kfree(new); if (IS_ERR(opt)) { retv = PTR_ERR(opt); break; -- cgit v1.2.3 From fdf5fd3975666804118e62c69de25dc85cc0909c Mon Sep 17 00:00:00 2001 From: Arun Kumar Neelakantam Date: Wed, 4 Jul 2018 19:49:32 +0530 Subject: net: qrtr: Broadcast messages only from control port The broadcast node id should only be sent with the control port id. Signed-off-by: Arun Kumar Neelakantam Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 2aa07b547b16..7ffc9a3a7dd4 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -764,6 +764,10 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) node = NULL; if (addr->sq_node == QRTR_NODE_BCAST) { enqueue_fn = qrtr_bcast_enqueue; + if (addr->sq_port != QRTR_PORT_CTRL) { + release_sock(sk); + return -ENOTCONN; + } } else if (addr->sq_node == ipc->us.sq_node) { enqueue_fn = qrtr_local_enqueue; } else { -- cgit v1.2.3 From d27e77a3de2866b0a772803fd03cd667b5ff8a9a Mon Sep 17 00:00:00 2001 From: Arun Kumar Neelakantam Date: Wed, 4 Jul 2018 19:49:33 +0530 Subject: net: qrtr: Reset the node and port ID of broadcast messages All the control messages broadcast to remote routers are using QRTR_NODE_BCAST instead of using local router NODE ID which cause the packets to be dropped on remote router due to invalid NODE ID. Signed-off-by: Arun Kumar Neelakantam Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 7ffc9a3a7dd4..86e1e37eb4e8 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -191,8 +191,13 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, hdr->type = cpu_to_le32(type); hdr->src_node_id = cpu_to_le32(from->sq_node); hdr->src_port_id = cpu_to_le32(from->sq_port); - hdr->dst_node_id = cpu_to_le32(to->sq_node); - hdr->dst_port_id = cpu_to_le32(to->sq_port); + if (to->sq_port == QRTR_PORT_CTRL) { + hdr->dst_node_id = cpu_to_le32(node->nid); + hdr->dst_port_id = cpu_to_le32(QRTR_NODE_BCAST); + } else { + hdr->dst_node_id = cpu_to_le32(to->sq_node); + hdr->dst_port_id = cpu_to_le32(to->sq_port); + } hdr->size = cpu_to_le32(len); hdr->confirm_rx = 0; -- cgit v1.2.3 From 70ba5b6db96ff7324b8cfc87e0d0383cf59c9677 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 5 Jul 2018 18:49:23 +0000 Subject: ipv4: Return EINVAL when ping_group_range sysctl doesn't map to user ns The low and high values of the net.ipv4.ping_group_range sysctl were being silently forced to the default disabled state when a write to the sysctl contained GIDs that didn't map to the associated user namespace. Confusingly, the sysctl's write operation would return success and then a subsequent read of the sysctl would indicate that the low and high values are the overflowgid. This patch changes the behavior by clearly returning an error when the sysctl write operation receives a GID range that doesn't map to the associated user namespace. In such a situation, the previous value of the sysctl is preserved and that range will be returned in a subsequent read of the sysctl. Signed-off-by: Tyler Hicks Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index af0a857d8352..5fa335fd3852 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -189,8 +189,9 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, if (write && ret == 0) { low = make_kgid(user_ns, urange[0]); high = make_kgid(user_ns, urange[1]); - if (!gid_valid(low) || !gid_valid(high) || - (urange[1] < urange[0]) || gid_lt(high, low)) { + if (!gid_valid(low) || !gid_valid(high)) + return -EINVAL; + if (urange[1] < urange[0] || gid_lt(high, low)) { low = make_kgid(&init_user_ns, 1); high = make_kgid(&init_user_ns, 0); } -- cgit v1.2.3 From 5711b4e89319c2912f20b2a4f371c1525fc9551d Mon Sep 17 00:00:00 2001 From: Máté Eckl Date: Thu, 5 Jul 2018 12:01:53 +0200 Subject: netfilter: nf_tproxy: fix possible non-linear access to transport header This patch fixes a silent out-of-bound read possibility that was present because of the misuse of this function. Mostly it was called with a struct udphdr *hp which had only the udphdr part linearized by the skb_header_pointer, however nf_tproxy_get_sock_v{4,6} uses it as a tcphdr pointer, so some reads for tcp specific attributes may be invalid. Fixes: a583636a83ea ("inet: refactor inet[6]_lookup functions to take skb") Signed-off-by: Máté Eckl Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tproxy.h | 4 ++-- net/ipv4/netfilter/nf_tproxy_ipv4.c | 18 ++++++++++++------ net/ipv6/netfilter/nf_tproxy_ipv6.c | 18 ++++++++++++------ net/netfilter/xt_TPROXY.c | 8 ++++---- 4 files changed, 30 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tproxy.h b/include/net/netfilter/nf_tproxy.h index 9754a50ecde9..4cc64c8446eb 100644 --- a/include/net/netfilter/nf_tproxy.h +++ b/include/net/netfilter/nf_tproxy.h @@ -64,7 +64,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, * belonging to established connections going through that one. */ struct sock * -nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, +nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, @@ -103,7 +103,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, struct sock *sk); struct sock * -nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, +nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, const u8 protocol, const struct in6_addr *saddr, const struct in6_addr *daddr, const __be16 sport, const __be16 dport, diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index 805e83ec3ad9..164714104965 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -37,7 +37,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, + sk2 = nf_tproxy_get_sock_v4(net, skb, iph->protocol, iph->saddr, laddr ? laddr : iph->daddr, hp->source, lport ? lport : hp->dest, skb->dev, NF_TPROXY_LOOKUP_LISTENER); @@ -71,7 +71,7 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) EXPORT_SYMBOL_GPL(nf_tproxy_laddr4); struct sock * -nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, +nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, @@ -79,16 +79,21 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, const enum nf_tproxy_lookup_t lookup_type) { struct sock *sk; - struct tcphdr *tcph; switch (protocol) { - case IPPROTO_TCP: + case IPPROTO_TCP: { + struct tcphdr _hdr, *hp; + + hp = skb_header_pointer(skb, ip_hdrlen(skb), + sizeof(struct tcphdr), &_hdr); + if (hp == NULL) + return NULL; + switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - tcph = hp; sk = inet_lookup_listener(net, &tcp_hashinfo, skb, ip_hdrlen(skb) + - __tcp_hdrlen(tcph), + __tcp_hdrlen(hp), saddr, sport, daddr, dport, in->ifindex, 0); @@ -110,6 +115,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp, BUG(); } break; + } case IPPROTO_UDP: sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, in->ifindex); diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c index bf1d6c421e3b..5dfd33af6451 100644 --- a/net/ipv6/netfilter/nf_tproxy_ipv6.c +++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c @@ -55,7 +55,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, * to a listener socket if there's one */ struct sock *sk2; - sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, hp, tproto, + sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, tproto, &iph->saddr, nf_tproxy_laddr6(skb, laddr, &iph->daddr), hp->source, @@ -72,7 +72,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait6); struct sock * -nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, +nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, const u8 protocol, const struct in6_addr *saddr, const struct in6_addr *daddr, const __be16 sport, const __be16 dport, @@ -80,15 +80,20 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, const enum nf_tproxy_lookup_t lookup_type) { struct sock *sk; - struct tcphdr *tcph; switch (protocol) { - case IPPROTO_TCP: + case IPPROTO_TCP: { + struct tcphdr _hdr, *hp; + + hp = skb_header_pointer(skb, thoff, + sizeof(struct tcphdr), &_hdr); + if (hp == NULL) + return NULL; + switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - tcph = hp; sk = inet6_lookup_listener(net, &tcp_hashinfo, skb, - thoff + __tcp_hdrlen(tcph), + thoff + __tcp_hdrlen(hp), saddr, sport, daddr, ntohs(dport), in->ifindex, 0); @@ -110,6 +115,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp, BUG(); } break; + } case IPPROTO_UDP: sk = udp6_lib_lookup(net, saddr, sport, daddr, dport, in->ifindex); diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 58fce4e749a9..d76550a8b642 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -61,7 +61,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, + sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol, iph->saddr, iph->daddr, hp->source, hp->dest, skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED); @@ -77,7 +77,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport, else if (!sk) /* no, there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol, + sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol, iph->saddr, laddr, hp->source, lport, skb->dev, NF_TPROXY_LOOKUP_LISTENER); @@ -150,7 +150,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) * addresses, this happens if the redirect already happened * and the current packet belongs to an already established * connection */ - sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto, + sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, tproto, &iph->saddr, &iph->daddr, hp->source, hp->dest, xt_in(par), NF_TPROXY_LOOKUP_ESTABLISHED); @@ -171,7 +171,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) else if (!sk) /* no there's no established connection, check if * there's a listener on the redirected addr/port */ - sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, + sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, tproto, &iph->saddr, laddr, hp->source, lport, xt_in(par), NF_TPROXY_LOOKUP_LISTENER); -- cgit v1.2.3 From e240cd0df48185a28c153f83a39ba3940e3e9b86 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 6 Jul 2018 19:06:43 +0200 Subject: netfilter: nf_tables: place all set backends in one single module This patch disallows rbtree with single elements, which is causing problems with the recent timeout support. Before this patch, you could opt out individual set representations per module, which is just adding extra complexity. Fixes: 8d8540c4f5e0("netfilter: nft_set_rbtree: add timeout support") Reported-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 6 ++++++ net/netfilter/Kconfig | 25 +++++++------------------ net/netfilter/Makefile | 7 ++++--- net/netfilter/nf_tables_set_core.c | 28 ++++++++++++++++++++++++++++ net/netfilter/nft_set_bitmap.c | 19 +------------------ net/netfilter/nft_set_hash.c | 29 +++-------------------------- net/netfilter/nft_set_rbtree.c | 19 +------------------ 7 files changed, 50 insertions(+), 83 deletions(-) create mode 100644 net/netfilter/nf_tables_set_core.c (limited to 'net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index e0c0c2558ec4..a05134507e7b 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -65,4 +65,10 @@ extern const struct nft_expr_ops nft_payload_fast_ops; extern struct static_key_false nft_counters_enabled; extern struct static_key_false nft_trace_enabled; +extern struct nft_set_type nft_set_rhash_type; +extern struct nft_set_type nft_set_hash_type; +extern struct nft_set_type nft_set_hash_fast_type; +extern struct nft_set_type nft_set_rbtree_type; +extern struct nft_set_type nft_set_bitmap_type; + #endif /* _NET_NF_TABLES_CORE_H */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index dbd7d1fad277..f0a1c536ef15 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -460,6 +460,13 @@ config NF_TABLES if NF_TABLES +config NF_TABLES_SET + tristate "Netfilter nf_tables set infrastructure" + help + This option enables the nf_tables set infrastructure that allows to + look up for elements in a set and to build one-way mappings between + matchings and actions. + config NF_TABLES_INET depends on IPV6 select NF_TABLES_IPV4 @@ -493,24 +500,6 @@ config NFT_FLOW_OFFLOAD This option adds the "flow_offload" expression that you can use to choose what flows are placed into the hardware. -config NFT_SET_RBTREE - tristate "Netfilter nf_tables rbtree set module" - help - This option adds the "rbtree" set type (Red Black tree) that is used - to build interval-based sets. - -config NFT_SET_HASH - tristate "Netfilter nf_tables hash set module" - help - This option adds the "hash" set type that is used to build one-way - mappings between matchings and actions. - -config NFT_SET_BITMAP - tristate "Netfilter nf_tables bitmap set module" - help - This option adds the "bitmap" set type that is used to build sets - whose keys are smaller or equal to 16 bits. - config NFT_COUNTER tristate "Netfilter nf_tables counter module" help diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 44449389e527..8a76dced974d 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -78,7 +78,11 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o +nf_tables_set-objs := nf_tables_set_core.o \ + nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o + obj-$(CONFIG_NF_TABLES) += nf_tables.o +obj-$(CONFIG_NF_TABLES_SET) += nf_tables_set.o obj-$(CONFIG_NFT_COMPAT) += nft_compat.o obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o @@ -91,9 +95,6 @@ obj-$(CONFIG_NFT_QUEUE) += nft_queue.o obj-$(CONFIG_NFT_QUOTA) += nft_quota.o obj-$(CONFIG_NFT_REJECT) += nft_reject.o obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o -obj-$(CONFIG_NFT_SET_RBTREE) += nft_set_rbtree.o -obj-$(CONFIG_NFT_SET_HASH) += nft_set_hash.o -obj-$(CONFIG_NFT_SET_BITMAP) += nft_set_bitmap.o obj-$(CONFIG_NFT_COUNTER) += nft_counter.o obj-$(CONFIG_NFT_LOG) += nft_log.o obj-$(CONFIG_NFT_MASQ) += nft_masq.o diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c new file mode 100644 index 000000000000..814789644bd3 --- /dev/null +++ b/net/netfilter/nf_tables_set_core.c @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include + +static int __init nf_tables_set_module_init(void) +{ + nft_register_set(&nft_set_hash_fast_type); + nft_register_set(&nft_set_hash_type); + nft_register_set(&nft_set_rhash_type); + nft_register_set(&nft_set_bitmap_type); + nft_register_set(&nft_set_rbtree_type); + + return 0; +} + +static void __exit nf_tables_set_module_exit(void) +{ + nft_unregister_set(&nft_set_rbtree_type); + nft_unregister_set(&nft_set_bitmap_type); + nft_unregister_set(&nft_set_rhash_type); + nft_unregister_set(&nft_set_hash_type); + nft_unregister_set(&nft_set_hash_fast_type); +} + +module_init(nf_tables_set_module_init); +module_exit(nf_tables_set_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index d6626e01c7ee..128bc16f52dd 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -296,7 +296,7 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, return true; } -static struct nft_set_type nft_bitmap_type __read_mostly = { +struct nft_set_type nft_set_bitmap_type __read_mostly = { .owner = THIS_MODULE, .ops = { .privsize = nft_bitmap_privsize, @@ -314,20 +314,3 @@ static struct nft_set_type nft_bitmap_type __read_mostly = { .get = nft_bitmap_get, }, }; - -static int __init nft_bitmap_module_init(void) -{ - return nft_register_set(&nft_bitmap_type); -} - -static void __exit nft_bitmap_module_exit(void) -{ - nft_unregister_set(&nft_bitmap_type); -} - -module_init(nft_bitmap_module_init); -module_exit(nft_bitmap_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Pablo Neira Ayuso "); -MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 6f9a1365a09f..72ef35b51cac 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -654,7 +654,7 @@ static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features return true; } -static struct nft_set_type nft_rhash_type __read_mostly = { +struct nft_set_type nft_set_rhash_type __read_mostly = { .owner = THIS_MODULE, .features = NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT | NFT_SET_EVAL, @@ -677,7 +677,7 @@ static struct nft_set_type nft_rhash_type __read_mostly = { }, }; -static struct nft_set_type nft_hash_type __read_mostly = { +struct nft_set_type nft_set_hash_type __read_mostly = { .owner = THIS_MODULE, .features = NFT_SET_MAP | NFT_SET_OBJECT, .ops = { @@ -697,7 +697,7 @@ static struct nft_set_type nft_hash_type __read_mostly = { }, }; -static struct nft_set_type nft_hash_fast_type __read_mostly = { +struct nft_set_type nft_set_hash_fast_type __read_mostly = { .owner = THIS_MODULE, .features = NFT_SET_MAP | NFT_SET_OBJECT, .ops = { @@ -716,26 +716,3 @@ static struct nft_set_type nft_hash_fast_type __read_mostly = { .get = nft_hash_get, }, }; - -static int __init nft_hash_module_init(void) -{ - if (nft_register_set(&nft_hash_fast_type) || - nft_register_set(&nft_hash_type) || - nft_register_set(&nft_rhash_type)) - return 1; - return 0; -} - -static void __exit nft_hash_module_exit(void) -{ - nft_unregister_set(&nft_rhash_type); - nft_unregister_set(&nft_hash_type); - nft_unregister_set(&nft_hash_fast_type); -} - -module_init(nft_hash_module_init); -module_exit(nft_hash_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy "); -MODULE_ALIAS_NFT_SET(); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 7f3a9a211034..1f8f257cb518 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -462,7 +462,7 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, return true; } -static struct nft_set_type nft_rbtree_type __read_mostly = { +struct nft_set_type nft_set_rbtree_type __read_mostly = { .owner = THIS_MODULE, .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { @@ -481,20 +481,3 @@ static struct nft_set_type nft_rbtree_type __read_mostly = { .get = nft_rbtree_get, }, }; - -static int __init nft_rbtree_module_init(void) -{ - return nft_register_set(&nft_rbtree_type); -} - -static void __exit nft_rbtree_module_exit(void) -{ - nft_unregister_set(&nft_rbtree_type); -} - -module_init(nft_rbtree_module_init); -module_exit(nft_rbtree_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy "); -MODULE_ALIAS_NFT_SET(); -- cgit v1.2.3 From 2a57f182420174c7fd4b19db979a2d135231a963 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 6 Jul 2018 20:10:03 +0200 Subject: tipc: fix wrong return value from function tipc_node_try_addr() The function for checking if there is an node address conflict is supposed to return a suggestion for a new address if it finds a conflict, and zero otherwise. But in case the peer being checked is previously unknown it does instead return a "suggestion" for the checked address itself. This results in a DSC_TRIAL_FAIL_MSG being sent unecessarily to the peer, and sometimes makes the trial period starting over again. Fixes: 25b0b9c4e835 ("tipc: handle collisions of 32-bit node address hash values") Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/node.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 6a44eb812baf..0453bd451ce8 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -797,6 +797,7 @@ static u32 tipc_node_suggest_addr(struct net *net, u32 addr) } /* tipc_node_try_addr(): Check if addr can be used by peer, suggest other if not + * Returns suggested address if any, otherwise 0 */ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) { @@ -819,12 +820,14 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) if (n) { addr = n->addr; tipc_node_put(n); + return addr; } - /* Even this node may be in trial phase */ + + /* Even this node may be in conflict */ if (tn->trial_addr == addr) return tipc_node_suggest_addr(net, addr); - return addr; + return 0; } void tipc_node_check_dest(struct net *net, u32 addr, -- cgit v1.2.3 From e415577f57f4452150642500364cbe5fa6112813 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 6 Jul 2018 20:10:04 +0200 Subject: tipc: correct discovery message handling during address trial period With the duplicate address discovery protocol for tipc nodes addresses we introduced a one second trial period before a node is allocated a hash number to use as address. Unfortunately, we miss to handle the case when a regular LINK REQUEST/ RESPONSE arrives from a cluster node during the trial period. Such messages are not ignored as they should be, leading to links setup attempts while the node still has no address. Fixes: 25b0b9c4e835 ("tipc: handle collisions of 32-bit node address hash values") Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/discover.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 9f666e0650e2..dcadc10dffd1 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -133,6 +133,8 @@ static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr, } /* tipc_disc_addr_trial(): - handle an address uniqueness trial from peer + * Returns true if message should be dropped by caller, i.e., if it is a + * trial message or we are inside trial period. Otherwise false. */ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, struct tipc_media_addr *maddr, @@ -168,8 +170,9 @@ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); } + /* Accept regular link requests/responses only after trial period */ if (mtyp != DSC_TRIAL_MSG) - return false; + return trial; sugg_addr = tipc_node_try_addr(net, peer_id, src); if (sugg_addr) -- cgit v1.2.3 From 92018c7ca959ccd346d6235dac03cf7fc1ba51f7 Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 6 Jul 2018 20:10:05 +0200 Subject: tipc: fix correct setting of message type in second discoverer The duplicate address discovery protocol is not safe against two discoverers running in parallel. The one executing first after the trial period is over will set the node address and change its own message type to DSC_REQ_MSG. The one executing last may find that the node address is already set, and never change message type, with the result that its links may never be established. In this commmit we ensure that the message type always is set correctly after the trial period is over. Fixes: 25b0b9c4e835 ("tipc: handle collisions of 32-bit node address hash values") Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/discover.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/tipc/discover.c b/net/tipc/discover.c index dcadc10dffd1..2830709957bd 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -287,7 +287,6 @@ static void tipc_disc_timeout(struct timer_list *t) { struct tipc_discoverer *d = from_timer(d, t, timer); struct tipc_net *tn = tipc_net(d->net); - u32 self = tipc_own_addr(d->net); struct tipc_media_addr maddr; struct sk_buff *skb = NULL; struct net *net = d->net; @@ -301,12 +300,14 @@ static void tipc_disc_timeout(struct timer_list *t) goto exit; } - /* Did we just leave the address trial period ? */ - if (!self && !time_before(jiffies, tn->addr_trial_end)) { - self = tn->trial_addr; - tipc_net_finalize(net, self); - msg_set_prevnode(buf_msg(d->skb), self); + /* Trial period over ? */ + if (!time_before(jiffies, tn->addr_trial_end)) { + /* Did we just leave it ? */ + if (!tipc_own_addr(net)) + tipc_net_finalize(net, tn->trial_addr); + msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); + msg_set_prevnode(buf_msg(d->skb), tipc_own_addr(net)); } /* Adjust timeout interval according to discovery phase */ -- cgit v1.2.3 From 9faa89d4ed9d7d326f4763d262842270450f9b1f Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Fri, 6 Jul 2018 20:10:06 +0200 Subject: tipc: make function tipc_net_finalize() thread safe The setting of the node address is not thread safe, meaning that two discoverers may decide to set it simultanously, with a duplicate entry in the name table as result. We fix that with this commit. Fixes: 25b0b9c4e835 ("tipc: handle collisions of 32-bit node address hash values") Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/net.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/tipc/net.c b/net/tipc/net.c index 4fbaa0464405..a7f6964c3a4b 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -121,12 +121,17 @@ int tipc_net_init(struct net *net, u8 *node_id, u32 addr) void tipc_net_finalize(struct net *net, u32 addr) { - tipc_set_node_addr(net, addr); - smp_mb(); - tipc_named_reinit(net); - tipc_sk_reinit(net); - tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, - TIPC_CLUSTER_SCOPE, 0, addr); + struct tipc_net *tn = tipc_net(net); + + spin_lock_bh(&tn->node_list_lock); + if (!tipc_own_addr(net)) { + tipc_set_node_addr(net, addr); + tipc_named_reinit(net); + tipc_sk_reinit(net); + tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr, + TIPC_CLUSTER_SCOPE, 0, addr); + } + spin_unlock_bh(&tn->node_list_lock); } void tipc_net_stop(struct net *net) -- cgit v1.2.3 From e1bbdd57047454dad068dc36612dd60a57f4c58f Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 5 Jul 2018 16:15:30 +0200 Subject: net/smc: reduce sock_put() for fallback sockets smc_release() calls a sock_put() for smc fallback sockets to cover the passive closing sock_hold() in __smc_connect() and smc_tcp_listen_work(). This does not make sense for sockets in state SMC_LISTEN and SMC_INIT. An SMC socket stays in state SMC_INIT if connect fails. The sock_put in smc_connect_abort() does not cover all failures. Move it into smc_connect_decline_fallback(). Fixes: ee9dfbef02d18 ("net/smc: handle sockopts forcing fallback") Reported-by: syzbot+3a0748c8f2f210c0ef9b@syzkaller.appspotmail.com Reported-by: syzbot+9e60d2428a42049a592a@syzkaller.appspotmail.com Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 15 ++++++++++----- net/smc/smc_close.c | 2 ++ 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e017b6a4452b..5334157f5065 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -147,7 +147,8 @@ static int smc_release(struct socket *sock) smc->clcsock = NULL; } if (smc->use_fallback) { - sock_put(sk); /* passive closing */ + if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT) + sock_put(sk); /* passive closing */ sk->sk_state = SMC_CLOSED; sk->sk_state_change(sk); } @@ -417,12 +418,18 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code) { int rc; - if (reason_code < 0) /* error, fallback is not possible */ + if (reason_code < 0) { /* error, fallback is not possible */ + if (smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ return reason_code; + } if (reason_code != SMC_CLC_DECL_REPLY) { rc = smc_clc_send_decline(smc, reason_code); - if (rc < 0) + if (rc < 0) { + if (smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ return rc; + } } return smc_connect_fallback(smc); } @@ -435,8 +442,6 @@ static int smc_connect_abort(struct smc_sock *smc, int reason_code, smc_lgr_forget(smc->conn.lgr); mutex_unlock(&smc_create_lgr_pending); smc_conn_free(&smc->conn); - if (reason_code < 0 && smc->sk.sk_state == SMC_INIT) - sock_put(&smc->sk); /* passive closing */ return reason_code; } diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index fa41d9881741..ac961dfb1ea1 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -107,6 +107,8 @@ static void smc_close_active_abort(struct smc_sock *smc) } switch (sk->sk_state) { case SMC_INIT: + sk->sk_state = SMC_PEERABORTWAIT; + break; case SMC_ACTIVE: sk->sk_state = SMC_PEERABORTWAIT; release_sock(sk); -- cgit v1.2.3 From 11a245e2f7bf25fc21f47e4c9c8491841b128890 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 6 Jul 2018 21:01:05 +0200 Subject: net/sched: act_csum: fix NULL dereference when 'goto chain' is used the control action in the common member of struct tcf_csum must be a valid value, as it can contain the chain index when 'goto chain' is used. Ensure that the control action can be read as x->tcfa_action, when x is a pointer to struct tc_action and x->ops->type is TCA_ACT_CSUM, to prevent the following command: # tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \ > $tcflags dst_mac $h2mac action csum ip or tcp or udp or sctp goto chain 1 from triggering a NULL pointer dereference when a matching packet is received. BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 PGD 800000010416b067 P4D 800000010416b067 PUD 1041be067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 0 PID: 3072 Comm: mausezahn Tainted: G E 4.18.0-rc2.auguri+ #421 Hardware name: Hewlett-Packard HP Z220 CMT Workstation/1790, BIOS K51 v01.58 02/07/2013 RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffffa020dea03c40 EFLAGS: 00010246 RAX: 0000000020000001 RBX: ffffa020d7ccef00 RCX: 0000000000000054 RDX: 0000000000000000 RSI: ffffa020ca5ae000 RDI: ffffa020d7ccef00 RBP: ffffa020dea03e60 R08: 0000000000000000 R09: ffffa020dea03c9c R10: ffffa020dea03c78 R11: 0000000000000008 R12: ffffa020d3fe4f00 R13: ffffa020d3fe4f08 R14: 0000000000000001 R15: ffffa020d53ca300 FS: 00007f5a46942740(0000) GS:ffffa020dea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000104218002 CR4: 00000000001606f0 Call Trace: fl_classify+0x1ad/0x1c0 [cls_flower] ? arp_rcv+0x121/0x1b0 ? __x2apic_send_IPI_dest+0x40/0x40 ? smp_reschedule_interrupt+0x1c/0xd0 ? reschedule_interrupt+0xf/0x20 ? reschedule_interrupt+0xa/0x20 ? device_is_rmrr_locked+0xe/0x50 ? iommu_should_identity_map+0x49/0xd0 ? __intel_map_single+0x30/0x140 ? e1000e_update_rdt_wa.isra.52+0x22/0xb0 [e1000e] ? e1000_alloc_rx_buffers+0x233/0x250 [e1000e] ? kmem_cache_alloc+0x38/0x1c0 tcf_classify+0x89/0x140 __netif_receive_skb_core+0x5ea/0xb70 ? enqueue_task_fair+0xb6/0x7d0 ? process_backlog+0x97/0x150 process_backlog+0x97/0x150 net_rx_action+0x14b/0x3e0 __do_softirq+0xde/0x2b4 do_softirq_own_stack+0x2a/0x40 do_softirq.part.18+0x49/0x50 __local_bh_enable_ip+0x49/0x50 __dev_queue_xmit+0x4ab/0x8a0 ? wait_woken+0x80/0x80 ? packet_sendmsg+0x38f/0x810 ? __dev_queue_xmit+0x8a0/0x8a0 packet_sendmsg+0x38f/0x810 sock_sendmsg+0x36/0x40 __sys_sendto+0x10e/0x140 ? do_vfs_ioctl+0xa4/0x630 ? syscall_trace_enter+0x1df/0x2e0 ? __audit_syscall_exit+0x22a/0x290 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x5b/0x180 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f5a45cbec93 Code: 48 8b 0d 18 83 20 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 59 c7 20 00 00 75 13 49 89 ca b8 2c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 34 c3 48 83 ec 08 e8 2b f7 ff ff 48 89 04 24 RSP: 002b:00007ffd0ee6d748 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000001161010 RCX: 00007f5a45cbec93 RDX: 0000000000000062 RSI: 0000000001161322 RDI: 0000000000000003 RBP: 00007ffd0ee6d780 R08: 00007ffd0ee6d760 R09: 0000000000000014 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000062 R13: 0000000001161322 R14: 00007ffd0ee6d760 R15: 0000000000000003 Modules linked in: act_csum act_gact cls_flower sch_ingress vrf veth act_tunnel_key(E) xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter intel_rapl x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel snd_hda_codec_hdmi snd_hda_codec_realtek kvm snd_hda_codec_generic hp_wmi iTCO_wdt sparse_keymap rfkill mei_wdt iTCO_vendor_support wmi_bmof gpio_ich irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel snd_hda_intel crypto_simd cryptd snd_hda_codec glue_helper snd_hda_core snd_hwdep snd_seq snd_seq_device snd_pcm pcspkr i2c_i801 snd_timer snd sg lpc_ich soundcore wmi mei_me mei ie31200_edac nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sr_mod cdrom sd_mod ahci libahci crc32c_intel i915 ixgbe serio_raw libata video dca i2c_algo_bit sfc drm_kms_helper syscopyarea mtd sysfillrect mdio sysimgblt fb_sys_fops drm e1000e i2c_core CR2: 0000000000000000 ---[ end trace 3c9e9d1a77df4026 ]--- RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffffa020dea03c40 EFLAGS: 00010246 RAX: 0000000020000001 RBX: ffffa020d7ccef00 RCX: 0000000000000054 RDX: 0000000000000000 RSI: ffffa020ca5ae000 RDI: ffffa020d7ccef00 RBP: ffffa020dea03e60 R08: 0000000000000000 R09: ffffa020dea03c9c R10: ffffa020dea03c78 R11: 0000000000000008 R12: ffffa020d3fe4f00 R13: ffffa020d3fe4f08 R14: 0000000000000001 R15: ffffa020d53ca300 FS: 00007f5a46942740(0000) GS:ffffa020dea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 0000000104218002 CR4: 00000000001606f0 Kernel panic - not syncing: Fatal exception in interrupt Kernel Offset: 0x26400000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- Fixes: 9c5f69bbd75a ("net/sched: act_csum: don't use spinlock in the fast path") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- include/net/tc_act/tc_csum.h | 1 - net/sched/act_csum.c | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h index 9470fd7e4350..32d2454c0479 100644 --- a/include/net/tc_act/tc_csum.h +++ b/include/net/tc_act/tc_csum.h @@ -7,7 +7,6 @@ #include struct tcf_csum_params { - int action; u32 update_flags; struct rcu_head rcu; }; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 526a8e491626..6e7124e57918 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -91,7 +91,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, } params_old = rtnl_dereference(p->params); - params_new->action = parm->action; + p->tcf_action = parm->action; params_new->update_flags = parm->update_flags; rcu_assign_pointer(p->params, params_new); if (params_old) @@ -561,7 +561,7 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&p->tcf_tm); bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb); - action = params->action; + action = READ_ONCE(p->tcf_action); if (unlikely(action == TC_ACT_SHOT)) goto drop_stats; @@ -599,11 +599,11 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, .index = p->tcf_index, .refcnt = p->tcf_refcnt - ref, .bindcnt = p->tcf_bindcnt - bind, + .action = p->tcf_action, }; struct tcf_t t; params = rtnl_dereference(p->params); - opt.action = params->action; opt.update_flags = params->update_flags; if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) -- cgit v1.2.3 From 38230a3e0e0933bbcf5df6fa469ba0667f667568 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 6 Jul 2018 21:01:06 +0200 Subject: net/sched: act_tunnel_key: fix NULL dereference when 'goto chain' is used the control action in the common member of struct tcf_tunnel_key must be a valid value, as it can contain the chain index when 'goto chain' is used. Ensure that the control action can be read as x->tcfa_action, when x is a pointer to struct tc_action and x->ops->type is TCA_ACT_TUNNEL_KEY, to prevent the following command: # tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \ > $tcflags dst_mac $h2mac action tunnel_key unset goto chain 1 from causing a NULL dereference when a matching packet is received: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 PGD 80000001097ac067 P4D 80000001097ac067 PUD 103b0a067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 0 PID: 3491 Comm: mausezahn Tainted: G E 4.18.0-rc2.auguri+ #421 Hardware name: Hewlett-Packard HP Z220 CMT Workstation/1790, BIOS K51 v01.58 02/07/2013 RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffff95145ea03c40 EFLAGS: 00010246 RAX: 0000000020000001 RBX: ffff9514499e5800 RCX: 0000000000000001 RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000000000 RBP: ffff95145ea03e60 R08: 0000000000000000 R09: ffff95145ea03c9c R10: ffff95145ea03c78 R11: 0000000000000008 R12: ffff951456a69800 R13: ffff951456a69808 R14: 0000000000000001 R15: ffff95144965ee40 FS: 00007fd67ee11740(0000) GS:ffff95145ea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000001038a2006 CR4: 00000000001606f0 Call Trace: fl_classify+0x1ad/0x1c0 [cls_flower] ? __update_load_avg_se.isra.47+0x1ca/0x1d0 ? __update_load_avg_se.isra.47+0x1ca/0x1d0 ? update_load_avg+0x665/0x690 ? update_load_avg+0x665/0x690 ? kmem_cache_alloc+0x38/0x1c0 tcf_classify+0x89/0x140 __netif_receive_skb_core+0x5ea/0xb70 ? enqueue_entity+0xd0/0x270 ? process_backlog+0x97/0x150 process_backlog+0x97/0x150 net_rx_action+0x14b/0x3e0 __do_softirq+0xde/0x2b4 do_softirq_own_stack+0x2a/0x40 do_softirq.part.18+0x49/0x50 __local_bh_enable_ip+0x49/0x50 __dev_queue_xmit+0x4ab/0x8a0 ? wait_woken+0x80/0x80 ? packet_sendmsg+0x38f/0x810 ? __dev_queue_xmit+0x8a0/0x8a0 packet_sendmsg+0x38f/0x810 sock_sendmsg+0x36/0x40 __sys_sendto+0x10e/0x140 ? do_vfs_ioctl+0xa4/0x630 ? syscall_trace_enter+0x1df/0x2e0 ? __audit_syscall_exit+0x22a/0x290 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x5b/0x180 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7fd67e18dc93 Code: 48 8b 0d 18 83 20 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 59 c7 20 00 00 75 13 49 89 ca b8 2c 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 34 c3 48 83 ec 08 e8 2b f7 ff ff 48 89 04 24 RSP: 002b:00007ffe0189b748 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00000000020ca010 RCX: 00007fd67e18dc93 RDX: 0000000000000062 RSI: 00000000020ca322 RDI: 0000000000000003 RBP: 00007ffe0189b780 R08: 00007ffe0189b760 R09: 0000000000000014 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000062 R13: 00000000020ca322 R14: 00007ffe0189b760 R15: 0000000000000003 Modules linked in: act_tunnel_key act_gact cls_flower sch_ingress vrf veth act_csum(E) xt_CHECKSUM iptable_mangle ipt_MASQUERADE iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter intel_rapl snd_hda_codec_hdmi x86_pkg_temp_thermal intel_powerclamp snd_hda_codec_realtek coretemp snd_hda_codec_generic kvm_intel kvm irqbypass snd_hda_intel crct10dif_pclmul crc32_pclmul hp_wmi ghash_clmulni_intel pcbc snd_hda_codec aesni_intel sparse_keymap rfkill snd_hda_core snd_hwdep snd_seq crypto_simd iTCO_wdt gpio_ich iTCO_vendor_support wmi_bmof cryptd mei_wdt glue_helper snd_seq_device snd_pcm pcspkr snd_timer snd i2c_i801 lpc_ich sg soundcore wmi mei_me mei ie31200_edac nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod sr_mod cdrom i915 video i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ahci crc32c_intel libahci serio_raw sfc libata mtd drm ixgbe mdio i2c_core e1000e dca CR2: 0000000000000000 ---[ end trace 1ab8b5b5d4639dfc ]--- RIP: 0010:tcf_action_exec+0xb8/0x100 Code: 00 00 00 20 74 1d 83 f8 03 75 09 49 83 c4 08 4d 39 ec 75 bc 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 49 8b 97 a8 00 00 00 <48> 8b 12 48 89 55 00 48 83 c4 10 5b 5d 41 5c 41 5d 41 5e 41 5f c3 RSP: 0018:ffff95145ea03c40 EFLAGS: 00010246 RAX: 0000000020000001 RBX: ffff9514499e5800 RCX: 0000000000000001 RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000000000 RBP: ffff95145ea03e60 R08: 0000000000000000 R09: ffff95145ea03c9c R10: ffff95145ea03c78 R11: 0000000000000008 R12: ffff951456a69800 R13: ffff951456a69808 R14: 0000000000000001 R15: ffff95144965ee40 FS: 00007fd67ee11740(0000) GS:ffff95145ea00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000001038a2006 CR4: 00000000001606f0 Kernel panic - not syncing: Fatal exception in interrupt Kernel Offset: 0x11400000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- Fixes: d0f6dd8a914f ("net/sched: Introduce act_tunnel_key") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- include/net/tc_act/tc_tunnel_key.h | 1 - net/sched/act_tunnel_key.c | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h index efef0b4b1b2b..46b8c7f1c8d5 100644 --- a/include/net/tc_act/tc_tunnel_key.h +++ b/include/net/tc_act/tc_tunnel_key.h @@ -18,7 +18,6 @@ struct tcf_tunnel_key_params { struct rcu_head rcu; int tcft_action; - int action; struct metadata_dst *tcft_enc_metadata; }; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 626dac81a48a..9bc6c2ae98a5 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -36,7 +36,7 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&t->tcf_tm); bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb); - action = params->action; + action = READ_ONCE(t->tcf_action); switch (params->tcft_action) { case TCA_TUNNEL_KEY_ACT_RELEASE: @@ -182,7 +182,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, params_old = rtnl_dereference(t->params); - params_new->action = parm->action; + t->tcf_action = parm->action; params_new->tcft_action = parm->t_action; params_new->tcft_enc_metadata = metadata; @@ -254,13 +254,13 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, .index = t->tcf_index, .refcnt = t->tcf_refcnt - ref, .bindcnt = t->tcf_bindcnt - bind, + .action = t->tcf_action, }; struct tcf_t tm; params = rtnl_dereference(t->params); opt.t_action = params->tcft_action; - opt.action = params->action; if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt)) goto nla_put_failure; -- cgit v1.2.3 From 0c6bc6e531a6db36f49622f1f115770160f7afb0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:49:59 -0700 Subject: bpf: fix sk_skb programs without skb->dev assigned Multiple BPF helpers in use by sk_skb programs calculate the max skb length using the __bpf_skb_max_len function. However, this calculates the max length using the skb->dev pointer which can be NULL when an sk_skb program is paired with an sk_msg program. To force this a sk_msg program needs to redirect into the ingress path of a sock with an attach sk_skb program. Then the the sk_skb program would need to call one of the helpers that adjust the skb size. To fix the null ptr dereference use SKB_MAX_ALLOC size if no dev is available. Fixes: 8934ce2fd081 ("bpf: sockmap redirect ingress support") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 0ca6907d7efe..3095f1ba7015 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2779,7 +2779,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) static u32 __bpf_skb_max_len(const struct sk_buff *skb) { - return skb->dev->mtu + skb->dev->hard_header_len; + return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len : + SKB_MAX_ALLOC; } static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) -- cgit v1.2.3 From 0ea488ff8d23c93da383fcf424825c298b13b1fb Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 5 Jul 2018 08:50:15 -0700 Subject: bpf: sockmap, convert bpf_compute_data_pointers to bpf_*_sk_skb In commit 'bpf: bpf_compute_data uses incorrect cb structure' (8108a7751512) we added the routine bpf_compute_data_end_sk_skb() to compute the correct data_end values, but this has since been lost. In kernel v4.14 this was correct and the above patch was applied in it entirety. Then when v4.14 was merged into v4.15-rc1 net-next tree we lost the piece that renamed bpf_compute_data_pointers to the new function bpf_compute_data_end_sk_skb. This was done here, e1ea2f9856b7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net") When it conflicted with the following rename patch, 6aaae2b6c433 ("bpf: rename bpf_compute_data_end into bpf_compute_data_pointers") Finally, after a refactor I thought even the function bpf_compute_data_end_sk_skb() was no longer needed and it was erroneously removed. However, we never reverted the sk_skb_convert_ctx_access() usage of tcp_skb_cb which had been committed and survived the merge conflict. Here we fix this by adding back the helper and *_data_end_sk_skb() usage. Using the bpf_skc_data_end mapping is not correct because it expects a qdisc_skb_cb object but at the sock layer this is not the case. Even though it happens to work here because we don't overwrite any data in-use at the socket layer and the cb structure is cleared later this has potential to create some subtle issues. But, even more concretely the filter.c access check uses tcp_skb_cb. And by some act of chance though, struct bpf_skb_data_end { struct qdisc_skb_cb qdisc_cb; /* 0 28 */ /* XXX 4 bytes hole, try to pack */ void * data_meta; /* 32 8 */ void * data_end; /* 40 8 */ /* size: 48, cachelines: 1, members: 3 */ /* sum members: 44, holes: 1, sum holes: 4 */ /* last cacheline: 48 bytes */ }; and then tcp_skb_cb, struct tcp_skb_cb { [...] struct { __u32 flags; /* 24 4 */ struct sock * sk_redir; /* 32 8 */ void * data_end; /* 40 8 */ } bpf; /* 24 */ }; So when we use offset_of() to track down the byte offset we get 40 in either case and everything continues to work. Fix this mess and use correct structures its unclear how long this might actually work for until someone moves the structs around. Reported-by: Martin KaFai Lau Fixes: e1ea2f9856b7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net") Fixes: 6aaae2b6c433 ("bpf: rename bpf_compute_data_end into bpf_compute_data_pointers") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 4 +++ kernel/bpf/sockmap.c | 4 +-- net/core/filter.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 97 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 800582b5dd54..af3ec72d5d41 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -828,6 +828,10 @@ struct tcp_skb_cb { #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) +static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); +} #if IS_ENABLED(CONFIG_IPV6) /* This is the variant of inet6_iif() that must be used by TCP, diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index dfc8a8a07c1f..98fb7938beea 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1236,7 +1236,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) */ TCP_SKB_CB(skb)->bpf.sk_redir = NULL; skb->sk = psock->sock; - bpf_compute_data_pointers(skb); + bpf_compute_data_end_sk_skb(skb); preempt_disable(); rc = (*prog->bpf_func)(skb, prog->insnsi); preempt_enable(); @@ -1491,7 +1491,7 @@ static int smap_parse_func_strparser(struct strparser *strp, * any socket yet. */ skb->sk = psock->sock; - bpf_compute_data_pointers(skb); + bpf_compute_data_end_sk_skb(skb); rc = (*prog->bpf_func)(skb, prog->insnsi); skb->sk = NULL; rcu_read_unlock(); diff --git a/net/core/filter.c b/net/core/filter.c index 3095f1ba7015..470268024a40 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1762,6 +1762,37 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = { .arg2_type = ARG_ANYTHING, }; +static inline int sk_skb_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + int err = __bpf_try_make_writable(skb, write_len); + + bpf_compute_data_end_sk_skb(skb); + return err; +} + +BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) +{ + /* Idea is the following: should the needed direct read/write + * test fail during runtime, we can pull in more data and redo + * again, since implicitly, we invalidate previous checks here. + * + * Or, since we know how much we need to make read/writeable, + * this can be done once at the program beginning for direct + * access case. By this we overcome limitations of only current + * headroom being accessible. + */ + return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb)); +} + +static const struct bpf_func_proto sk_skb_pull_data_proto = { + .func = sk_skb_pull_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { @@ -2864,8 +2895,8 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) return __skb_trim_rcsum(skb, new_len); } -BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, - u64, flags) +static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, + u64 flags) { u32 max_len = __bpf_skb_max_len(skb); u32 min_len = __bpf_skb_min_len(skb); @@ -2901,6 +2932,13 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, if (!ret && skb_is_gso(skb)) skb_gso_reset(skb); } + return ret; +} + +BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, + u64, flags) +{ + int ret = __bpf_skb_change_tail(skb, new_len, flags); bpf_compute_data_pointers(skb); return ret; @@ -2915,8 +2953,26 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, +BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) +{ + int ret = __bpf_skb_change_tail(skb, new_len, flags); + + bpf_compute_data_end_sk_skb(skb); + return ret; +} + +static const struct bpf_func_proto sk_skb_change_tail_proto = { + .func = sk_skb_change_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, + u64 flags) { u32 max_len = __bpf_skb_max_len(skb); u32 new_len = skb->len + head_room; @@ -2942,8 +2998,16 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, skb_reset_mac_header(skb); } + return ret; +} + +BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + int ret = __bpf_skb_change_head(skb, head_room, flags); + bpf_compute_data_pointers(skb); - return 0; + return ret; } static const struct bpf_func_proto bpf_skb_change_head_proto = { @@ -2955,6 +3019,23 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, + u64, flags) +{ + int ret = __bpf_skb_change_head(skb, head_room, flags); + + bpf_compute_data_end_sk_skb(skb); + return ret; +} + +static const struct bpf_func_proto sk_skb_change_head_proto = { + .func = sk_skb_change_head, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) { return xdp_data_meta_unsupported(xdp) ? 0 : @@ -4618,9 +4699,12 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_skb_store_bytes || func == bpf_skb_change_proto || func == bpf_skb_change_head || + func == sk_skb_change_head || func == bpf_skb_change_tail || + func == sk_skb_change_tail || func == bpf_skb_adjust_room || func == bpf_skb_pull_data || + func == sk_skb_pull_data || func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || @@ -4872,11 +4956,11 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: - return &bpf_skb_pull_data_proto; + return &sk_skb_pull_data_proto; case BPF_FUNC_skb_change_tail: - return &bpf_skb_change_tail_proto; + return &sk_skb_change_tail_proto; case BPF_FUNC_skb_change_head: - return &bpf_skb_change_head_proto; + return &sk_skb_change_head_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: -- cgit v1.2.3 From d8d7218ad842e18fc6976b87c08ed749e8d56313 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 6 Jul 2018 11:49:00 +0900 Subject: xdp: XDP_REDIRECT should check IFF_UP and MTU Otherwise we end up with attempting to send packets from down devices or to send oversized packets, which may cause unexpected driver/device behaviour. Generic XDP has already done this check, so reuse the logic in native XDP. Fixes: 814abfabef3c ("xdp: add bpf_redirect helper function") Signed-off-by: Toshiaki Makita Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 6 +++--- kernel/bpf/devmap.c | 7 ++++++- net/core/filter.c | 9 +++++++-- 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/linux/filter.h b/include/linux/filter.h index 300baad62c88..c73dd7396886 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -765,8 +765,8 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); -static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, - struct net_device *fwd) +static inline int xdp_ok_fwd_dev(const struct net_device *fwd, + unsigned int pktlen) { unsigned int len; @@ -774,7 +774,7 @@ static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, return -ENETDOWN; len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; - if (skb->len > len) + if (pktlen > len) return -EMSGSIZE; return 0; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 642c97f6d1b8..d361fc1e3bf3 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -334,10 +334,15 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; struct xdp_frame *xdpf; + int err; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; + err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + if (unlikely(err)) + return err; + xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; @@ -350,7 +355,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, { int err; - err = __xdp_generic_ok_fwd_dev(skb, dst->dev); + err = xdp_ok_fwd_dev(dst->dev, skb->len); if (unlikely(err)) return err; skb->dev = dst->dev; diff --git a/net/core/filter.c b/net/core/filter.c index 470268024a40..5fa66a33927f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3128,12 +3128,16 @@ static int __bpf_tx_xdp(struct net_device *dev, u32 index) { struct xdp_frame *xdpf; - int sent; + int err, sent; if (!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; } + err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + if (unlikely(err)) + return err; + xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; @@ -3367,7 +3371,8 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, goto err; } - if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + err = xdp_ok_fwd_dev(fwd, skb->len); + if (unlikely(err)) goto err; skb->dev = fwd; -- cgit v1.2.3 From e7372197e15856ec4ee66b668020a662994db103 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 7 Jul 2018 16:15:26 -0700 Subject: net/ipv4: Set oif in fib_compute_spec_dst Xin reported that icmp replies may not use the address on the device the echo request is received if the destination address is broadcast. Instead a route lookup is done without considering VRF context. Fix by setting oif in flow struct to the master device if it is enslaved. That directs the lookup to the VRF table. If the device is not enslaved, oif is still 0 so no affect. Fixes: cd2fbe1b6b51 ("net: Use VRF device index for lookups on RX") Reported-by: Xin Long Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b21833651394..e46cdd310e5f 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -300,6 +300,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { struct flowi4 fl4 = { .flowi4_iif = LOOPBACK_IFINDEX, + .flowi4_oif = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), .flowi4_scope = scope, -- cgit v1.2.3 From acc2cf4e37174646a24cba42fa53c668b2338d4e Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Sat, 7 Jul 2018 16:31:40 +0900 Subject: net: diag: Don't double-free TCP_NEW_SYN_RECV sockets in tcp_abort When tcp_diag_destroy closes a TCP_NEW_SYN_RECV socket, it first frees it by calling inet_csk_reqsk_queue_drop_and_and_put in tcp_abort, and then frees it again by calling sock_gen_put. Since tcp_abort only has one caller, and all the other codepaths in tcp_abort don't free the socket, just remove the free in that function. Cc: David Ahern Tested: passes Android sock_diag_test.py, which exercises this codepath Fixes: d7226c7a4dd1 ("net: diag: Fix refcnt leak in error path destroying socket") Signed-off-by: Lorenzo Colitti Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Tested-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e7b53d2a971f..c959bb6ea4ed 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3720,8 +3720,7 @@ int tcp_abort(struct sock *sk, int err) struct request_sock *req = inet_reqsk(sk); local_bh_disable(); - inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, - req); + inet_csk_reqsk_queue_drop(req->rsk_listener, req); local_bh_enable(); return 0; } -- cgit v1.2.3 From f6f2a4a2eb92bc73671204198bb2f8ab53ff59fb Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 6 Jul 2018 12:30:20 +0200 Subject: ipfrag: really prevent allocation on netns exit Setting the low threshold to 0 has no effect on frags allocation, we need to clear high_thresh instead. The code was pre-existent to commit 648700f76b03 ("inet: frags: use rhashtables for reassembly units"), but before the above, such assignment had a different role: prevent concurrent eviction from the worker and the netns cleanup helper. Fixes: 648700f76b03 ("inet: frags: use rhashtables for reassembly units") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/inet_fragment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index c9e35b81d093..1e4cf3ab560f 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -90,7 +90,7 @@ static void inet_frags_free_cb(void *ptr, void *arg) void inet_frags_exit_net(struct netns_frags *nf) { - nf->low_thresh = 0; /* prevent creation of new frags */ + nf->high_thresh = 0; /* prevent creation of new frags */ rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL); } -- cgit v1.2.3 From 6508b6781be076f889e3077a1a5fadf1930a569d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Jul 2018 23:00:01 -0700 Subject: tcp: cleanup copied_seq and urg_data in tcp_disconnect tcp_zerocopy_receive() relies on tcp_inq() to limit number of bytes requested by user. syzbot found that after tcp_disconnect(), tcp_inq() was returning a stale value (number of bytes in queue before the disconnect). Note that after this patch, ioctl(fd, SIOCINQ, &val) is also fixed and returns 0, so this might be a candidate for all known linux kernels. While we are at this, we probably also should clear urg_data to avoid other syzkaller reports after it discovers how to deal with urgent data. syzkaller repro : socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 3 bind(3, {sa_family=AF_INET, sin_port=htons(20000), sin_addr=inet_addr("224.0.0.1")}, 16) = 0 connect(3, {sa_family=AF_INET, sin_port=htons(20000), sin_addr=inet_addr("127.0.0.1")}, 16) = 0 send(3, ..., 4096, 0) = 4096 connect(3, {sa_family=AF_UNSPEC, sa_data="\0\0\0\0\0\0\0\0\0\0\0\0\0\0"}, 128) = 0 getsockopt(3, SOL_TCP, TCP_ZEROCOPY_RECEIVE, ..., [16]) = 0 // CRASH Fixes: 05255b823a61 ("tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index c959bb6ea4ed..0d43705dd001 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2562,6 +2562,8 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); + tp->copied_seq = tp->rcv_nxt; + tp->urg_data = 0; tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); skb_rbtree_purge(&tp->out_of_order_queue); -- cgit v1.2.3 From 21d5e078192d244df3d6049f9464fff2f72cfd68 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 6 Jul 2018 20:06:05 +0200 Subject: netfilter: nft_compat: explicitly reject ERROR and standard target iptables-nft never requests these, but make this explicitly illegal. If it were quested, kernel could oops as ->eval is NULL, furthermore, the builtin targets have no owning module so its possible to rmmod eb/ip/ip6_tables module even if they would be loaded. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_compat.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 8d1ff654e5af..32535eea51b2 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -832,10 +832,18 @@ nft_target_select_ops(const struct nft_ctx *ctx, rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV])); family = ctx->family; + if (strcmp(tg_name, XT_ERROR_TARGET) == 0 || + strcmp(tg_name, XT_STANDARD_TARGET) == 0 || + strcmp(tg_name, "standard") == 0) + return ERR_PTR(-EINVAL); + /* Re-use the existing target if it's already loaded. */ list_for_each_entry(nft_target, &nft_target_list, head) { struct xt_target *target = nft_target->ops.data; + if (!target->target) + continue; + if (nft_target_cmp(target, tg_name, rev, family)) return &nft_target->ops; } @@ -844,6 +852,11 @@ nft_target_select_ops(const struct nft_ctx *ctx, if (IS_ERR(target)) return ERR_PTR(-ENOENT); + if (!target->target) { + err = -EINVAL; + goto err; + } + if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO])) { err = -EINVAL; goto err; -- cgit v1.2.3 From 2045cdfa1b40d66f126f3fd05604fc7c754f0022 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 6 Jul 2018 16:38:53 +0300 Subject: netfilter: nf_conntrack: Fix possible possible crash on module loading. Loading the nf_conntrack module with doubled hashsize parameter, i.e. modprobe nf_conntrack hashsize=12345 hashsize=12345 causes NULL-ptr deref. If 'hashsize' specified twice, the nf_conntrack_set_hashsize() function will be called also twice. The first nf_conntrack_set_hashsize() call will set the 'nf_conntrack_htable_size' variable: nf_conntrack_set_hashsize() ... /* On boot, we can set this without any fancy locking. */ if (!nf_conntrack_htable_size) return param_set_uint(val, kp); But on the second invocation, the nf_conntrack_htable_size is already set, so the nf_conntrack_set_hashsize() will take a different path and call the nf_conntrack_hash_resize() function. Which will crash on the attempt to dereference 'nf_conntrack_hash' pointer: BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 RIP: 0010:nf_conntrack_hash_resize+0x255/0x490 [nf_conntrack] Call Trace: nf_conntrack_set_hashsize+0xcd/0x100 [nf_conntrack] parse_args+0x1f9/0x5a0 load_module+0x1281/0x1a50 __se_sys_finit_module+0xbe/0xf0 do_syscall_64+0x7c/0x390 entry_SYSCALL_64_after_hwframe+0x49/0xbe Fix this, by checking !nf_conntrack_hash instead of !nf_conntrack_htable_size. nf_conntrack_hash will be initialized only after the module loaded, so the second invocation of the nf_conntrack_set_hashsize() won't crash, it will just reinitialize nf_conntrack_htable_size again. Signed-off-by: Andrey Ryabinin Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3465da2a98bd..3d5280425027 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2043,7 +2043,7 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) return -EOPNOTSUPP; /* On boot, we can set this without any fancy locking. */ - if (!nf_conntrack_htable_size) + if (!nf_conntrack_hash) return param_set_uint(val, kp); rc = kstrtouint(val, 0, &hashsize); -- cgit v1.2.3 From 84379c9afe011020e797e3f50a662b08a6355dcf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 9 Jul 2018 13:43:38 +0200 Subject: netfilter: ipv6: nf_defrag: drop skb dst before queueing Eric Dumazet reports: Here is a reproducer of an annoying bug detected by syzkaller on our production kernel [..] ./b78305423 enable_conntrack Then : sleep 60 dmesg | tail -10 [ 171.599093] unregister_netdevice: waiting for lo to become free. Usage count = 2 [ 181.631024] unregister_netdevice: waiting for lo to become free. Usage count = 2 [ 191.687076] unregister_netdevice: waiting for lo to become free. Usage count = 2 [ 201.703037] unregister_netdevice: waiting for lo to become free. Usage count = 2 [ 211.711072] unregister_netdevice: waiting for lo to become free. Usage count = 2 [ 221.959070] unregister_netdevice: waiting for lo to become free. Usage count = 2 Reproducer sends ipv6 fragment that hits nfct defrag via LOCAL_OUT hook. skb gets queued until frag timer expiry -- 1 minute. Normally nf_conntrack_reasm gets called during prerouting, so skb has no dst yet which might explain why this wasn't spotted earlier. Reported-by: Eric Dumazet Reported-by: John Sperbeck Signed-off-by: Florian Westphal Tested-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_conntrack_reasm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index a452d99c9f52..e4d9e6976d3c 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -585,6 +585,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) fq->q.meat == fq->q.len && nf_ct_frag6_reasm(fq, skb, dev)) ret = 0; + else + skb_dst_drop(skb); out_unlock: spin_unlock_bh(&fq->q.lock); -- cgit v1.2.3 From 59ee4129a279070d8e2f9dc1660330f6593c7808 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 10 Jul 2018 00:43:22 +0200 Subject: bpf: fix ldx in ld_abs rewrite for large offsets Mark reported that syzkaller triggered a KASAN detected slab-out-of-bounds bug in ___bpf_prog_run() with a BPF_LD | BPF_ABS word load at offset 0x8001. After further investigation it became clear that the issue was the BPF_LDX_MEM() which takes offset as an argument whereas it cannot encode larger than S16_MAX offsets into it. For this synthetical case we need to move the full address into tmp register instead and do the LDX without immediate value. Fixes: e0cea7ce988c ("bpf: implement ld_abs/ld_ind in native bpf") Reported-by: syzbot Reported-by: Mark Rutland Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 5fa66a33927f..a13f5b1f1636 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -459,11 +459,21 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) (!unaligned_ok && offset >= 0 && offset + ip_align >= 0 && offset + ip_align % size == 0))) { + bool ldx_off_ok = offset <= S16_MAX; + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); - *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian); - *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D, - offset); + *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, + size, 2 + endian + (!ldx_off_ok * 2)); + if (ldx_off_ok) { + *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, + BPF_REG_D, offset); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset); + *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, + BPF_REG_TMP, 0); + } if (endian) *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); *insn++ = BPF_JMP_A(8); -- cgit v1.2.3 From 61d769807f273fda962866f3d4c677cda9974d3c Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Tue, 10 Jul 2018 16:54:02 +0000 Subject: bpf: fix availability probing for seg6 helpers bpf_lwt_seg6_* helpers require CONFIG_IPV6_SEG6_BPF, and currently return -EOPNOTSUPP to indicate unavailability. This patch forces the BPF verifier to reject programs using these helpers when !CONFIG_IPV6_SEG6_BPF, allowing users to more easily probe if they are available or not. Signed-off-by: Mathieu Xhonneux Signed-off-by: Daniel Borkmann --- net/core/filter.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index a13f5b1f1636..06da770f543f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4536,10 +4536,10 @@ static const struct bpf_func_proto bpf_lwt_push_encap_proto = { .arg4_type = ARG_CONST_SIZE }; +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, const void *, from, u32, len) { -#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); void *srh_tlvs, *srh_end, *ptr; @@ -4565,9 +4565,6 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, memcpy(skb->data + offset, from, len); return 0; -#else /* CONFIG_IPV6_SEG6_BPF */ - return -EOPNOTSUPP; -#endif } static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { @@ -4583,7 +4580,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, u32, action, void *, param, u32, param_len) { -#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh; @@ -4631,9 +4627,6 @@ BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, default: return -EINVAL; } -#else /* CONFIG_IPV6_SEG6_BPF */ - return -EOPNOTSUPP; -#endif } static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { @@ -4649,7 +4642,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, s32, len) { -#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); void *srh_end, *srh_tlvs, *ptr; @@ -4693,9 +4685,6 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, srh_state->hdrlen += len; srh_state->valid = 0; return 0; -#else /* CONFIG_IPV6_SEG6_BPF */ - return -EOPNOTSUPP; -#endif } static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { @@ -4706,6 +4695,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; +#endif /* CONFIG_IPV6_SEG6_BPF */ bool bpf_helper_changes_pkt_data(void *func) { @@ -4727,11 +4717,12 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_xdp_adjust_meta || func == bpf_msg_pull_data || func == bpf_xdp_adjust_tail || - func == bpf_lwt_push_encap || +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) func == bpf_lwt_seg6_store_bytes || func == bpf_lwt_seg6_adjust_srh || - func == bpf_lwt_seg6_action - ) + func == bpf_lwt_seg6_action || +#endif + func == bpf_lwt_push_encap) return true; return false; @@ -5066,12 +5057,14 @@ static const struct bpf_func_proto * lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { +#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) case BPF_FUNC_lwt_seg6_store_bytes: return &bpf_lwt_seg6_store_bytes_proto; case BPF_FUNC_lwt_seg6_action: return &bpf_lwt_seg6_action_proto; case BPF_FUNC_lwt_seg6_adjust_srh: return &bpf_lwt_seg6_adjust_srh_proto; +#endif default: return lwt_out_func_proto(func_id, prog); } -- cgit v1.2.3 From 6e6fddc78323533be570873abb728b7e0ba7e024 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 11 Jul 2018 15:30:14 +0200 Subject: bpf: fix panic due to oob in bpf_prog_test_run_skb sykzaller triggered several panics similar to the below: [...] [ 248.851531] BUG: KASAN: use-after-free in _copy_to_user+0x5c/0x90 [ 248.857656] Read of size 985 at addr ffff8808017ffff2 by task a.out/1425 [...] [ 248.865902] CPU: 1 PID: 1425 Comm: a.out Not tainted 4.18.0-rc4+ #13 [ 248.865903] Hardware name: Supermicro SYS-5039MS-H12TRF/X11SSE-F, BIOS 2.1a 03/08/2018 [ 248.865905] Call Trace: [ 248.865910] dump_stack+0xd6/0x185 [ 248.865911] ? show_regs_print_info+0xb/0xb [ 248.865913] ? printk+0x9c/0xc3 [ 248.865915] ? kmsg_dump_rewind_nolock+0xe4/0xe4 [ 248.865919] print_address_description+0x6f/0x270 [ 248.865920] kasan_report+0x25b/0x380 [ 248.865922] ? _copy_to_user+0x5c/0x90 [ 248.865924] check_memory_region+0x137/0x190 [ 248.865925] kasan_check_read+0x11/0x20 [ 248.865927] _copy_to_user+0x5c/0x90 [ 248.865930] bpf_test_finish.isra.8+0x4f/0xc0 [ 248.865932] bpf_prog_test_run_skb+0x6a0/0xba0 [...] After scrubbing the BPF prog a bit from the noise, turns out it called bpf_skb_change_head() for the lwt_xmit prog with headroom of 2. Nothing wrong in that, however, this was run with repeat >> 0 in bpf_prog_test_run_skb() and the same skb thus keeps changing until the pskb_expand_head() called from skb_cow() keeps bailing out in atomic alloc context with -ENOMEM. So upon return we'll basically have 0 headroom left yet blindly do the __skb_push() of 14 bytes and keep copying data from there in bpf_test_finish() out of bounds. Fix to check if we have enough headroom and if pskb_expand_head() fails, bail out with error. Another bug independent of this fix (but related in triggering above) is that BPF_PROG_TEST_RUN should be reworked to reset the skb/xdp buffer to it's original state from input as otherwise repeating the same test in a loop won't work for benchmarking when underlying input buffer is getting changed by the prog each time and reused for the next run leading to unexpected results. Fixes: 1cf1cae963c2 ("bpf: introduce BPF_PROG_TEST_RUN command") Reported-by: syzbot+709412e651e55ed96498@syzkaller.appspotmail.com Reported-by: syzbot+54f39d6ab58f39720a55@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- net/bpf/test_run.c | 17 ++++++++++++++--- tools/testing/selftests/bpf/test_verifier.c | 23 ++++++++++++++++++++++- 2 files changed, 36 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 68c3578343b4..22a78eedf4b1 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -96,6 +96,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; u32 retval, duration; + int hh_len = ETH_HLEN; struct sk_buff *skb; void *data; int ret; @@ -131,12 +132,22 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, skb_reset_network_header(skb); if (is_l2) - __skb_push(skb, ETH_HLEN); + __skb_push(skb, hh_len); if (is_direct_pkt_access) bpf_compute_data_pointers(skb); retval = bpf_test_run(prog, skb, repeat, &duration); - if (!is_l2) - __skb_push(skb, ETH_HLEN); + if (!is_l2) { + if (skb_headroom(skb) < hh_len) { + int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); + + if (pskb_expand_head(skb, nhead, 0, GFP_USER)) { + kfree_skb(skb); + return -ENOMEM; + } + } + memset(__skb_push(skb, hh_len), 0, hh_len); + } + size = skb->len; /* bpf program can never convert linear skb to non-linear */ if (WARN_ON_ONCE(skb_is_nonlinear(skb))) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 2ecd27b670d7..f5f7bcc96046 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -4974,6 +4974,24 @@ static struct bpf_test tests[] = { .result = ACCEPT, .prog_type = BPF_PROG_TYPE_LWT_XMIT, }, + { + "make headroom for LWT_XMIT", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_MOV64_IMM(BPF_REG_2, 34), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_skb_change_head), + /* split for s390 to succeed */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_2, 42), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_skb_change_head), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_LWT_XMIT, + }, { "invalid access of tc_classid for LWT_IN", .insns = { @@ -12554,8 +12572,11 @@ static void do_test_single(struct bpf_test *test, bool unpriv, } if (fd_prog >= 0) { + __u8 tmp[TEST_DATA_LEN << 2]; + __u32 size_tmp = sizeof(tmp); + err = bpf_prog_test_run(fd_prog, 1, test->data, - sizeof(test->data), NULL, NULL, + sizeof(test->data), tmp, &size_tmp, &retval, NULL); if (err && errno != 524/*ENOTSUPP*/ && errno != EPERM) { printf("Unexpected bpf_prog_test_run error\n"); -- cgit v1.2.3 From 83fe6b8709f65bc505b10235bd82ece12c4c5099 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 10 Jul 2018 14:22:27 -0700 Subject: sch_fq_codel: zero q->flows_cnt when fq_codel_init fails When fq_codel_init fails, qdisc_create_dflt will cleanup by using qdisc_destroy. This function calls the ->reset() op prior to calling the ->destroy() op. Unfortunately, during the failure flow for sch_fq_codel, the ->flows parameter is not initialized, so the fq_codel_reset function will null pointer dereference. kernel: BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 kernel: IP: fq_codel_reset+0x58/0xd0 [sch_fq_codel] kernel: PGD 0 P4D 0 kernel: Oops: 0000 [#1] SMP PTI kernel: Modules linked in: i40iw i40e(OE) xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack tun bridge stp llc devlink ebtable_filter ebtables ip6table_filter ip6_tables rpcrdma ib_isert iscsi_target_mod sunrpc ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel intel_cstate iTCO_wdt iTCO_vendor_support intel_uncore ib_core intel_rapl_perf mei_me mei joydev i2c_i801 lpc_ich ioatdma shpchp wmi sch_fq_codel xfs libcrc32c mgag200 ixgbe drm_kms_helper isci ttm firewire_ohci kernel: mdio drm igb libsas crc32c_intel firewire_core ptp pps_core scsi_transport_sas crc_itu_t dca i2c_algo_bit ipmi_si ipmi_devintf ipmi_msghandler [last unloaded: i40e] kernel: CPU: 10 PID: 4219 Comm: ip Tainted: G OE 4.16.13custom-fq-codel-test+ #3 kernel: Hardware name: Intel Corporation S2600CO/S2600CO, BIOS SE5C600.86B.02.05.0004.051120151007 05/11/2015 kernel: RIP: 0010:fq_codel_reset+0x58/0xd0 [sch_fq_codel] kernel: RSP: 0018:ffffbfbf4c1fb620 EFLAGS: 00010246 kernel: RAX: 0000000000000400 RBX: 0000000000000000 RCX: 00000000000005b9 kernel: RDX: 0000000000000000 RSI: ffff9d03264a60c0 RDI: ffff9cfd17b31c00 kernel: RBP: 0000000000000001 R08: 00000000000260c0 R09: ffffffffb679c3e9 kernel: R10: fffff1dab06a0e80 R11: ffff9cfd163af800 R12: ffff9cfd17b31c00 kernel: R13: 0000000000000001 R14: ffff9cfd153de600 R15: 0000000000000001 kernel: FS: 00007fdec2f92800(0000) GS:ffff9d0326480000(0000) knlGS:0000000000000000 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel: CR2: 0000000000000008 CR3: 0000000c1956a006 CR4: 00000000000606e0 kernel: Call Trace: kernel: qdisc_destroy+0x56/0x140 kernel: qdisc_create_dflt+0x8b/0xb0 kernel: mq_init+0xc1/0xf0 kernel: qdisc_create_dflt+0x5a/0xb0 kernel: dev_activate+0x205/0x230 kernel: __dev_open+0xf5/0x160 kernel: __dev_change_flags+0x1a3/0x210 kernel: dev_change_flags+0x21/0x60 kernel: do_setlink+0x660/0xdf0 kernel: ? down_trylock+0x25/0x30 kernel: ? xfs_buf_trylock+0x1a/0xd0 [xfs] kernel: ? rtnl_newlink+0x816/0x990 kernel: ? _xfs_buf_find+0x327/0x580 [xfs] kernel: ? _cond_resched+0x15/0x30 kernel: ? kmem_cache_alloc+0x20/0x1b0 kernel: ? rtnetlink_rcv_msg+0x200/0x2f0 kernel: ? rtnl_calcit.isra.30+0x100/0x100 kernel: ? netlink_rcv_skb+0x4c/0x120 kernel: ? netlink_unicast+0x19e/0x260 kernel: ? netlink_sendmsg+0x1ff/0x3c0 kernel: ? sock_sendmsg+0x36/0x40 kernel: ? ___sys_sendmsg+0x295/0x2f0 kernel: ? ebitmap_cmp+0x6d/0x90 kernel: ? dev_get_by_name_rcu+0x73/0x90 kernel: ? skb_dequeue+0x52/0x60 kernel: ? __inode_wait_for_writeback+0x7f/0xf0 kernel: ? bit_waitqueue+0x30/0x30 kernel: ? fsnotify_grab_connector+0x3c/0x60 kernel: ? __sys_sendmsg+0x51/0x90 kernel: ? do_syscall_64+0x74/0x180 kernel: ? entry_SYSCALL_64_after_hwframe+0x3d/0xa2 kernel: Code: 00 00 48 89 87 00 02 00 00 8b 87 a0 01 00 00 85 c0 0f 84 84 00 00 00 31 ed 48 63 dd 83 c5 01 48 c1 e3 06 49 03 9c 24 90 01 00 00 <48> 8b 73 08 48 8b 3b e8 6c 9a 4f f6 48 8d 43 10 48 c7 03 00 00 kernel: RIP: fq_codel_reset+0x58/0xd0 [sch_fq_codel] RSP: ffffbfbf4c1fb620 kernel: CR2: 0000000000000008 kernel: ---[ end trace e81a62bede66274e ]--- This is caused because flows_cnt is non-zero, but flows hasn't been initialized. fq_codel_init has left the private data in a partially initialized state. To fix this, reset flows_cnt to 0 when we fail to initialize. Additionally, to make the state more consistent, also cleanup the flows pointer when the allocation of backlogs fails. This fixes the NULL pointer dereference, since both the for-loop and memset in fq_codel_reset will be no-ops when flow_cnt is zero. Signed-off-by: Jacob Keller Signed-off-by: David S. Miller --- net/sched/sch_fq_codel.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index cd2e0e342fb6..6c0a9d5dbf94 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -479,24 +479,28 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt, q->cparams.mtu = psched_mtu(qdisc_dev(sch)); if (opt) { - int err = fq_codel_change(sch, opt, extack); + err = fq_codel_change(sch, opt, extack); if (err) - return err; + goto init_failure; } err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) - return err; + goto init_failure; if (!q->flows) { q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_codel_flow), GFP_KERNEL); - if (!q->flows) - return -ENOMEM; + if (!q->flows) { + err = -ENOMEM; + goto init_failure; + } q->backlogs = kvcalloc(q->flows_cnt, sizeof(u32), GFP_KERNEL); - if (!q->backlogs) - return -ENOMEM; + if (!q->backlogs) { + err = -ENOMEM; + goto alloc_failure; + } for (i = 0; i < q->flows_cnt; i++) { struct fq_codel_flow *flow = q->flows + i; @@ -509,6 +513,13 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt, else sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; + +alloc_failure: + kvfree(q->flows); + q->flows = NULL; +init_failure: + q->flows_cnt = 0; + return err; } static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) -- cgit v1.2.3 From 21684dc46c598e477707487c009f9773f7c0382d Mon Sep 17 00:00:00 2001 From: Stefan Baranoff Date: Tue, 10 Jul 2018 17:25:20 -0400 Subject: tcp: fix sequence numbers for repaired sockets re-using TIME-WAIT sockets This patch fixes a bug where the sequence numbers of a socket created using TCP repair functionality are lower than set after connect is called. This occurs when the repair socket overlaps with a TIME-WAIT socket and triggers the re-use code. The amount lower is equal to the number of times that a particular IP/port set is re-used and then put back into TIME-WAIT. Re-using the first time the sequence number is 1 lower, closing that socket and then re-opening (with repair) a new socket with the same addresses/ports puts the sequence number 2 lower than set via setsockopt. The third time is 3 lower, etc. I have not tested what the limit of this acrewal is, if any. The fix is, if a socket is in repair mode, to respect the already set sequence number and timestamp when it would have already re-used the TIME-WAIT socket. Signed-off-by: Stefan Baranoff Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index bea17f1e8302..3b2711e33e4c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -156,11 +156,24 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) */ if (tcptw->tw_ts_recent_stamp && (!twp || (reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; - tp->rx_opt.ts_recent = tcptw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + /* In case of repair and re-using TIME-WAIT sockets we still + * want to be sure that it is safe as above but honor the + * sequence numbers and time stamps set as part of the repair + * process. + * + * Without this check re-using a TIME-WAIT socket with TCP + * repair would accumulate a -1 on the repair assigned + * sequence number. The first time it is reused the sequence + * is -1, the second time -2, etc. This fixes that issue + * without appearing to create any others. + */ + if (likely(!tp->repair)) { + tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; + if (tp->write_seq == 0) + tp->write_seq = 1; + tp->rx_opt.ts_recent = tcptw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + } sock_hold(sktw); return 1; } -- cgit v1.2.3 From 70b7ff130224d2d22a158c7f4aa5e7fb1c95949d Mon Sep 17 00:00:00 2001 From: Stefan Baranoff Date: Tue, 10 Jul 2018 17:31:10 -0400 Subject: tcp: allow user to create repair socket without window probes Under rare conditions where repair code may be used it is possible that window probes are either unnecessary or undesired. If the user knows that window probes are not wanted or needed this change allows them to skip sending them when a socket comes out of repair. Signed-off-by: Stefan Baranoff Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0d43705dd001..8e5e2ca9ab1b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2823,14 +2823,16 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_REPAIR: if (!tcp_can_repair_sock(sk)) err = -EPERM; - else if (val == 1) { - tp->repair = 1; + /* 1 for normal repair, 2 for no window probes */ + else if (val == 1 || val == 2) { + tp->repair = val; sk->sk_reuse = SK_FORCE_REUSE; tp->repair_queue = TCP_NO_QUEUE; } else if (val == 0) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; - tcp_send_window_probe(sk); + if (tp->repair == 1) + tcp_send_window_probe(sk); } else err = -EINVAL; -- cgit v1.2.3 From 8b7008620b8452728cadead460a36f64ed78c460 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Wed, 11 Jul 2018 14:39:42 +0200 Subject: net: Don't copy pfmemalloc flag in __copy_skb_header() The pfmemalloc flag indicates that the skb was allocated from the PFMEMALLOC reserves, and the flag is currently copied on skb copy and clone. However, an skb copied from an skb flagged with pfmemalloc wasn't necessarily allocated from PFMEMALLOC reserves, and on the other hand an skb allocated that way might be copied from an skb that wasn't. So we should not copy the flag on skb copy, and rather decide whether to allow an skb to be associated with sockets unrelated to page reclaim depending only on how it was allocated. Move the pfmemalloc flag before headers_start[0] using an existing 1-bit hole, so that __copy_skb_header() doesn't copy it. When cloning, we'll now take care of this flag explicitly, contravening to the warning comment of __skb_clone(). While at it, restore the newline usage introduced by commit b19372273164 ("net: reorganize sk_buff for faster __copy_skb_header()") to visually separate bytes used in bitfields after headers_start[0], that was gone after commit a9e419dc7be6 ("netfilter: merge ctinfo into nfct pointer storage area"), and describe the pfmemalloc flag in the kernel-doc structure comment. This doesn't change the size of sk_buff or cacheline boundaries, but consolidates the 15 bits hole before tc_index into a 2 bytes hole before csum, that could now be filled more easily. Reported-by: Patrick Talbert Fixes: c93bdd0e03e8 ("netvm: allow skb allocation to use PFMEMALLOC reserves") Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller --- include/linux/skbuff.h | 10 +++++----- net/core/skbuff.c | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 164cdedf6012..610a201126ee 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -630,6 +630,7 @@ typedef unsigned char *sk_buff_data_t; * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices * @xmit_more: More SKBs are pending for this queue + * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves * @ndisc_nodetype: router type (from link layer) * @ooo_okay: allow the mapping of a socket to a queue to be changed * @l4_hash: indicate hash is a canonical 4-tuple hash over transport @@ -735,7 +736,7 @@ struct sk_buff { peeked:1, head_frag:1, xmit_more:1, - __unused:1; /* one bit hole */ + pfmemalloc:1; /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() @@ -754,31 +755,30 @@ struct sk_buff { __u8 __pkt_type_offset[0]; __u8 pkt_type:3; - __u8 pfmemalloc:1; __u8 ignore_df:1; - __u8 nf_trace:1; __u8 ip_summed:2; __u8 ooo_okay:1; + __u8 l4_hash:1; __u8 sw_hash:1; __u8 wifi_acked_valid:1; __u8 wifi_acked:1; - __u8 no_fcs:1; /* Indicates the inner headers are valid in the skbuff. */ __u8 encapsulation:1; __u8 encap_hdr_csum:1; __u8 csum_valid:1; + __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 csum_not_inet:1; - __u8 dst_pending_confirm:1; #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif __u8 ipvs_property:1; + __u8 inner_protocol_type:1; __u8 remcsum_offload:1; #ifdef CONFIG_NET_SWITCHDEV diff --git a/net/core/skbuff.c b/net/core/skbuff.c index eba8dae22c25..4df3164bb5fc 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -858,6 +858,8 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) n->cloned = 1; n->nohdr = 0; n->peeked = 0; + if (skb->pfmemalloc) + n->pfmemalloc = 1; n->destructor = NULL; C(tail); C(end); -- cgit v1.2.3 From bab2c80e5a6c855657482eac9e97f5f3eedb509a Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 11 Jul 2018 12:00:44 -0400 Subject: nsh: set mac len based on inner packet When pulling the NSH header in nsh_gso_segment, set the mac length based on the encapsulated packet type. skb_reset_mac_len computes an offset to the network header, which here still points to the outer packet: > skb_reset_network_header(skb); > [...] > __skb_pull(skb, nsh_len); > skb_reset_mac_header(skb); // now mac hdr starts nsh_len == 8B after net hdr > skb_reset_mac_len(skb); // mac len = net hdr - mac hdr == (u16) -8 == 65528 > [..] > skb_mac_gso_segment(skb, ..) Link: http://lkml.kernel.org/r/CAF=yD-KeAcTSOn4AxirAxL8m7QAS8GBBe1w09eziYwvPbbUeYA@mail.gmail.com Reported-by: syzbot+7b9ed9872dab8c32305d@syzkaller.appspotmail.com Fixes: c411ed854584 ("nsh: add GSO support") Signed-off-by: Willem de Bruijn Acked-by: Jiri Benc Signed-off-by: David S. Miller --- net/nsh/nsh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c index 9696ef96b719..1a30e165eeb4 100644 --- a/net/nsh/nsh.c +++ b/net/nsh/nsh.c @@ -104,7 +104,7 @@ static struct sk_buff *nsh_gso_segment(struct sk_buff *skb, __skb_pull(skb, nsh_len); skb_reset_mac_header(skb); - skb_reset_mac_len(skb); + skb->mac_len = proto == htons(ETH_P_TEB) ? ETH_HLEN : 0; skb->protocol = proto; features &= NETIF_F_SG; -- cgit v1.2.3 From 993675a3100b16a4c80dfd70cbcde8ea7127b31d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 11 Jul 2018 12:00:45 -0400 Subject: packet: reset network header if packet shorter than ll reserved space If variable length link layer headers result in a packet shorter than dev->hard_header_len, reset the network header offset. Else skb->mac_len may exceed skb->len after skb_mac_reset_len. packet_sendmsg_spkt already has similar logic. Fixes: b84bbaf7a6c8 ("packet: in packet_snd start writing at link layer allocation") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 57634bc3da74..9b27d0cd766d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2878,6 +2878,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_free; } else if (reserve) { skb_reserve(skb, -reserve); + if (len < reserve) + skb_reset_network_header(skb); } /* Returns -EFAULT on error */ -- cgit v1.2.3 From 509d7648135f914a3dd64c17484b33df5dd0a19c Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 11 Jul 2018 10:12:49 +0200 Subject: xsk: do not return ENXIO from TX copy mode This patch removes the ENXIO return code from TX copy-mode when someone has forcefully changed the number of queues on the device so that the queue bound to the socket is no longer available. Just silently stop sending anything as in zero-copy mode so the error reporting gets consistent between the two modes. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 7d220cbd09b6..08d09115093e 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -244,10 +244,8 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, goto out; } - if (xs->queue_id >= xs->dev->real_num_tx_queues) { - err = -ENXIO; + if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; - } skb = sock_alloc_send_skb(sk, len, 1, &err); if (unlikely(!skb)) { -- cgit v1.2.3 From 9684f5e7c8cdf076aeec81344d4893a30f7aa6a1 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 11 Jul 2018 10:12:50 +0200 Subject: xsk: do not return EAGAIN from sendmsg when completion queue is full This patch stops returning EAGAIN in TX copy mode when the completion queue is full as zero-copy does not do this. Instead this situation can be detected by comparing the head and tail pointers of the completion queue in both modes. In any case, EAGAIN was not the correct error code here since no amount of calling sendmsg will solve the problem. Only consuming one or more messages on the completion queue will fix this. With this patch, the error reporting becomes consistent between copy mode and zero-copy mode. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 08d09115093e..87567232d0f8 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -233,10 +233,8 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, goto out; } - if (xskq_reserve_addr(xs->umem->cq)) { - err = -EAGAIN; + if (xskq_reserve_addr(xs->umem->cq)) goto out; - } len = desc.len; if (unlikely(len > xs->dev->mtu)) { -- cgit v1.2.3 From 6efb4436f7fcc50cc3fb9a113d0f16e3968172b1 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 11 Jul 2018 10:12:51 +0200 Subject: xsk: always return ENOBUFS from sendmsg if there is no TX queue This patch makes sure ENOBUFS is always returned from sendmsg if there is no TX queue configured. This was not the case for zero-copy mode. With this patch this error reporting is consistent between copy mode and zero-copy mode. Fixes: ac98d8aab61b ("xsk: wire upp Tx zero-copy functions") Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 87567232d0f8..9c784307f7b0 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -218,9 +218,6 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, struct sk_buff *skb; int err = 0; - if (unlikely(!xs->tx)) - return -ENOBUFS; - mutex_lock(&xs->mutex); while (xskq_peek_desc(xs->tx, &desc)) { @@ -296,6 +293,8 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) return -ENXIO; if (unlikely(!(xs->dev->flags & IFF_UP))) return -ENETDOWN; + if (unlikely(!xs->tx)) + return -ENOBUFS; if (need_wait) return -EOPNOTSUPP; -- cgit v1.2.3 From 09210c4bcc065d9d91ef3c051902ad18252cd3c0 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 11 Jul 2018 10:12:52 +0200 Subject: xsk: do not return EMSGSIZE in copy mode for packets larger than MTU This patch stops returning EMSGSIZE from sendmsg in copy mode when the size of the packet is larger than the MTU. Just send it to the device so that it will drop it as in zero-copy mode. This makes the error reporting consistent between copy mode and zero-copy mode. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 9c784307f7b0..72335c2e8108 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -233,15 +233,10 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, if (xskq_reserve_addr(xs->umem->cq)) goto out; - len = desc.len; - if (unlikely(len > xs->dev->mtu)) { - err = -EMSGSIZE; - goto out; - } - if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; + len = desc.len; skb = sock_alloc_send_skb(sk, len, 1, &err); if (unlikely(!skb)) { err = -EAGAIN; -- cgit v1.2.3 From e78bfb0751d4e312699106ba7efbed2bab1a53ca Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Fri, 13 Jul 2018 13:21:07 +0200 Subject: skbuff: Unconditionally copy pfmemalloc in __skb_clone() Commit 8b7008620b84 ("net: Don't copy pfmemalloc flag in __copy_skb_header()") introduced a different handling for the pfmemalloc flag in copy and clone paths. In __skb_clone(), now, the flag is set only if it was set in the original skb, but not cleared if it wasn't. This is wrong and might lead to socket buffers being flagged with pfmemalloc even if the skb data wasn't allocated from pfmemalloc reserves. Copy the flag instead of ORing it. Reported-by: Sabrina Dubroca Fixes: 8b7008620b84 ("net: Don't copy pfmemalloc flag in __copy_skb_header()") Signed-off-by: Stefano Brivio Tested-by: Sabrina Dubroca Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4df3164bb5fc..8e51f8555e11 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -858,8 +858,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) n->cloned = 1; n->nohdr = 0; n->peeked = 0; - if (skb->pfmemalloc) - n->pfmemalloc = 1; + C(pfmemalloc); n->destructor = NULL; C(tail); C(end); -- cgit v1.2.3 From b0c05d0e99d98d7f0cd41efc1eeec94efdc3325d Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jul 2018 06:04:52 -0700 Subject: tcp: fix dctcp delayed ACK schedule Previously, when a data segment was sent an ACK was piggybacked on the data segment without generating a CA_EVENT_NON_DELAYED_ACK event to notify congestion control modules. So the DCTCP ca->delayed_ack_reserved flag could incorrectly stay set when in fact there were no delayed ACKs being reserved. This could result in sending a special ECN notification ACK that carries an older ACK sequence, when in fact there was no need for such an ACK. DCTCP keeps track of the delayed ACK status with its own separate state ca->delayed_ack_reserved. Previously it may accidentally cancel the delayed ACK without updating this field upon sending a special ACK that carries a older ACK sequence. This inconsistency would lead to DCTCP receiver never acknowledging the latest data until the sender times out and retry in some cases. Packetdrill script (provided by Larry Brakmo) 0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0 0.000 bind(3, ..., ...) = 0 0.000 listen(3, 1) = 0 0.100 < [ect0] SEW 0:0(0) win 32792 0.100 > SE. 0:0(0) ack 1 0.110 < [ect0] . 1:1(0) ack 1 win 257 0.200 accept(3, ..., ...) = 4 0.200 < [ect0] . 1:1001(1000) ack 1 win 257 0.200 > [ect01] . 1:1(0) ack 1001 0.200 write(4, ..., 1) = 1 0.200 > [ect01] P. 1:2(1) ack 1001 0.200 < [ect0] . 1001:2001(1000) ack 2 win 257 0.200 write(4, ..., 1) = 1 0.200 > [ect01] P. 2:3(1) ack 2001 0.200 < [ect0] . 2001:3001(1000) ack 3 win 257 0.200 < [ect0] . 3001:4001(1000) ack 3 win 257 0.200 > [ect01] . 3:3(0) ack 4001 0.210 < [ce] P. 4001:4501(500) ack 3 win 257 +0.001 read(4, ..., 4500) = 4500 +0 write(4, ..., 1) = 1 +0 > [ect01] PE. 3:4(1) ack 4501 +0.010 < [ect0] W. 4501:5501(1000) ack 4 win 257 // Previously the ACK sequence below would be 4501, causing a long RTO +0.040~+0.045 > [ect01] . 4:4(0) ack 5501 // delayed ack +0.311 < [ect0] . 5501:6501(1000) ack 4 win 257 // More data +0 > [ect01] . 4:4(0) ack 6501 // now acks everything +0.500 < F. 9501:9501(0) ack 4 win 257 Reported-by: Larry Brakmo Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Lawrence Brakmo Signed-off-by: David S. Miller --- net/ipv4/tcp_dctcp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 5f5e5936760e..89f88b0d8167 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -134,7 +134,8 @@ static void dctcp_ce_state_0_to_1(struct sock *sk) /* State has changed from CE=0 to CE=1 and delayed * ACK has not sent yet. */ - if (!ca->ce_state && ca->delayed_ack_reserved) { + if (!ca->ce_state && + inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { u32 tmp_rcv_nxt; /* Save current rcv_nxt. */ @@ -164,7 +165,8 @@ static void dctcp_ce_state_1_to_0(struct sock *sk) /* State has changed from CE=1 to CE=0 and delayed * ACK has not sent yet. */ - if (ca->ce_state && ca->delayed_ack_reserved) { + if (ca->ce_state && + inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { u32 tmp_rcv_nxt; /* Save current rcv_nxt. */ -- cgit v1.2.3 From a69258f7aa2623e0930212f09c586fd06674ad79 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 12 Jul 2018 06:04:53 -0700 Subject: tcp: remove DELAYED ACK events in DCTCP After fixing the way DCTCP tracking delayed ACKs, the delayed-ACK related callbacks are no longer needed Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Lawrence Brakmo Signed-off-by: David S. Miller --- include/net/tcp.h | 2 -- net/ipv4/tcp_dctcp.c | 25 ------------------------- net/ipv4/tcp_output.c | 4 ---- 3 files changed, 31 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index af3ec72d5d41..3482d13d655b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -912,8 +912,6 @@ enum tcp_ca_event { CA_EVENT_LOSS, /* loss timeout */ CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ - CA_EVENT_DELAYED_ACK, /* Delayed ack is sent */ - CA_EVENT_NON_DELAYED_ACK, }; /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 89f88b0d8167..5869f89ca656 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -55,7 +55,6 @@ struct dctcp { u32 dctcp_alpha; u32 next_seq; u32 ce_state; - u32 delayed_ack_reserved; u32 loss_cwnd; }; @@ -96,7 +95,6 @@ static void dctcp_init(struct sock *sk) ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); - ca->delayed_ack_reserved = 0; ca->loss_cwnd = 0; ca->ce_state = 0; @@ -250,25 +248,6 @@ static void dctcp_state(struct sock *sk, u8 new_state) } } -static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev) -{ - struct dctcp *ca = inet_csk_ca(sk); - - switch (ev) { - case CA_EVENT_DELAYED_ACK: - if (!ca->delayed_ack_reserved) - ca->delayed_ack_reserved = 1; - break; - case CA_EVENT_NON_DELAYED_ACK: - if (ca->delayed_ack_reserved) - ca->delayed_ack_reserved = 0; - break; - default: - /* Don't care for the rest. */ - break; - } -} - static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) { switch (ev) { @@ -278,10 +257,6 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) case CA_EVENT_ECN_NO_CE: dctcp_ce_state_1_to_0(sk); break; - case CA_EVENT_DELAYED_ACK: - case CA_EVENT_NON_DELAYED_ACK: - dctcp_update_ack_reserved(sk, ev); - break; default: /* Don't care for the rest. */ break; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8e08b409c71e..00e5a300ddb9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3523,8 +3523,6 @@ void tcp_send_delayed_ack(struct sock *sk) int ato = icsk->icsk_ack.ato; unsigned long timeout; - tcp_ca_event(sk, CA_EVENT_DELAYED_ACK); - if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ / 2; @@ -3581,8 +3579,6 @@ void tcp_send_ack(struct sock *sk) if (sk->sk_state == TCP_CLOSE) return; - tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK); - /* We are not putting this on the write queue, so * tcp_transmit_skb() will set the ownership to this * sock. -- cgit v1.2.3 From c290fba8c4ce6530cd941ea14db5a4ac2f77183f Mon Sep 17 00:00:00 2001 From: piaojun Date: Fri, 13 Jul 2018 16:59:06 -0700 Subject: net/9p/client.c: put refcount of trans_mod in error case in parse_opts() In my testing, the second mount will fail after umounting successfully. The reason is that we put refcount of trans_mod in the correct case rather than the error case in parse_opts() at last. That will cause the refcount decrease to -1, and when we try to get trans_mod again in try_module_get(), we could only increase refcount to 0 which will cause failure as follows: parse_opts v9fs_get_trans_by_name try_module_get : return NULL to caller which cause error So we should put refcount of trans_mod in error case. Link: http://lkml.kernel.org/r/5B3F39A0.2030509@huawei.com Fixes: 9421c3e64137ec ("net/9p/client.c: fix potential refcnt problem of trans module") Signed-off-by: Jun Piao Reviewed-by: Yiwen Jiang Reviewed-by: Greg Kurz Reviewed-by: Dominique Martinet Tested-by: Dominique Martinet Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/9p/client.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index 18c5271910dc..5c1343195292 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -225,7 +225,8 @@ static int parse_opts(char *opts, struct p9_client *clnt) } free_and_return: - v9fs_put_trans(clnt->trans_mod); + if (ret) + v9fs_put_trans(clnt->trans_mod); kfree(tmp_options); return ret; } -- cgit v1.2.3 From 6e2059b53f9885f202b086d7b4ca10a98926e974 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 10 Jul 2018 22:41:26 +0800 Subject: ipv4/igmp: init group mode as INCLUDE when join source group Based on RFC3376 5.1 If no interface state existed for that multicast address before the change (i.e., the change consisted of creating a new per-interface record), or if no state exists after the change (i.e., the change consisted of deleting a per-interface record), then the "non-existent" state is considered to have a filter mode of INCLUDE and an empty source list. Which means a new multicast group should start with state IN(). Function ip_mc_join_group() works correctly for IGMP ASM(Any-Source Multicast) mode. It adds a group with state EX() and inits crcount to mc_qrv, so the kernel will send a TO_EX() report message after adding group. But for IGMPv3 SSM(Source-specific multicast) JOIN_SOURCE_GROUP mode, we split the group joining into two steps. First we join the group like ASM, i.e. via ip_mc_join_group(). So the state changes from IN() to EX(). Then we add the source-specific address with INCLUDE mode. So the state changes from EX() to IN(A). Before the first step sends a group change record, we finished the second step. So we will only send the second change record. i.e. TO_IN(A). Regarding the RFC stands, we should actually send an ALLOW(A) message for SSM JOIN_SOURCE_GROUP as the state should mimic the 'IN() to IN(A)' transition. The issue was exposed by commit a052517a8ff65 ("net/multicast: should not send source list records when have filter mode change"). Before this change, we used to send both ALLOW(A) and TO_IN(A). After this change we only send TO_IN(A). Fix it by adding a new parameter to init group mode. Also add new wrapper functions so we don't need to change too much code. v1 -> v2: In my first version I only cleared the group change record. But this is not enough. Because when a new group join, it will init as EXCLUDE and trigger an filter mode change in ip/ip6_mc_add_src(), which will clear all source addresses' sf_crcount. This will prevent early joined address sending state change records if multi source addressed joined at the same time. In v2 patch, I fixed it by directly initializing the mode to INCLUDE for SSM JOIN_SOURCE_GROUP. I also split the original patch into two separated patches for IPv4 and IPv6. Fixes: a052517a8ff65 ("net/multicast: should not send source list records when have filter mode change") Reviewed-by: Stefano Brivio Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/linux/igmp.h | 2 ++ net/ipv4/igmp.c | 58 ++++++++++++++++++++++++++++++++++++-------------- net/ipv4/ip_sockglue.c | 4 ++-- 3 files changed, 46 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index f8231854b5d6..119f53941c12 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -109,6 +109,8 @@ struct ip_mc_list { extern int ip_check_mc_rcu(struct in_device *dev, __be32 mc_addr, __be32 src_addr, u8 proto); extern int igmp_rcv(struct sk_buff *); extern int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr); +extern int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr, + unsigned int mode); extern int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr); extern void ip_mc_drop_socket(struct sock *sk); extern int ip_mc_source(int add, int omode, struct sock *sk, diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 85b617b655bc..b3c899a630a0 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1200,13 +1200,14 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) spin_lock_bh(&im->lock); if (pmc) { im->interface = pmc->interface; - im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; im->sfmode = pmc->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { im->tomb = pmc->tomb; im->sources = pmc->sources; for (psf = im->sources; psf; psf = psf->sf_next) - psf->sf_crcount = im->crcount; + psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + } else { + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; } in_dev_put(pmc->interface); kfree(pmc); @@ -1288,7 +1289,7 @@ static void igmp_group_dropped(struct ip_mc_list *im) #endif } -static void igmp_group_added(struct ip_mc_list *im) +static void igmp_group_added(struct ip_mc_list *im, unsigned int mode) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST @@ -1316,7 +1317,13 @@ static void igmp_group_added(struct ip_mc_list *im) } /* else, v3 */ - im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + /* Based on RFC3376 5.1, for newly added INCLUDE SSM, we should + * not send filter-mode change record as the mode should be from + * IN() to IN(A). + */ + if (mode == MCAST_EXCLUDE) + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + igmp_ifc_event(in_dev); #endif } @@ -1381,8 +1388,7 @@ static void ip_mc_hash_remove(struct in_device *in_dev, /* * A socket has joined a multicast group on device dev. */ - -void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) +void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, unsigned int mode) { struct ip_mc_list *im; #ifdef CONFIG_IP_MULTICAST @@ -1394,7 +1400,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == addr) { im->users++; - ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0); + ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0); goto out; } } @@ -1408,8 +1414,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) in_dev_hold(in_dev); im->multiaddr = addr; /* initial mode is (EX, empty) */ - im->sfmode = MCAST_EXCLUDE; - im->sfcount[MCAST_EXCLUDE] = 1; + im->sfmode = mode; + im->sfcount[mode] = 1; refcount_set(&im->refcnt, 1); spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST @@ -1426,12 +1432,17 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, im); #endif - igmp_group_added(im); + igmp_group_added(im, mode); if (!in_dev->dead) ip_rt_multicast_event(in_dev); out: return; } + +void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) +{ + __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE); +} EXPORT_SYMBOL(ip_mc_inc_group); static int ip_mc_check_iphdr(struct sk_buff *skb) @@ -1688,7 +1699,7 @@ void ip_mc_remap(struct in_device *in_dev) #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif - igmp_group_added(pmc); + igmp_group_added(pmc, pmc->sfmode); } } @@ -1751,7 +1762,7 @@ void ip_mc_up(struct in_device *in_dev) #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif - igmp_group_added(pmc); + igmp_group_added(pmc, pmc->sfmode); } } @@ -2130,8 +2141,8 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc) /* Join a multicast group */ - -int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) +static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr, + unsigned int mode) { __be32 addr = imr->imr_multiaddr.s_addr; struct ip_mc_socklist *iml, *i; @@ -2172,15 +2183,30 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) memcpy(&iml->multi, imr, sizeof(*imr)); iml->next_rcu = inet->mc_list; iml->sflist = NULL; - iml->sfmode = MCAST_EXCLUDE; + iml->sfmode = mode; rcu_assign_pointer(inet->mc_list, iml); - ip_mc_inc_group(in_dev, addr); + __ip_mc_inc_group(in_dev, addr, mode); err = 0; done: return err; } + +/* Join ASM (Any-Source Multicast) group + */ +int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) +{ + return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE); +} EXPORT_SYMBOL(ip_mc_join_group); +/* Join SSM (Source-Specific Multicast) group + */ +int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr, + unsigned int mode) +{ + return __ip_mc_join_group(sk, imr, mode); +} + static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index fc32fdbeefa6..64c76dcf7386 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -984,7 +984,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr; mreq.imr_address.s_addr = mreqs.imr_interface; mreq.imr_ifindex = 0; - err = ip_mc_join_group(sk, &mreq); + err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) break; omode = MCAST_INCLUDE; @@ -1061,7 +1061,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, mreq.imr_multiaddr = psin->sin_addr; mreq.imr_address.s_addr = 0; mreq.imr_ifindex = greqs.gsr_interface; - err = ip_mc_join_group(sk, &mreq); + err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); if (err && err != -EADDRINUSE) break; greqs.gsr_interface = mreq.imr_ifindex; -- cgit v1.2.3 From c7ea20c9da5b94e400c8dcc0adb99411f2e430a6 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 10 Jul 2018 22:41:27 +0800 Subject: ipv6/mcast: init as INCLUDE when join SSM INCLUDE group This an IPv6 version patch of "ipv4/igmp: init group mode as INCLUDE when join source group". From RFC3810, part 6.1: If no per-interface state existed for that multicast address before the change (i.e., the change consisted of creating a new per-interface record), or if no state exists after the change (i.e., the change consisted of deleting a per-interface record), then the "non-existent" state is considered to have an INCLUDE filter mode and an empty source list. Which means a new multicast group should start with state IN(). Currently, for MLDv2 SSM JOIN_SOURCE_GROUP mode, we first call ipv6_sock_mc_join(), then ip6_mc_source(), which will trigger a TO_IN() message instead of ALLOW(). The issue was exposed by commit a052517a8ff65 ("net/multicast: should not send source list records when have filter mode change"). Before this change, we sent both ALLOW(A) and TO_IN(A). Now, we only send TO_IN(A). Fix it by adding a new parameter to init group mode. Also add some wrapper functions to avoid changing too much code. v1 -> v2: In the first version I only cleared the group change record. But this is not enough. Because when a new group join, it will init as EXCLUDE and trigger a filter mode change in ip/ip6_mc_add_src(), which will clear all source addresses sf_crcount. This will prevent early joined address sending state change records if multi source addressed joined at the same time. In v2 patch, I fixed it by directly initializing the mode to INCLUDE for SSM JOIN_SOURCE_GROUP. I also split the original patch into two separated patches for IPv4 and IPv6. There is also a difference between v4 and v6 version. For IPv6, when the interface goes down and up, we will send correct state change record with unspecified IPv6 address (::) with function ipv6_mc_up(). But after DAD is completed, we resend the change record TO_IN() in mld_send_initial_cr(). Fix it by sending ALLOW() for INCLUDE mode in mld_send_initial_cr(). Fixes: a052517a8ff65 ("net/multicast: should not send source list records when have filter mode change") Reviewed-by: Stefano Brivio Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 ++ net/ipv6/ipv6_sockglue.c | 5 ++-- net/ipv6/mcast.c | 64 ++++++++++++++++++++++++++++++++++-------------- 3 files changed, 50 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index d02881e4ad1f..7528632bcf2a 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1100,6 +1100,8 @@ void ipv6_sysctl_unregister(void); int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); +int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, + const struct in6_addr *addr, unsigned int mode); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); #endif /* _NET_IPV6_H */ diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index c95c3486d904..568ca4187cd1 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -729,8 +729,9 @@ done: struct sockaddr_in6 *psin6; psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; - retv = ipv6_sock_mc_join(sk, greqs.gsr_interface, - &psin6->sin6_addr); + retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface, + &psin6->sin6_addr, + MCAST_INCLUDE); /* prior join w/ different source is ok */ if (retv && retv != -EADDRINUSE) break; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index c0c74088f2af..2699be7202be 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -95,6 +95,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int delta); static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev); +static int __ipv6_dev_mc_inc(struct net_device *dev, + const struct in6_addr *addr, unsigned int mode); #define MLD_QRV_DEFAULT 2 /* RFC3810, 9.2. Query Interval */ @@ -132,7 +134,8 @@ static int unsolicited_report_interval(struct inet6_dev *idev) return iv > 0 ? iv : 1; } -int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) +static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, + const struct in6_addr *addr, unsigned int mode) { struct net_device *dev = NULL; struct ipv6_mc_socklist *mc_lst; @@ -179,7 +182,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) } mc_lst->ifindex = dev->ifindex; - mc_lst->sfmode = MCAST_EXCLUDE; + mc_lst->sfmode = mode; rwlock_init(&mc_lst->sflock); mc_lst->sflist = NULL; @@ -187,7 +190,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) * now add/increase the group membership on the device */ - err = ipv6_dev_mc_inc(dev, addr); + err = __ipv6_dev_mc_inc(dev, addr, mode); if (err) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); @@ -199,8 +202,19 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) return 0; } + +int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr) +{ + return __ipv6_sock_mc_join(sk, ifindex, addr, MCAST_EXCLUDE); +} EXPORT_SYMBOL(ipv6_sock_mc_join); +int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, + const struct in6_addr *addr, unsigned int mode) +{ + return __ipv6_sock_mc_join(sk, ifindex, addr, mode); +} + /* * socket leave on multicast group */ @@ -646,7 +660,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, return rv; } -static void igmp6_group_added(struct ifmcaddr6 *mc) +static void igmp6_group_added(struct ifmcaddr6 *mc, unsigned int mode) { struct net_device *dev = mc->idev->dev; char buf[MAX_ADDR_LEN]; @@ -672,7 +686,13 @@ static void igmp6_group_added(struct ifmcaddr6 *mc) } /* else v2 */ - mc->mca_crcount = mc->idev->mc_qrv; + /* Based on RFC3810 6.1, for newly added INCLUDE SSM, we + * should not send filter-mode change record as the mode + * should be from IN() to IN(A). + */ + if (mode == MCAST_EXCLUDE) + mc->mca_crcount = mc->idev->mc_qrv; + mld_ifc_event(mc->idev); } @@ -770,13 +790,14 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) spin_lock_bh(&im->mca_lock); if (pmc) { im->idev = pmc->idev; - im->mca_crcount = idev->mc_qrv; im->mca_sfmode = pmc->mca_sfmode; if (pmc->mca_sfmode == MCAST_INCLUDE) { im->mca_tomb = pmc->mca_tomb; im->mca_sources = pmc->mca_sources; for (psf = im->mca_sources; psf; psf = psf->sf_next) - psf->sf_crcount = im->mca_crcount; + psf->sf_crcount = idev->mc_qrv; + } else { + im->mca_crcount = idev->mc_qrv; } in6_dev_put(pmc->idev); kfree(pmc); @@ -831,7 +852,8 @@ static void ma_put(struct ifmcaddr6 *mc) } static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, - const struct in6_addr *addr) + const struct in6_addr *addr, + unsigned int mode) { struct ifmcaddr6 *mc; @@ -849,9 +871,8 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, refcount_set(&mc->mca_refcnt, 1); spin_lock_init(&mc->mca_lock); - /* initial mode is (EX, empty) */ - mc->mca_sfmode = MCAST_EXCLUDE; - mc->mca_sfcount[MCAST_EXCLUDE] = 1; + mc->mca_sfmode = mode; + mc->mca_sfcount[mode] = 1; if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) || IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) @@ -863,7 +884,8 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, /* * device multicast group inc (add if not found) */ -int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) +static int __ipv6_dev_mc_inc(struct net_device *dev, + const struct in6_addr *addr, unsigned int mode) { struct ifmcaddr6 *mc; struct inet6_dev *idev; @@ -887,14 +909,13 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) if (ipv6_addr_equal(&mc->mca_addr, addr)) { mc->mca_users++; write_unlock_bh(&idev->lock); - ip6_mc_add_src(idev, &mc->mca_addr, MCAST_EXCLUDE, 0, - NULL, 0); + ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0); in6_dev_put(idev); return 0; } } - mc = mca_alloc(idev, addr); + mc = mca_alloc(idev, addr, mode); if (!mc) { write_unlock_bh(&idev->lock); in6_dev_put(idev); @@ -911,11 +932,16 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) write_unlock_bh(&idev->lock); mld_del_delrec(idev, mc); - igmp6_group_added(mc); + igmp6_group_added(mc, mode); ma_put(mc); return 0; } +int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) +{ + return __ipv6_dev_mc_inc(dev, addr, MCAST_EXCLUDE); +} + /* * device multicast group del */ @@ -1751,7 +1777,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, psf_next = psf->sf_next; - if (!is_in(pmc, psf, type, gdeleted, sdeleted)) { + if (!is_in(pmc, psf, type, gdeleted, sdeleted) && !crsend) { psf_prev = psf; continue; } @@ -2066,7 +2092,7 @@ static void mld_send_initial_cr(struct inet6_dev *idev) if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_CHANGE_TO_EXCLUDE; else - type = MLD2_CHANGE_TO_INCLUDE; + type = MLD2_ALLOW_NEW_SOURCES; skb = add_grec(skb, pmc, type, 0, 0, 1); spin_unlock_bh(&pmc->mca_lock); } @@ -2546,7 +2572,7 @@ void ipv6_mc_up(struct inet6_dev *idev) ipv6_mc_reset(idev); for (i = idev->mc_list; i; i = i->next) { mld_del_delrec(idev, i); - igmp6_group_added(i); + igmp6_group_added(i, i->mca_sfmode); } read_unlock_bh(&idev->lock); } -- cgit v1.2.3 From c604cb767049b78b3075497b80ebb8fd530ea2cc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 11 Jul 2018 10:46:29 -0700 Subject: KEYS: DNS: fix parsing multiple options My recent fix for dns_resolver_preparse() printing very long strings was incomplete, as shown by syzbot which still managed to hit the WARN_ONCE() in set_precision() by adding a crafted "dns_resolver" key: precision 50001 too large WARNING: CPU: 7 PID: 864 at lib/vsprintf.c:2164 vsnprintf+0x48a/0x5a0 The bug this time isn't just a printing bug, but also a logical error when multiple options ("#"-separated strings) are given in the key payload. Specifically, when separating an option string into name and value, if there is no value then the name is incorrectly considered to end at the end of the key payload, rather than the end of the current option. This bypasses validation of the option length, and also means that specifying multiple options is broken -- which presumably has gone unnoticed as there is currently only one valid option anyway. A similar problem also applied to option values, as the kstrtoul() when parsing the "dnserror" option will read past the end of the current option and into the next option. Fix these bugs by correctly computing the length of the option name and by copying the option value, null-terminated, into a temporary buffer. Reproducer for the WARN_ONCE() that syzbot hit: perl -e 'print "#A#", "\0" x 50000' | keyctl padd dns_resolver desc @s Reproducer for "dnserror" option being parsed incorrectly (expected behavior is to fail when seeing the unknown option "foo", actual behavior was to read the dnserror value as "1#foo" and fail there): perl -e 'print "#dnserror=1#foo\0"' | keyctl padd dns_resolver desc @s Reported-by: syzbot Fixes: 4a2d789267e0 ("DNS: If the DNS server returns an error, allow that to be cached [ver #2]") Signed-off-by: Eric Biggers Signed-off-by: David S. Miller --- net/dns_resolver/dns_key.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index 40c851693f77..0c9478b91fa5 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -86,35 +86,39 @@ dns_resolver_preparse(struct key_preparsed_payload *prep) opt++; kdebug("options: '%s'", opt); do { + int opt_len, opt_nlen; const char *eq; - int opt_len, opt_nlen, opt_vlen, tmp; + char optval[128]; next_opt = memchr(opt, '#', end - opt) ?: end; opt_len = next_opt - opt; - if (opt_len <= 0 || opt_len > 128) { + if (opt_len <= 0 || opt_len > sizeof(optval)) { pr_warn_ratelimited("Invalid option length (%d) for dns_resolver key\n", opt_len); return -EINVAL; } - eq = memchr(opt, '=', opt_len) ?: end; - opt_nlen = eq - opt; - eq++; - opt_vlen = next_opt - eq; /* will be -1 if no value */ + eq = memchr(opt, '=', opt_len); + if (eq) { + opt_nlen = eq - opt; + eq++; + memcpy(optval, eq, next_opt - eq); + optval[next_opt - eq] = '\0'; + } else { + opt_nlen = opt_len; + optval[0] = '\0'; + } - tmp = opt_vlen >= 0 ? opt_vlen : 0; - kdebug("option '%*.*s' val '%*.*s'", - opt_nlen, opt_nlen, opt, tmp, tmp, eq); + kdebug("option '%*.*s' val '%s'", + opt_nlen, opt_nlen, opt, optval); /* see if it's an error number representing a DNS error * that's to be recorded as the result in this key */ if (opt_nlen == sizeof(DNS_ERRORNO_OPTION) - 1 && memcmp(opt, DNS_ERRORNO_OPTION, opt_nlen) == 0) { kdebug("dns error number option"); - if (opt_vlen <= 0) - goto bad_option_value; - ret = kstrtoul(eq, 10, &derrno); + ret = kstrtoul(optval, 10, &derrno); if (ret < 0) goto bad_option_value; -- cgit v1.2.3 From 32da12216e467dea70a09cd7094c30779ce0f9db Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Thu, 12 Jul 2018 08:03:43 -0700 Subject: tls: Stricter error checking in zerocopy sendmsg path In the zerocopy sendmsg() path, there are error checks to revert the zerocopy if we get any error code. syzkaller has discovered that tls_push_record can return -ECONNRESET, which is fatal, and happens after the point at which it is safe to revert the iter, as we've already passed the memory to do_tcp_sendpages. Previously this code could return -ENOMEM and we would want to revert the iter, but AFAIK this no longer returns ENOMEM after a447da7d004 ("tls: fix waitall behavior in tls_sw_recvmsg"), so we fail for all error codes. Reported-by: syzbot+c226690f7b3126c5ee04@syzkaller.appspotmail.com Reported-by: syzbot+709f2810a6a05f11d4d3@syzkaller.appspotmail.com Signed-off-by: Dave Watson Fixes: 3c4d7559159b ("tls: kernel TLS support") Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 7818011fd250..4618f1c31137 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -440,7 +440,7 @@ alloc_encrypted: ret = tls_push_record(sk, msg->msg_flags, record_type); if (!ret) continue; - if (ret == -EAGAIN) + if (ret < 0) goto send_end; copied -= try_to_copy; -- cgit v1.2.3 From b7ed879425be371905d856410d19e9a42a62bcf3 Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Fri, 13 Jul 2018 14:40:50 +0900 Subject: net: ip6_gre: get ipv6hdr after skb_cow_head() A KASAN:use-after-free bug was found related to ip6-erspan while running selftests/net/ip6_gre_headroom.sh It happens because of following sequence: - ipv6hdr pointer is obtained from skb - skb_cow_head() is called, skb->head memory is reallocated - old data is accessed using ipv6hdr pointer skb_cow_head() call was added in e41c7c68ea77 ("ip6erspan: make sure enough headroom at xmit."), but looking at the history there was a chance of similar bug because gre_handle_offloads() and pskb_trim() can also reallocate skb->head memory. Fixes tag points to commit which introduced possibility of this bug. This patch moves ipv6hdr pointer assignment after skb_cow_head() call. Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support") Signed-off-by: Prashant Bhole Reviewed-by: Greg Rose Acked-by: William Tu Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index c8cf2fdbb13b..cd2cfb04e5d8 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -927,7 +927,6 @@ tx_err: static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { - struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct ip6_tnl *t = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); struct net_device_stats *stats; @@ -1010,6 +1009,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, goto tx_err; } } else { + struct ipv6hdr *ipv6h = ipv6_hdr(skb); + switch (skb->protocol) { case htons(ETH_P_IP): memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); -- cgit v1.2.3 From e66515999b627368892ccc9b3a13a506f2ea1357 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 13 Jul 2018 17:21:42 +0200 Subject: ipv6: make DAD fail with enhanced DAD when nonce length differs Commit adc176c54722 ("ipv6 addrconf: Implemented enhanced DAD (RFC7527)") added enhanced DAD with a nonce length of 6 bytes. However, RFC7527 doesn't specify the length of the nonce, other than being 6 + 8*k bytes, with integer k >= 0 (RFC3971 5.3.2). The current implementation simply assumes that the nonce will always be 6 bytes, but others systems are free to choose different sizes. If another system sends a nonce of different length but with the same 6 bytes prefix, it shouldn't be considered as the same nonce. Thus, check that the length of the received nonce is the same as the length we sent. Ugly scapy test script running on veth0: def loop(): pkt=sniff(iface="veth0", filter="icmp6", count=1) pkt = pkt[0] b = bytearray(pkt[Raw].load) b[1] += 1 b += b'\xde\xad\xbe\xef\xde\xad\xbe\xef' pkt[Raw].load = bytes(b) pkt[IPv6].plen += 8 # fixup checksum after modifying the payload pkt[IPv6].payload.cksum -= 0x3b44 if pkt[IPv6].payload.cksum < 0: pkt[IPv6].payload.cksum += 0xffff sendp(pkt, iface="veth0") This should result in DAD failure for any address added to veth0's peer, but is currently ignored. Fixes: adc176c54722 ("ipv6 addrconf: Implemented enhanced DAD (RFC7527)") Signed-off-by: Sabrina Dubroca Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index e640d2f3c55c..0ec273997d1d 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -811,7 +811,7 @@ static void ndisc_recv_ns(struct sk_buff *skb) return; } } - if (ndopts.nd_opts_nonce) + if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1) memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6); inc = ipv6_addr_is_multicast(daddr); -- cgit v1.2.3 From 31048d7aedf31bf0f69c54a662944632f29d82f2 Mon Sep 17 00:00:00 2001 From: Stefan Baranoff Date: Sun, 15 Jul 2018 11:36:37 -0400 Subject: tcp: Fix broken repair socket window probe patch Correct previous bad attempt at allowing sockets to come out of TCP repair without sending window probes. To avoid changing size of the repair variable in struct tcp_sock, this lets the decision for sending probes or not to be made when coming out of repair by introducing two ways to turn it off. v2: * Remove erroneous comment; defines now make behavior clear Fixes: 70b7ff130224 ("tcp: allow user to create repair socket without window probes") Signed-off-by: Stefan Baranoff Signed-off-by: Eric Dumazet Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 4 ++++ net/ipv4/tcp.c | 13 +++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 29eb659aa77a..e3f6ed8a7064 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -127,6 +127,10 @@ enum { #define TCP_CM_INQ TCP_INQ +#define TCP_REPAIR_ON 1 +#define TCP_REPAIR_OFF 0 +#define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ + struct tcp_repair_opt { __u32 opt_code; __u32 opt_val; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8e5e2ca9ab1b..ec2186e3087f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2823,16 +2823,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_REPAIR: if (!tcp_can_repair_sock(sk)) err = -EPERM; - /* 1 for normal repair, 2 for no window probes */ - else if (val == 1 || val == 2) { - tp->repair = val; + else if (val == TCP_REPAIR_ON) { + tp->repair = 1; sk->sk_reuse = SK_FORCE_REUSE; tp->repair_queue = TCP_NO_QUEUE; - } else if (val == 0) { + } else if (val == TCP_REPAIR_OFF) { + tp->repair = 0; + sk->sk_reuse = SK_NO_REUSE; + tcp_send_window_probe(sk); + } else if (val == TCP_REPAIR_OFF_NO_WP) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; - if (tp->repair == 1) - tcp_send_window_probe(sk); } else err = -EINVAL; -- cgit v1.2.3 From b5d2d75e079a918be686957b1a8d2f6c5cc95a0a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 15 Jul 2018 09:35:19 -0700 Subject: net/ipv6: Do not allow device only routes via the multipath API Eric reported that reverting the patch that fixed and simplified IPv6 multipath routes means reverting back to invalid userspace notifications. eg., $ ip -6 route add 2001:db8:1::/64 nexthop dev eth0 nexthop dev eth1 only generates a single notification: 2001:db8:1::/64 dev eth0 metric 1024 pref medium While working on a fix for this problem I found another case that is just broken completely - a multipath route with a gateway followed by device followed by gateway: $ ip -6 ro add 2001:db8:103::/64 nexthop via 2001:db8:1::64 nexthop dev dummy2 nexthop via 2001:db8:3::64 In this case the device only route is dropped completely - no notification to userpsace but no addition to the FIB either: $ ip -6 ro ls 2001:db8:1::/64 dev dummy1 proto kernel metric 256 pref medium 2001:db8:2::/64 dev dummy2 proto kernel metric 256 pref medium 2001:db8:3::/64 dev dummy3 proto kernel metric 256 pref medium 2001:db8:103::/64 metric 1024 nexthop via 2001:db8:1::64 dev dummy1 weight 1 nexthop via 2001:db8:3::64 dev dummy3 weight 1 pref medium fe80::/64 dev dummy1 proto kernel metric 256 pref medium fe80::/64 dev dummy2 proto kernel metric 256 pref medium fe80::/64 dev dummy3 proto kernel metric 256 pref medium Really, IPv6 multipath is just FUBAR'ed beyond repair when it comes to device only routes, so do not allow it all. This change will break any scripts relying on the mpath api for insert, but I don't see any other way to handle the permutations. Besides, since the routes are added to the FIB as standalone (non-multipath) routes the kernel is not doing what the user requested, so it might as well tell the user that. Reported-by: Eric Dumazet Signed-off-by: David Ahern Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 63f99411f0de..2ce0bd17de4f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4388,6 +4388,13 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rt = NULL; goto cleanup; } + if (!rt6_qualify_for_ecmp(rt)) { + err = -EINVAL; + NL_SET_ERR_MSG(extack, + "Device only routes can not be added for IPv6 using the multipath API."); + fib6_info_release(rt); + goto cleanup; + } rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1; -- cgit v1.2.3 From 1992d99882afda6dc17f9d49c06150856a91282f Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 16 Jul 2018 13:56:52 +0200 Subject: net/smc: take sock lock in smc_ioctl() SMC ioctl processing requires the sock lock to work properly in all thinkable scenarios. Problem has been found with RaceFuzzer and fixes: KASAN: null-ptr-deref Read in smc_ioctl Reported-by: Byoungyoung Lee Reported-by: syzbot+35b2c5aa76fd398b9fd4@syzkaller.appspotmail.com Signed-off-by: Ursula Braun Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- net/smc/af_smc.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5334157f5065..c12a7fc18f56 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1524,10 +1524,13 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd, return -EBADF; return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg); } + lock_sock(&smc->sk); switch (cmd) { case SIOCINQ: /* same as FIONREAD */ - if (smc->sk.sk_state == SMC_LISTEN) + if (smc->sk.sk_state == SMC_LISTEN) { + release_sock(&smc->sk); return -EINVAL; + } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) answ = 0; @@ -1536,8 +1539,10 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd, break; case SIOCOUTQ: /* output queue size (not send + not acked) */ - if (smc->sk.sk_state == SMC_LISTEN) + if (smc->sk.sk_state == SMC_LISTEN) { + release_sock(&smc->sk); return -EINVAL; + } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) answ = 0; @@ -1547,8 +1552,10 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd, break; case SIOCOUTQNSD: /* output queue size (not send only) */ - if (smc->sk.sk_state == SMC_LISTEN) + if (smc->sk.sk_state == SMC_LISTEN) { + release_sock(&smc->sk); return -EINVAL; + } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) answ = 0; @@ -1556,8 +1563,10 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd, answ = smc_tx_prepared_sends(&smc->conn); break; case SIOCATMARK: - if (smc->sk.sk_state == SMC_LISTEN) + if (smc->sk.sk_state == SMC_LISTEN) { + release_sock(&smc->sk); return -EINVAL; + } if (smc->sk.sk_state == SMC_INIT || smc->sk.sk_state == SMC_CLOSED) { answ = 0; @@ -1573,8 +1582,10 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd, } break; default: + release_sock(&smc->sk); return -ENOIOCTLCMD; } + release_sock(&smc->sk); return put_user(answ, (int __user *)arg); } -- cgit v1.2.3 From 83ed7d1fe2d2d4a11b30660dec20168bb473d9c1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 18 Jul 2018 10:48:56 +0200 Subject: ipv6: ila: select CONFIG_DST_CACHE My randconfig builds came across an old missing dependency for ILA: ERROR: "dst_cache_set_ip6" [net/ipv6/ila/ila.ko] undefined! ERROR: "dst_cache_get" [net/ipv6/ila/ila.ko] undefined! ERROR: "dst_cache_init" [net/ipv6/ila/ila.ko] undefined! ERROR: "dst_cache_destroy" [net/ipv6/ila/ila.ko] undefined! We almost never run into this by accident because randconfig builds end up selecting DST_CACHE from some other tunnel protocol, and this one appears to be the only one missing the explicit 'select'. >From all I can tell, this problem first appeared in linux-4.9 when dst_cache support got added to ILA. Fixes: 79ff2fc31e0f ("ila: Cache a route to translated address") Cc: Tom Herbert Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- net/ipv6/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 0eff75525da1..b3885ca22d6f 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -108,6 +108,7 @@ config IPV6_MIP6 config IPV6_ILA tristate "IPv6: Identifier Locator Addressing (ILA)" depends on NETFILTER + select DST_CACHE select LWTUNNEL ---help--- Support for IPv6 Identifier Locator Addressing (ILA). -- cgit v1.2.3 From 3bc53be9db21040b5d2de4d455f023c8c494aa68 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 18 Jul 2018 18:57:27 +0900 Subject: net/nfc: Avoid stalls when nfc_alloc_send_skb() returned NULL. syzbot is reporting stalls at nfc_llcp_send_ui_frame() [1]. This is because nfc_llcp_send_ui_frame() is retrying the loop without any delay when nonblocking nfc_alloc_send_skb() returned NULL. Since there is no need to use MSG_DONTWAIT if we retry until sock_alloc_send_pskb() succeeds, let's use blocking call. Also, in case an unexpected error occurred, let's break the loop if blocking nfc_alloc_send_skb() failed. [1] https://syzkaller.appspot.com/bug?id=4a131cc571c3733e0eff6bc673f4e36ae48f19c6 Signed-off-by: Tetsuo Handa Reported-by: syzbot Signed-off-by: David S. Miller --- net/nfc/llcp_commands.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index 2ceefa183cee..6a196e438b6c 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -752,11 +752,14 @@ int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap, pr_debug("Fragment %zd bytes remaining %zd", frag_len, remaining_len); - pdu = nfc_alloc_send_skb(sock->dev, &sock->sk, MSG_DONTWAIT, + pdu = nfc_alloc_send_skb(sock->dev, &sock->sk, 0, frag_len + LLCP_HEADER_SIZE, &err); if (pdu == NULL) { - pr_err("Could not allocate PDU\n"); - continue; + pr_err("Could not allocate PDU (error=%d)\n", err); + len -= remaining_len; + if (len == 0) + len = err; + break; } pdu = llcp_add_header(pdu, dsap, ssap, LLCP_PDU_UI); -- cgit v1.2.3 From 99be51f11d51400f744632f3938445a8d4de8943 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 18 Jul 2018 15:22:49 +0200 Subject: net/smc: optimize consumer cursor updates The SMC protocol requires to send a separate consumer cursor update, if it cannot be piggybacked to updates of the producer cursor. Currently the decision to send a separate consumer cursor update just considers the amount of data already received by the socket program. It does not consider the amount of data already arrived, but not yet consumed by the receiver. Basing the decision on the difference between already confirmed and already arrived data (instead of difference between already confirmed and already consumed data), may lead to a somewhat earlier consumer cursor update send in fast unidirectional traffic scenarios, and thus to better throughput. Signed-off-by: Ursula Braun Suggested-by: Thomas Richter Signed-off-by: David S. Miller --- net/smc/smc_tx.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index cee666400752..f82886b7d1d8 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -495,7 +495,8 @@ out: void smc_tx_consumer_update(struct smc_connection *conn, bool force) { - union smc_host_cursor cfed, cons; + union smc_host_cursor cfed, cons, prod; + int sender_free = conn->rmb_desc->len; int to_confirm; smc_curs_write(&cons, @@ -505,11 +506,18 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force) smc_curs_read(&conn->rx_curs_confirmed, conn), conn); to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons); + if (to_confirm > conn->rmbe_update_limit) { + smc_curs_write(&prod, + smc_curs_read(&conn->local_rx_ctrl.prod, conn), + conn); + sender_free = conn->rmb_desc->len - + smc_curs_diff(conn->rmb_desc->len, &prod, &cfed); + } if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || force || ((to_confirm > conn->rmbe_update_limit) && - ((to_confirm > (conn->rmb_desc->len / 2)) || + ((sender_free <= (conn->rmb_desc->len / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && conn->alert_token_local) { /* connection healthy */ -- cgit v1.2.3 From ac0107edba253a6e58e923f9e68825decef3e681 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 18 Jul 2018 15:22:50 +0200 Subject: net/smc: add error handling for get_user() For security reasons the return code of get_user() should always be checked. Fixes: 01d2f7e2cdd31 ("net/smc: sockopts TCP_NODELAY and TCP_CORK") Reported-by: Heiko Carstens Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c12a7fc18f56..6e5479067db0 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1456,7 +1456,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (optlen < sizeof(int)) return -EINVAL; - get_user(val, (int __user *)optval); + if (get_user(val, (int __user *)optval)) + return -EFAULT; lock_sock(sk); switch (optname) { -- cgit v1.2.3 From f6bdc42f021194ec095914b92c7a8b1a09789e6d Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 18 Jul 2018 15:22:51 +0200 Subject: net/smc: reset recv timeout after clc handshake During clc handshake the receive timeout is set to CLC_WAIT_TIME. Remember and reset the original timeout value after the receive calls, and remove a duplicate assignment of CLC_WAIT_TIME. Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_clc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 717449b1da0b..ae5d168653ce 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -250,6 +250,7 @@ out: int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type) { + long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo; struct sock *clc_sk = smc->clcsock->sk; struct smc_clc_msg_hdr *clcm = buf; struct msghdr msg = {NULL, 0}; @@ -306,7 +307,6 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, memset(&msg, 0, sizeof(struct msghdr)); iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, datlen); krflags = MSG_WAITALL; - smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; len = sock_recvmsg(smc->clcsock, &msg, krflags); if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) { smc->sk.sk_err = EPROTO; @@ -322,6 +322,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } out: + smc->clcsock->sk->sk_rcvtimeo = rcvtimeo; return reason_code; } -- cgit v1.2.3 From 53189183909f392c2aff1177565eabbfc48b8524 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 17 Jul 2018 20:58:14 +0800 Subject: net: sched: Using NULL instead of plain integer Fixes the following sparse warnings: net/sched/cls_api.c:1101:43: warning: Using plain integer as NULL pointer net/sched/cls_api.c:1492:75: warning: Using plain integer as NULL pointer Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/sched/cls_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index cdc3c87c53e6..f74513a7c7a8 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1053,7 +1053,7 @@ static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, for (tp = rtnl_dereference(chain->filter_chain); tp; tp = rtnl_dereference(tp->next)) tfilter_notify(net, oskb, n, tp, block, - q, parent, 0, event, false); + q, parent, NULL, event, false); } static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, @@ -1444,7 +1444,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); if (cb->args[1] == 0) { - if (tcf_fill_node(net, skb, tp, block, q, parent, 0, + if (tcf_fill_node(net, skb, tp, block, q, parent, NULL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) -- cgit v1.2.3 From 3ee593adbbb46d9e1bb1320915943926f4744483 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 17 Jul 2018 16:52:54 +0100 Subject: ipv6: sr: fix useless rol32 call on hash The rol32 call is currently rotating hash but the rol'd value is being discarded. I believe the current code is incorrect and hash should be assigned the rotated value returned from rol32. Detected by CoverityScan, CID#1468411 ("Useless call") Fixes: b5facfdba14c ("ipv6: sr: Compute flowlabel for outer IPv6 header of seg6 encap mode") Signed-off-by: Colin Ian King Acked-by: dlebrun@google.com Signed-off-by: David S. Miller --- net/ipv6/seg6_iptunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 19ccf0dc996c..a8854dd3e9c5 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -101,7 +101,7 @@ static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb, if (do_flowlabel > 0) { hash = skb_get_hash(skb); - rol32(hash, 16); + hash = rol32(hash, 16); flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; } else if (!do_flowlabel && skb->protocol == htons(ETH_P_IPV6)) { flowlabel = ip6_flowlabel(inner_hdr); -- cgit v1.2.3 From e56b8ce363a36fb7b74b80aaa5cc9084f2c908b4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 17 Jul 2018 18:27:45 -0700 Subject: tcp: identify cryptic messages as TCP seq # bugs Attempt to make cryptic TCP seq number error messages clearer by (1) identifying the source of the message as "TCP", (2) identifying the errors as "seq # bug", and (3) grouping the field identifiers and values by separating them with commas. E.g., the following message is changed from: recvmsg bug 2: copied 73BCB6CD seq 70F17CBE rcvnxt 73BCB9AA fl 0 WARNING: CPU: 2 PID: 1501 at /linux/net/ipv4/tcp.c:1881 tcp_recvmsg+0x649/0xb90 to: TCP recvmsg seq # bug 2: copied 73BCB6CD, seq 70F17CBE, rcvnxt 73BCB9AA, fl 0 WARNING: CPU: 2 PID: 1501 at /linux/net/ipv4/tcp.c:2011 tcp_recvmsg+0x694/0xba0 Suggested-by: 積丹尼 Dan Jacobson Signed-off-by: Randy Dunlap Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ec2186e3087f..4491faf83f4f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1998,7 +1998,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, * shouldn't happen. */ if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), - "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n", + "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags)) break; @@ -2013,7 +2013,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; WARN(!(flags & MSG_PEEK), - "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n", + "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); } -- cgit v1.2.3