From e8d67fa5696e2fcaf956dae36d11e6eff5246101 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 30 May 2019 00:51:26 +0300 Subject: net: dsa: sja1105: Don't store frame type in skb->cb Due to a confusion I thought that eth_type_trans() was called by the network stack whereas it can actually be called by network drivers to figure out the skb protocol and next packet_type handlers. In light of the above, it is not safe to store the frame type from the DSA tagger's .filter callback (first entry point on RX path), since GRO is yet to be invoked on the received traffic. Hence it is very likely that the skb->cb will actually get overwritten between eth_type_trans() and the actual DSA packet_type handler. Of course, what this patch fixes is the actual overwriting of the SJA1105_SKB_CB(skb)->type field from the GRO layer, which made all frames be seen as SJA1105_FRAME_TYPE_NORMAL (0). Fixes: 227d07a07ef1 ("net: dsa: sja1105: Add support for traffic through standalone ports") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/sja1105.h | 12 ------------ net/dsa/tag_sja1105.c | 10 +++------- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 603a02e5a8cb..e46e18c47d41 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -20,18 +20,6 @@ #define SJA1105_LINKLOCAL_FILTER_B 0x011B19000000ull #define SJA1105_LINKLOCAL_FILTER_B_MASK 0xFFFFFF000000ull -enum sja1105_frame_type { - SJA1105_FRAME_TYPE_NORMAL = 0, - SJA1105_FRAME_TYPE_LINK_LOCAL, -}; - -struct sja1105_skb_cb { - enum sja1105_frame_type type; -}; - -#define SJA1105_SKB_CB(skb) \ - ((struct sja1105_skb_cb *)DSA_SKB_CB_PRIV(skb)) - struct sja1105_port { struct dsa_port *dp; int mgmt_slot; diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 969402c7dbf1..d43737e6c3fb 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -28,14 +28,10 @@ static inline bool sja1105_is_link_local(const struct sk_buff *skb) */ static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev) { - if (sja1105_is_link_local(skb)) { - SJA1105_SKB_CB(skb)->type = SJA1105_FRAME_TYPE_LINK_LOCAL; + if (sja1105_is_link_local(skb)) return true; - } - if (!dsa_port_is_vlan_filtering(dev->dsa_ptr)) { - SJA1105_SKB_CB(skb)->type = SJA1105_FRAME_TYPE_NORMAL; + if (!dsa_port_is_vlan_filtering(dev->dsa_ptr)) return true; - } return false; } @@ -84,7 +80,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, skb->offload_fwd_mark = 1; - if (SJA1105_SKB_CB(skb)->type == SJA1105_FRAME_TYPE_LINK_LOCAL) { + if (sja1105_is_link_local(skb)) { /* Management traffic path. Switch embeds the switch ID and * port ID into bytes of the destination MAC, courtesy of * the incl_srcpt options. -- cgit v1.2.3 From 62394708f3e01c9f2be6be74eb6305bae1ed924f Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Fri, 31 May 2019 10:35:14 +0300 Subject: net: dsa: mv88e6xxx: avoid error message on remove from VLAN 0 When non-bridged, non-vlan'ed mv88e6xxx port is moving down, error message is logged: failed to kill vid 0081/0 for device eth_cu_1000_4 This is caused by call from __vlan_vid_del() with vin set to zero, over call chain this results into _mv88e6xxx_port_vlan_del() called with vid=0, and mv88e6xxx_vtu_get() called from there returns -EINVAL. On symmetric path moving port up, call goes through mv88e6xxx_port_vlan_prepare() that calls mv88e6xxx_port_check_hw_vlan() that returns -EOPNOTSUPP for zero vid. This patch changes mv88e6xxx_vtu_get() to also return -EOPNOTSUPP for zero vid, then this error code is explicitly cleared in dsa_slave_vlan_rx_kill_vid() and error message is no longer logged. Signed-off-by: Nikita Yushchenko Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 12f165a71a6c..d8edc4b59a22 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1392,7 +1392,7 @@ static int mv88e6xxx_vtu_get(struct mv88e6xxx_chip *chip, u16 vid, int err; if (!vid) - return -EINVAL; + return -EOPNOTSUPP; entry->vid = vid - 1; entry->valid = false; -- cgit v1.2.3 From 2e1f164861e500f4e068a9d909bbd3fcc7841483 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Fri, 31 May 2019 16:59:50 +0800 Subject: net: hns: Fix loopback test failed at copper ports When doing a loopback test at copper ports, the serdes loopback and the phy loopback will fail, because of the adjust link had not finished, and phy not ready. Adds sleep between adjust link and test process to fix it. Signed-off-by: Yonglong Liu Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c index ce15d2350db9..188c3f6791b5 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c @@ -339,6 +339,7 @@ static int __lb_setup(struct net_device *ndev, static int __lb_up(struct net_device *ndev, enum hnae_loop loop_mode) { +#define NIC_LB_TEST_WAIT_PHY_LINK_TIME 300 struct hns_nic_priv *priv = netdev_priv(ndev); struct hnae_handle *h = priv->ae_handle; int speed, duplex; @@ -365,6 +366,9 @@ static int __lb_up(struct net_device *ndev, h->dev->ops->adjust_link(h, speed, duplex); + /* wait adjust link done and phy ready */ + msleep(NIC_LB_TEST_WAIT_PHY_LINK_TIME); + return 0; } -- cgit v1.2.3 From 8c268598192df9a07ca967f6a8d5f12c3c1b3888 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Fri, 31 May 2019 08:31:02 +0100 Subject: Update my email address Signed-off-by: Wei Liu Acked-by: Paul Durrant Signed-off-by: David S. Miller --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 429c6c624861..2c71e0611fd2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17293,7 +17293,7 @@ F: Documentation/ABI/stable/sysfs-hypervisor-xen F: Documentation/ABI/testing/sysfs-hypervisor-xen XEN NETWORK BACKEND DRIVER -M: Wei Liu +M: Wei Liu M: Paul Durrant L: xen-devel@lists.xenproject.org (moderated for non-subscribers) L: netdev@vger.kernel.org -- cgit v1.2.3 From afa0925c6fcc6a8f610e996ca09bc3215048033c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 31 May 2019 12:37:23 -0400 Subject: packet: unconditionally free po->rollover Rollover used to use a complex RCU mechanism for assignment, which had a race condition. The below patch fixed the bug and greatly simplified the logic. The feature depends on fanout, but the state is private to the socket. Fanout_release returns f only when the last member leaves and the fanout struct is to be freed. Destroy rollover unconditionally, regardless of fanout state. Fixes: 57f015f5eccf2 ("packet: fix crash in fanout_demux_rollover()") Reported-by: syzbot Diagnosed-by: Dmitry Vyukov Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/packet/af_packet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index fbc775fbf712..d4889bf7248e 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3014,8 +3014,8 @@ static int packet_release(struct socket *sock) synchronize_net(); + kfree(po->rollover); if (f) { - kfree(po->rollover); fanout_release_data(f); kfree(f); } -- cgit v1.2.3 From 09faf5a7d7c0bcb07faba072f611937af9dd5788 Mon Sep 17 00:00:00 2001 From: Ivan Khoronzhuk Date: Fri, 31 May 2019 16:47:25 +0300 Subject: net: ethernet: ti: cpsw_ethtool: fix ethtool ring param set Fix ability to set RX descriptor number, the reason - initially "tx_max_pending" was set incorrectly, but the issue appears after adding sanity check, so fix is for "sanity" patch. Fixes: 37e2d99b59c476 ("ethtool: Ensure new ring parameters are within bounds during SRINGPARAM") Signed-off-by: Ivan Khoronzhuk Reviewed-by: Grygorii Strashko Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpsw_ethtool.c b/drivers/net/ethernet/ti/cpsw_ethtool.c index a4a7ec0d2531..6d1c9ebae7cc 100644 --- a/drivers/net/ethernet/ti/cpsw_ethtool.c +++ b/drivers/net/ethernet/ti/cpsw_ethtool.c @@ -643,7 +643,7 @@ void cpsw_get_ringparam(struct net_device *ndev, struct cpsw_common *cpsw = priv->cpsw; /* not supported */ - ering->tx_max_pending = 0; + ering->tx_max_pending = cpsw->descs_pool_size - CPSW_MAX_QUEUES; ering->tx_pending = cpdma_get_num_tx_descs(cpsw->dma); ering->rx_max_pending = cpsw->descs_pool_size - CPSW_MAX_QUEUES; ering->rx_pending = cpdma_get_num_rx_descs(cpsw->dma); -- cgit v1.2.3 From 9e4f56f1a7f3287718d0083b5cb85298dc05a5fd Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Sat, 1 Jun 2019 08:16:26 +0800 Subject: net: ethernet: mediatek: Use hw_feature to judge if HWLRO is supported Should hw_feature as hardware capability flags to check if hardware LRO got support. Signed-off-by: Mark Lee Signed-off-by: Sean Wang Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index f9fbb3ffa3a6..0b88febbaf2a 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -2298,13 +2298,13 @@ static int mtk_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, switch (cmd->cmd) { case ETHTOOL_GRXRINGS: - if (dev->features & NETIF_F_LRO) { + if (dev->hw_features & NETIF_F_LRO) { cmd->data = MTK_MAX_RX_RING_NUM; ret = 0; } break; case ETHTOOL_GRXCLSRLCNT: - if (dev->features & NETIF_F_LRO) { + if (dev->hw_features & NETIF_F_LRO) { struct mtk_mac *mac = netdev_priv(dev); cmd->rule_cnt = mac->hwlro_ip_cnt; @@ -2312,11 +2312,11 @@ static int mtk_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, } break; case ETHTOOL_GRXCLSRULE: - if (dev->features & NETIF_F_LRO) + if (dev->hw_features & NETIF_F_LRO) ret = mtk_hwlro_get_fdir_entry(dev, cmd); break; case ETHTOOL_GRXCLSRLALL: - if (dev->features & NETIF_F_LRO) + if (dev->hw_features & NETIF_F_LRO) ret = mtk_hwlro_get_fdir_all(dev, cmd, rule_locs); break; @@ -2333,11 +2333,11 @@ static int mtk_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) switch (cmd->cmd) { case ETHTOOL_SRXCLSRLINS: - if (dev->features & NETIF_F_LRO) + if (dev->hw_features & NETIF_F_LRO) ret = mtk_hwlro_add_ipaddr(dev, cmd); break; case ETHTOOL_SRXCLSRLDEL: - if (dev->features & NETIF_F_LRO) + if (dev->hw_features & NETIF_F_LRO) ret = mtk_hwlro_del_ipaddr(dev, cmd); break; default: -- cgit v1.2.3 From 880c2d4b2fdfd580ebcd6bb7240a8027a1d34751 Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Sat, 1 Jun 2019 08:16:27 +0800 Subject: net: ethernet: mediatek: Use NET_IP_ALIGN to judge if HW RX_2BYTE_OFFSET is enabled Should only enable HW RX_2BYTE_OFFSET function in the case NET_IP_ALIGN equals to 2. Signed-off-by: Mark Lee Signed-off-by: Sean Wang Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 0b88febbaf2a..765cd56ebcd2 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -1778,6 +1778,7 @@ static void mtk_poll_controller(struct net_device *dev) static int mtk_start_dma(struct mtk_eth *eth) { + u32 rx_2b_offset = (NET_IP_ALIGN == 2) ? MTK_RX_2B_OFFSET : 0; int err; err = mtk_dma_init(eth); @@ -1794,7 +1795,7 @@ static int mtk_start_dma(struct mtk_eth *eth) MTK_QDMA_GLO_CFG); mtk_w32(eth, - MTK_RX_DMA_EN | MTK_RX_2B_OFFSET | + MTK_RX_DMA_EN | rx_2b_offset | MTK_RX_BT_32DWORDS | MTK_MULTI_EN, MTK_PDMA_GLO_CFG); -- cgit v1.2.3 From 67c0aaa1eaec60e9dab301012bdebe6726ae04bd Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 2 Jun 2019 19:09:55 +0800 Subject: selftests: set sysctl bc_forwarding properly in router_broadcast.sh sysctl setting bc_forwarding for $rp2 is needed when ping_test_from h2, otherwise the bc packets from $rp2 won't be forwarded. This patch is to add this setting for $rp2. Also, as ping_test_from does grep "$from" only, which could match some unexpected output, some test case doesn't really work, like: # ping_test_from $h2 198.51.200.255 198.51.200.2 PING 198.51.200.255 from 198.51.100.2 veth3: 56(84) bytes of data. 64 bytes from 198.51.100.1: icmp_seq=1 ttl=64 time=0.336 ms When doing grep $form (198.51.200.2), the output could still match. So change to grep "bytes from $from" instead. Fixes: 40f98b9af943 ("selftests: add a selftest for directed broadcast forwarding") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- tools/testing/selftests/net/forwarding/router_broadcast.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/router_broadcast.sh b/tools/testing/selftests/net/forwarding/router_broadcast.sh index 9a678ece32b4..4eac0a06f451 100755 --- a/tools/testing/selftests/net/forwarding/router_broadcast.sh +++ b/tools/testing/selftests/net/forwarding/router_broadcast.sh @@ -145,16 +145,19 @@ bc_forwarding_disable() { sysctl_set net.ipv4.conf.all.bc_forwarding 0 sysctl_set net.ipv4.conf.$rp1.bc_forwarding 0 + sysctl_set net.ipv4.conf.$rp2.bc_forwarding 0 } bc_forwarding_enable() { sysctl_set net.ipv4.conf.all.bc_forwarding 1 sysctl_set net.ipv4.conf.$rp1.bc_forwarding 1 + sysctl_set net.ipv4.conf.$rp2.bc_forwarding 1 } bc_forwarding_restore() { + sysctl_restore net.ipv4.conf.$rp2.bc_forwarding sysctl_restore net.ipv4.conf.$rp1.bc_forwarding sysctl_restore net.ipv4.conf.all.bc_forwarding } @@ -171,7 +174,7 @@ ping_test_from() log_info "ping $dip, expected reply from $from" ip vrf exec $(master_name_get $oif) \ $PING -I $oif $dip -c 10 -i 0.1 -w $PING_TIMEOUT -b 2>&1 \ - | grep $from &> /dev/null + | grep "bytes from $from" > /dev/null check_err_fail $fail $? } -- cgit v1.2.3 From 28e74a7cfd6403f0d1c0f8b10b45d6fae37b227e Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 2 Jun 2019 15:13:00 +0100 Subject: net: sfp: read eeprom in maximum 16 byte increments Some SFP modules do not like reads longer than 16 bytes, so read the EEPROM in chunks of 16 bytes at a time. This behaviour is not specified in the SFP MSAs, which specifies: "The serial interface uses the 2-wire serial CMOS E2PROM protocol defined for the ATMEL AT24C01A/02/04 family of components." and "As long as the SFP+ receives an acknowledge, it shall serially clock out sequential data words. The sequence is terminated when the host responds with a NACK and a STOP instead of an acknowledge." We must avoid breaking a read across a 16-bit quantity in the diagnostic page, thankfully all 16-bit quantities in that page are naturally aligned. Signed-off-by: Russell King Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/sfp.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index d4635c2178d1..71812be0ac64 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -281,6 +281,7 @@ static int sfp_i2c_read(struct sfp *sfp, bool a2, u8 dev_addr, void *buf, { struct i2c_msg msgs[2]; u8 bus_addr = a2 ? 0x51 : 0x50; + size_t this_len; int ret; msgs[0].addr = bus_addr; @@ -292,11 +293,26 @@ static int sfp_i2c_read(struct sfp *sfp, bool a2, u8 dev_addr, void *buf, msgs[1].len = len; msgs[1].buf = buf; - ret = i2c_transfer(sfp->i2c, msgs, ARRAY_SIZE(msgs)); - if (ret < 0) - return ret; + while (len) { + this_len = len; + if (this_len > 16) + this_len = 16; - return ret == ARRAY_SIZE(msgs) ? len : 0; + msgs[1].len = this_len; + + ret = i2c_transfer(sfp->i2c, msgs, ARRAY_SIZE(msgs)); + if (ret < 0) + return ret; + + if (ret != ARRAY_SIZE(msgs)) + break; + + msgs[1].buf += this_len; + dev_addr += this_len; + len -= this_len; + } + + return msgs[1].buf - (u8 *)buf; } static int sfp_i2c_write(struct sfp *sfp, bool a2, u8 dev_addr, void *buf, -- cgit v1.2.3 From 77316763321ee4050f0576ffd472183aa90dcb30 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 2 Jun 2019 15:12:54 +0100 Subject: net: phylink: avoid reducing support mask Avoid reducing the support mask as a result of the interface type selected for SFP modules, or when setting the link settings through ethtool - this should only change when the supported link modes of the hardware combination change. Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 9044b95d2afe..4c0616ba314d 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1073,6 +1073,7 @@ EXPORT_SYMBOL_GPL(phylink_ethtool_ksettings_get); int phylink_ethtool_ksettings_set(struct phylink *pl, const struct ethtool_link_ksettings *kset) { + __ETHTOOL_DECLARE_LINK_MODE_MASK(support); struct ethtool_link_ksettings our_kset; struct phylink_link_state config; int ret; @@ -1083,11 +1084,12 @@ int phylink_ethtool_ksettings_set(struct phylink *pl, kset->base.autoneg != AUTONEG_ENABLE) return -EINVAL; + linkmode_copy(support, pl->supported); config = pl->link_config; /* Mask out unsupported advertisements */ linkmode_and(config.advertising, kset->link_modes.advertising, - pl->supported); + support); /* FIXME: should we reject autoneg if phy/mac does not support it? */ if (kset->base.autoneg == AUTONEG_DISABLE) { @@ -1097,7 +1099,7 @@ int phylink_ethtool_ksettings_set(struct phylink *pl, * duplex. */ s = phy_lookup_setting(kset->base.speed, kset->base.duplex, - pl->supported, false); + support, false); if (!s) return -EINVAL; @@ -1126,7 +1128,7 @@ int phylink_ethtool_ksettings_set(struct phylink *pl, __set_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, config.advertising); } - if (phylink_validate(pl, pl->supported, &config)) + if (phylink_validate(pl, support, &config)) return -EINVAL; /* If autonegotiation is enabled, we must have an advertisement */ @@ -1576,6 +1578,7 @@ static int phylink_sfp_module_insert(void *upstream, { struct phylink *pl = upstream; __ETHTOOL_DECLARE_LINK_MODE_MASK(support) = { 0, }; + __ETHTOOL_DECLARE_LINK_MODE_MASK(support1); struct phylink_link_state config; phy_interface_t iface; int ret = 0; @@ -1603,6 +1606,8 @@ static int phylink_sfp_module_insert(void *upstream, return ret; } + linkmode_copy(support1, support); + iface = sfp_select_interface(pl->sfp_bus, id, config.advertising); if (iface == PHY_INTERFACE_MODE_NA) { netdev_err(pl->netdev, @@ -1612,7 +1617,7 @@ static int phylink_sfp_module_insert(void *upstream, } config.interface = iface; - ret = phylink_validate(pl, support, &config); + ret = phylink_validate(pl, support1, &config); if (ret) { netdev_err(pl->netdev, "validation of %s/%s with support %*pb failed: %d\n", phylink_an_mode_str(MLO_AN_INBAND), -- cgit v1.2.3 From f4cfcfbdf03cf7cf5f9097803415dfdcf965676c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 3 Jun 2019 02:31:37 +0300 Subject: net: dsa: sja1105: Fix link speed not working at 100 Mbps and below The hardware values for link speed are held in the sja1105_speed_t enum. However they do not increase in the order that sja1105_get_speed_cfg was iterating over them (basically from SJA1105_SPEED_AUTO - 0 - to SJA1105_SPEED_1000MBPS - 1 - skipping the other two). Another bug is that the code in sja1105_adjust_port_config relies on the fact that an invalid link speed is detected by sja1105_get_speed_cfg and returned as -EINVAL. However storing this into an enum that only has positive members will cast it into an unsigned value, and it will miss the negative check. So take the simplest approach and remove the sja1105_get_speed_cfg function and replace it with a simple switch-case statement. Fixes: 8aa9ebccae87 ("net: dsa: Introduce driver for NXP SJA1105 5-port L2 switch") Signed-off-by: Vladimir Oltean Suggested-by: Andrew Lunn Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 0663b78a2f6c..1c3959efebc4 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -652,16 +652,6 @@ static int sja1105_speed[] = { [SJA1105_SPEED_1000MBPS] = 1000, }; -static sja1105_speed_t sja1105_get_speed_cfg(unsigned int speed_mbps) -{ - int i; - - for (i = SJA1105_SPEED_AUTO; i <= SJA1105_SPEED_1000MBPS; i++) - if (sja1105_speed[i] == speed_mbps) - return i; - return -EINVAL; -} - /* Set link speed and enable/disable traffic I/O in the MAC configuration * for a specific port. * @@ -684,8 +674,21 @@ static int sja1105_adjust_port_config(struct sja1105_private *priv, int port, mii = priv->static_config.tables[BLK_IDX_XMII_PARAMS].entries; mac = priv->static_config.tables[BLK_IDX_MAC_CONFIG].entries; - speed = sja1105_get_speed_cfg(speed_mbps); - if (speed_mbps && speed < 0) { + switch (speed_mbps) { + case 0: + /* No speed update requested */ + speed = SJA1105_SPEED_AUTO; + break; + case 10: + speed = SJA1105_SPEED_10MBPS; + break; + case 100: + speed = SJA1105_SPEED_100MBPS; + break; + case 1000: + speed = SJA1105_SPEED_1000MBPS; + break; + default: dev_err(dev, "Invalid speed %iMbps\n", speed_mbps); return -EINVAL; } @@ -695,10 +698,7 @@ static int sja1105_adjust_port_config(struct sja1105_private *priv, int port, * and we no longer need to store it in the static config (already told * hardware we want auto during upload phase). */ - if (speed_mbps) - mac[port].speed = speed; - else - mac[port].speed = SJA1105_SPEED_AUTO; + mac[port].speed = speed; /* On P/Q/R/S, one can read from the device via the MAC reconfiguration * tables. On E/T, MAC reconfig tables are not readable, only writable. -- cgit v1.2.3 From 27393f8c6efc03b8e0b64134721b0d337fca0a80 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 4 Jun 2019 12:00:11 -0700 Subject: Revert "net/tls: avoid NULL-deref on resync during device removal" This reverts commit 38030d7cb77963ba84cdbe034806e2b81245339f. Unfortunately the RX resync may get called from soft IRQ, so we can't take the rwsem to protect from the device disappearing. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/tls/tls_device.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index b95c408fd771..49b3a2ff8ef3 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -553,8 +553,8 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx) void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) { struct tls_context *tls_ctx = tls_get_ctx(sk); + struct net_device *netdev = tls_ctx->netdev; struct tls_offload_context_rx *rx_ctx; - struct net_device *netdev; u32 is_req_pending; s64 resync_req; u32 req_seq; @@ -568,15 +568,10 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) is_req_pending = resync_req; if (unlikely(is_req_pending) && req_seq == seq && - atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) { - seq += TLS_HEADER_SIZE - 1; - down_read(&device_offload_lock); - netdev = tls_ctx->netdev; - if (netdev) - netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, seq, - rcd_sn); - up_read(&device_offload_lock); - } + atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) + netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, + seq + TLS_HEADER_SIZE - 1, + rcd_sn); } static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) -- cgit v1.2.3 From e52972c11d6b1262964db96d65934196db621685 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 4 Jun 2019 12:00:12 -0700 Subject: net/tls: replace the sleeping lock around RX resync with a bit lock Commit 38030d7cb779 ("net/tls: avoid NULL-deref on resync during device removal") tried to fix a potential NULL-dereference by taking the context rwsem. Unfortunately the RX resync may get called from soft IRQ, so we can't use the rwsem to protect from the device disappearing. Because we are guaranteed there can be only one resync at a time (it's called from strparser) use a bit to indicate resync is busy and make device removal wait for the bit to get cleared. Note that there is a leftover "flags" field in struct tls_context already. Fixes: 4799ac81e52a ("tls: Add rx inline crypto offload") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/tls.h | 4 ++++ net/tls/tls_device.c | 27 +++++++++++++++++++++------ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 39ea62f0c1f6..4a55ce6a303f 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -209,6 +209,10 @@ struct tls_offload_context_tx { (ALIGN(sizeof(struct tls_offload_context_tx), sizeof(void *)) + \ TLS_DRIVER_STATE_SIZE) +enum tls_context_flags { + TLS_RX_SYNC_RUNNING = 0, +}; + struct cipher_context { char *iv; char *rec_seq; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 49b3a2ff8ef3..1f9cf57d9754 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -550,10 +550,22 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx) } } +static void tls_device_resync_rx(struct tls_context *tls_ctx, + struct sock *sk, u32 seq, u64 rcd_sn) +{ + struct net_device *netdev; + + if (WARN_ON(test_and_set_bit(TLS_RX_SYNC_RUNNING, &tls_ctx->flags))) + return; + netdev = READ_ONCE(tls_ctx->netdev); + if (netdev) + netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, seq, rcd_sn); + clear_bit_unlock(TLS_RX_SYNC_RUNNING, &tls_ctx->flags); +} + void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) { struct tls_context *tls_ctx = tls_get_ctx(sk); - struct net_device *netdev = tls_ctx->netdev; struct tls_offload_context_rx *rx_ctx; u32 is_req_pending; s64 resync_req; @@ -568,10 +580,10 @@ void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn) is_req_pending = resync_req; if (unlikely(is_req_pending) && req_seq == seq && - atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) - netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk, - seq + TLS_HEADER_SIZE - 1, - rcd_sn); + atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) { + seq += TLS_HEADER_SIZE - 1; + tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn); + } } static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb) @@ -972,7 +984,10 @@ static int tls_device_down(struct net_device *netdev) if (ctx->rx_conf == TLS_HW) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_RX); - ctx->netdev = NULL; + WRITE_ONCE(ctx->netdev, NULL); + smp_mb__before_atomic(); /* pairs with test_and_set_bit() */ + while (test_bit(TLS_RX_SYNC_RUNNING, &ctx->flags)) + usleep_range(10, 200); dev_put(netdev); list_del_init(&ctx->list); -- cgit v1.2.3 From 82ba25c6de200d7a9e9c970c998cdd6dfa8637ae Mon Sep 17 00:00:00 2001 From: Tim Beale Date: Tue, 4 Jun 2019 13:56:23 +1200 Subject: udp: only choose unbound UDP socket for multicast when not in a VRF By default, packets received in another VRF should not be passed to an unbound socket in the default VRF. This patch updates the IPv4 UDP multicast logic to match the unicast VRF logic (in compute_score()), as well as the IPv6 mcast logic (in __udp_v6_is_mcast_sock()). The particular case I noticed was DHCP discover packets going to the 255.255.255.255 address, which are handled by __udp4_lib_mcast_deliver(). The previous code meant that running multiple different DHCP server or relay agent instances across VRFs did not work correctly - any server/relay agent in the default VRF received DHCP discover packets for all other VRFs. Fixes: 6da5b0f027a8 ("net: ensure unbound datagram socket to be chosen when not in a VRF") Signed-off-by: Tim Beale Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/udp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8fb250ed53d4..efe9283a8a95 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -538,8 +538,7 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, (inet->inet_dport != rmt_port && inet->inet_dport) || (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) || ipv6_only_sock(sk) || - (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif && - sk->sk_bound_dev_if != sdif)) + !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return false; if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif)) return false; -- cgit v1.2.3 From ceae266bf0ae6564ac16d086bf749a096fa90ded Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 Jun 2019 06:07:34 +0000 Subject: net: ipvlan: Fix ipvlan device tso disabled while NETIF_F_IP_CSUM is set There's some NICs, such as hinic, with NETIF_F_IP_CSUM and NETIF_F_TSO on but NETIF_F_HW_CSUM off. And ipvlan device features will be NETIF_F_TSO on with NETIF_F_IP_CSUM and NETIF_F_IP_CSUM both off as IPVLAN_FEATURES only care about NETIF_F_HW_CSUM. So TSO will be disabled in netdev_fix_features. For example: Features for enp129s0f0: rx-checksumming: on tx-checksumming: on tx-checksum-ipv4: on tx-checksum-ip-generic: off [fixed] tx-checksum-ipv6: on Fixes: a188222b6ed2 ("net: Rename NETIF_F_ALL_CSUM to NETIF_F_CSUM_MASK") Signed-off-by: Miaohe Lin Signed-off-by: David S. Miller --- drivers/net/ipvlan/ipvlan_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index bbeb1623e2d5..717fce6edeb7 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -112,7 +112,7 @@ static void ipvlan_port_destroy(struct net_device *dev) } #define IPVLAN_FEATURES \ - (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ + (NETIF_F_SG | NETIF_F_CSUM_MASK | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ NETIF_F_GSO | NETIF_F_TSO | NETIF_F_GSO_ROBUST | \ NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) -- cgit v1.2.3 From fdf71426e7c548d6e4f5e51e0238732d60e05f5f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 4 Jun 2019 11:44:06 +0200 Subject: net: fix indirect calls helpers for ptype list hooks. As Eric noted, the current wrapper for ptype func hook inside __netif_receive_skb_list_ptype() has no chance of avoiding the indirect call: we enter such code path only for protocols other than ipv4 and ipv6. Instead we can wrap the list_func invocation. v1 -> v2: - use the correct fix tag Fixes: f5737cbadb7d ("net: use indirect calls helpers for ptype hook") Suggested-by: Eric Dumazet Signed-off-by: Paolo Abeni Acked-by: Edward Cree Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 66f7508825bd..1c4593ec4409 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5025,12 +5025,12 @@ static inline void __netif_receive_skb_list_ptype(struct list_head *head, if (list_empty(head)) return; if (pt_prev->list_func != NULL) - pt_prev->list_func(head, pt_prev, orig_dev); + INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv, + ip_list_rcv, head, pt_prev, orig_dev); else list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); - INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb, - skb->dev, pt_prev, orig_dev); + pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } } -- cgit v1.2.3 From 72c87976c5abbf8a834ad85f10d03c0cd58b985c Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Wed, 5 Jun 2019 13:48:48 +0200 Subject: s390/qeth: handle limited IPv4 broadcast in L3 TX path When selecting the cast type of a neighbourless IPv4 skb (eg. on a raw socket), qeth_l3 falls back to the packet's destination IP address. For this case we should classify traffic sent to 255.255.255.255 as broadcast. This fixes DHCP requests, which were misclassified as unicast (and for IQD interfaces thus ended up on the wrong HW queue). Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l3_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 0271833da6a2..2df67abdfde7 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -1911,6 +1911,8 @@ static int qeth_l3_get_cast_type(struct sk_buff *skb) /* no neighbour (eg AF_PACKET), fall back to target's IP address ... */ switch (qeth_get_ip_version(skb)) { case 4: + if (ipv4_is_lbcast(ip_hdr(skb)->daddr)) + return RTN_BROADCAST; return ipv4_is_multicast(ip_hdr(skb)->daddr) ? RTN_MULTICAST : RTN_UNICAST; case 6: -- cgit v1.2.3 From 0cd6783d3c7d40be165d1f3c811cedf0e3dfcdf1 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Wed, 5 Jun 2019 13:48:49 +0200 Subject: s390/qeth: check dst entry before use While qeth_l3 uses netif_keep_dst() to hold onto the dst, a skb's dst may still have been obsoleted (via dst_dev_put()) by the time that we end up using it. The dst then points to the loopback interface, which means the neighbour lookup in qeth_l3_get_cast_type() determines a bogus cast type of RTN_BROADCAST. For IQD interfaces this causes us to place such skbs on the wrong HW queue, resulting in TX errors. Fix-up the various call sites to first validate the dst entry with dst_check(), and fall back accordingly. Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l3_main.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 2df67abdfde7..13bf3e2e9cea 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -1888,13 +1888,20 @@ static int qeth_l3_do_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) static int qeth_l3_get_cast_type(struct sk_buff *skb) { + int ipv = qeth_get_ip_version(skb); struct neighbour *n = NULL; struct dst_entry *dst; rcu_read_lock(); dst = skb_dst(skb); - if (dst) - n = dst_neigh_lookup_skb(dst, skb); + if (dst) { + struct rt6_info *rt = (struct rt6_info *) dst; + + dst = dst_check(dst, (ipv == 6) ? rt6_get_cookie(rt) : 0); + if (dst) + n = dst_neigh_lookup_skb(dst, skb); + } + if (n) { int cast_type = n->type; @@ -1909,7 +1916,7 @@ static int qeth_l3_get_cast_type(struct sk_buff *skb) rcu_read_unlock(); /* no neighbour (eg AF_PACKET), fall back to target's IP address ... */ - switch (qeth_get_ip_version(skb)) { + switch (ipv) { case 4: if (ipv4_is_lbcast(ip_hdr(skb)->daddr)) return RTN_BROADCAST; @@ -1942,6 +1949,7 @@ static void qeth_l3_fill_header(struct qeth_qdio_out_q *queue, struct qeth_hdr_layer3 *l3_hdr = &hdr->hdr.l3; struct vlan_ethhdr *veth = vlan_eth_hdr(skb); struct qeth_card *card = queue->card; + struct dst_entry *dst; hdr->hdr.l3.length = data_len; @@ -1987,15 +1995,27 @@ static void qeth_l3_fill_header(struct qeth_qdio_out_q *queue, } rcu_read_lock(); + dst = skb_dst(skb); + if (ipv == 4) { - struct rtable *rt = skb_rtable(skb); + struct rtable *rt; + + if (dst) + dst = dst_check(dst, 0); + rt = (struct rtable *) dst; *((__be32 *) &hdr->hdr.l3.next_hop.ipv4.addr) = (rt) ? rt_nexthop(rt, ip_hdr(skb)->daddr) : ip_hdr(skb)->daddr; } else { /* IPv6 */ - const struct rt6_info *rt = skb_rt6_info(skb); + struct rt6_info *rt; + + if (dst) { + rt = (struct rt6_info *) dst; + dst = dst_check(dst, rt6_get_cookie(rt)); + } + rt = (struct rt6_info *) dst; if (rt && !ipv6_addr_any(&rt->rt6i_gateway)) l3_hdr->next_hop.ipv6_addr = rt->rt6i_gateway; -- cgit v1.2.3 From 335726195e460cb6b3f795b695bfd31f0ea70ef0 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Wed, 5 Jun 2019 13:48:50 +0200 Subject: s390/qeth: fix VLAN attribute in bridge_hostnotify udev event Enabling sysfs attribute bridge_hostnotify triggers a series of udev events for the MAC addresses of all currently connected peers. In case no VLAN is set for a peer, the device reports the corresponding MAC addresses with VLAN ID 4096. This currently results in attribute VLAN=4096 for all non-VLAN interfaces in the initial series of events after host-notify is enabled. Instead, no VLAN attribute should be reported in the udev event for non-VLAN interfaces. Only the initial events face this issue. For dynamic changes that are reported later, the device uses a validity flag. This also changes the code so that it now sets the VLAN attribute for MAC addresses with VID 0. On Linux, no qeth interface will ever be registered with VID 0: Linux kernel registers VID 0 on all network interfaces initially, but qeth will drop .ndo_vlan_rx_add_vid for VID 0. Peers with other OSs could register MACs with VID 0. Fixes: 9f48b9db9a22 ("qeth: bridgeport support - address notifications") Signed-off-by: Alexandra Winter Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l2_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 218801232ca2..ff8a6cd790b1 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -1680,7 +1680,7 @@ static void qeth_bridgeport_an_set_cb(void *priv, l2entry = (struct qdio_brinfo_entry_l2 *)entry; code = IPA_ADDR_CHANGE_CODE_MACADDR; - if (l2entry->addr_lnid.lnid) + if (l2entry->addr_lnid.lnid < VLAN_N_VID) code |= IPA_ADDR_CHANGE_CODE_VLANID; qeth_bridge_emit_host_event(card, anev_reg_unreg, code, (struct net_if_token *)&l2entry->nit, -- cgit v1.2.3 From bd966839bdf9165ee0aefa57132e87e9caf55982 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Wed, 5 Jun 2019 13:48:51 +0200 Subject: s390/qeth: handle error when updating TX queue count netif_set_real_num_tx_queues() can return an error, deal with it. Fixes: 73dc2daf110f ("s390/qeth: add TX multiqueue support for OSA devices") Signed-off-by: Julian Wiedmann Signed-off-by: David S. Miller --- drivers/s390/net/qeth_core_main.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index 009f2c0ec504..b1823d75dd35 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -1274,16 +1274,20 @@ static int qeth_setup_channel(struct qeth_channel *channel, bool alloc_buffers) return 0; } -static void qeth_osa_set_output_queues(struct qeth_card *card, bool single) +static int qeth_osa_set_output_queues(struct qeth_card *card, bool single) { unsigned int count = single ? 1 : card->dev->num_tx_queues; + int rc; rtnl_lock(); - netif_set_real_num_tx_queues(card->dev, count); + rc = netif_set_real_num_tx_queues(card->dev, count); rtnl_unlock(); + if (rc) + return rc; + if (card->qdio.no_out_queues == count) - return; + return 0; if (atomic_read(&card->qdio.state) != QETH_QDIO_UNINITIALIZED) qeth_free_qdio_queues(card); @@ -1293,12 +1297,14 @@ static void qeth_osa_set_output_queues(struct qeth_card *card, bool single) card->qdio.default_out_queue = single ? 0 : QETH_DEFAULT_QUEUE; card->qdio.no_out_queues = count; + return 0; } static int qeth_update_from_chp_desc(struct qeth_card *card) { struct ccw_device *ccwdev; struct channel_path_desc_fmt0 *chp_dsc; + int rc = 0; QETH_DBF_TEXT(SETUP, 2, "chp_desc"); @@ -1311,12 +1317,12 @@ static int qeth_update_from_chp_desc(struct qeth_card *card) if (IS_OSD(card) || IS_OSX(card)) /* CHPP field bit 6 == 1 -> single queue */ - qeth_osa_set_output_queues(card, chp_dsc->chpp & 0x02); + rc = qeth_osa_set_output_queues(card, chp_dsc->chpp & 0x02); kfree(chp_dsc); QETH_DBF_TEXT_(SETUP, 2, "nr:%x", card->qdio.no_out_queues); QETH_DBF_TEXT_(SETUP, 2, "lvl:%02x", card->info.func_level); - return 0; + return rc; } static void qeth_init_qdio_info(struct qeth_card *card) @@ -5597,8 +5603,12 @@ static struct net_device *qeth_alloc_netdev(struct qeth_card *card) dev->hw_features |= NETIF_F_SG; dev->vlan_features |= NETIF_F_SG; if (IS_IQD(card)) { - netif_set_real_num_tx_queues(dev, QETH_IQD_MIN_TXQ); dev->features |= NETIF_F_SG; + if (netif_set_real_num_tx_queues(dev, + QETH_IQD_MIN_TXQ)) { + free_netdev(dev); + return NULL; + } } } -- cgit v1.2.3 From 0a90478b93a46bdcd56ba33c37566a993e455d54 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 2 Jun 2019 19:10:24 +0800 Subject: ipv4: not do cache for local delivery if bc_forwarding is enabled With the topo: h1 ---| rp1 | | route rp3 |--- h3 (192.168.200.1) h2 ---| rp2 | If rp1 bc_forwarding is set while rp2 bc_forwarding is not, after doing "ping 192.168.200.255" on h1, then ping 192.168.200.255 on h2, and the packets can still be forwared. This issue was caused by the input route cache. It should only do the cache for either bc forwarding or local delivery. Otherwise, local delivery can use the route cache for bc forwarding of other interfaces. This patch is to fix it by not doing cache for local delivery if all.bc_forwarding is enabled. Note that we don't fix it by checking route cache local flag after rt_cache_valid() in "local_input:" and "ip_mkroute_input", as the common route code shouldn't be touched for bc_forwarding. Fixes: 5cbf777cfdf6 ("route: add support for directed broadcast forwarding") Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/route.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 11ddc276776e..91bf75b8ac1c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1985,7 +1985,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, u32 itag = 0; struct rtable *rth; struct flowi4 fl4; - bool do_cache; + bool do_cache = true; /* IP on this device is disabled. */ @@ -2062,6 +2062,9 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res->type == RTN_BROADCAST) { if (IN_DEV_BFORWARD(in_dev)) goto make_route; + /* not do cache if bc_forwarding is enabled */ + if (IPV4_DEVCONF_ALL(net, BC_FORWARDING)) + do_cache = false; goto brd_input; } @@ -2099,18 +2102,15 @@ brd_input: RT_CACHE_STAT_INC(in_brd); local_input: - do_cache = false; - if (res->fi) { - if (!itag) { - struct fib_nh_common *nhc = FIB_RES_NHC(*res); + do_cache &= res->fi && !itag; + if (do_cache) { + struct fib_nh_common *nhc = FIB_RES_NHC(*res); - rth = rcu_dereference(nhc->nhc_rth_input); - if (rt_cache_valid(rth)) { - skb_dst_set_noref(skb, &rth->dst); - err = 0; - goto out; - } - do_cache = true; + rth = rcu_dereference(nhc->nhc_rth_input); + if (rt_cache_valid(rth)) { + skb_dst_set_noref(skb, &rth->dst); + err = 0; + goto out; } } -- cgit v1.2.3 From b7999b07726c16974ba9ca3bb9fe98ecbec5f81c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 2 Jun 2019 19:10:46 +0800 Subject: ipv6: fix the check before getting the cookie in rt6_get_cookie In Jianlin's testing, netperf was broken with 'Connection reset by peer', as the cookie check failed in rt6_check() and ip6_dst_check() always returned NULL. It's caused by Commit 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes"), where the cookie can be got only when 'c1'(see below) for setting dst_cookie whereas rt6_check() is called when !'c1' for checking dst_cookie, as we can see in ip6_dst_check(). Since in ip6_dst_check() both rt6_dst_from_check() (c1) and rt6_check() (!c1) will check the 'from' cookie, this patch is to remove the c1 check in rt6_get_cookie(), so that the dst_cookie can always be set properly. c1: (rt->rt6i_flags & RTF_PCPU || unlikely(!list_empty(&rt->rt6i_uncached))) Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes") Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 525f701653ca..d6d936cbf6b3 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -263,8 +263,7 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt) rcu_read_lock(); from = rcu_dereference(rt->from); - if (from && (rt->rt6i_flags & RTF_PCPU || - unlikely(!list_empty(&rt->rt6i_uncached)))) + if (from) fib6_get_cookie_safe(from, &cookie); rcu_read_unlock(); -- cgit v1.2.3 From b50e058746ba29f517e27299447831ab3d93f896 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Mon, 3 Jun 2019 08:48:19 -0400 Subject: net: rds: fix memory leak when unload rds_rdma When KASAN is enabled, after several rds connections are created, then "rmmod rds_rdma" is run. The following will appear. " BUG rds_ib_incoming (Not tainted): Objects remaining in rds_ib_incoming on __kmem_cache_shutdown() Call Trace: dump_stack+0x71/0xab slab_err+0xad/0xd0 __kmem_cache_shutdown+0x17d/0x370 shutdown_cache+0x17/0x130 kmem_cache_destroy+0x1df/0x210 rds_ib_recv_exit+0x11/0x20 [rds_rdma] rds_ib_exit+0x7a/0x90 [rds_rdma] __x64_sys_delete_module+0x224/0x2c0 ? __ia32_sys_delete_module+0x2c0/0x2c0 do_syscall_64+0x73/0x190 entry_SYSCALL_64_after_hwframe+0x44/0xa9 " This is rds connection memory leak. The root cause is: When "rmmod rds_rdma" is run, rds_ib_remove_one will call rds_ib_dev_shutdown to drop the rds connections. rds_ib_dev_shutdown will call rds_conn_drop to drop rds connections as below. " rds_conn_path_drop(&conn->c_path[0], false); " In the above, destroy is set to false. void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy) { atomic_set(&cp->cp_state, RDS_CONN_ERROR); rcu_read_lock(); if (!destroy && rds_destroy_pending(cp->cp_conn)) { rcu_read_unlock(); return; } queue_work(rds_wq, &cp->cp_down_w); rcu_read_unlock(); } In the above function, destroy is set to false. rds_destroy_pending is called. This does not move rds connections to ib_nodev_conns. So destroy is set to true to move rds connections to ib_nodev_conns. In rds_ib_unregister_client, flush_workqueue is called to make rds_wq finsh shutdown rds connections. The function rds_ib_destroy_nodev_conns is called to shutdown rds connections finally. Then rds_ib_recv_exit is called to destroy slab. void rds_ib_recv_exit(void) { kmem_cache_destroy(rds_ib_incoming_slab); kmem_cache_destroy(rds_ib_frag_slab); } The above slab memory leak will not occur again. >From tests, 256 rds connections [root@ca-dev14 ~]# time rmmod rds_rdma real 0m16.522s user 0m0.000s sys 0m8.152s 512 rds connections [root@ca-dev14 ~]# time rmmod rds_rdma real 0m32.054s user 0m0.000s sys 0m15.568s To rmmod rds_rdma with 256 rds connections, about 16 seconds are needed. And with 512 rds connections, about 32 seconds are needed. >From ftrace, when one rds connection is destroyed, " 19) | rds_conn_destroy [rds]() { 19) 7.782 us | rds_conn_path_drop [rds](); 15) | rds_shutdown_worker [rds]() { 15) | rds_conn_shutdown [rds]() { 15) 1.651 us | rds_send_path_reset [rds](); 15) 7.195 us | } 15) + 11.434 us | } 19) 2.285 us | rds_cong_remove_conn [rds](); 19) * 24062.76 us | } " So if many rds connections will be destroyed, this function rds_ib_destroy_nodev_conns uses most of time. Suggested-by: Håkon Bugge Signed-off-by: Zhu Yanjun Signed-off-by: David S. Miller --- net/rds/ib.c | 2 +- net/rds/ib_recv.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/net/rds/ib.c b/net/rds/ib.c index 2da9b75bad16..b8d581b779b2 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -87,7 +87,7 @@ static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) spin_lock_irqsave(&rds_ibdev->spinlock, flags); list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node) - rds_conn_drop(ic->conn); + rds_conn_path_drop(&ic->conn->c_path[0], true); spin_unlock_irqrestore(&rds_ibdev->spinlock, flags); } diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 8946c89d7392..3cae88cbdaa0 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -168,6 +168,7 @@ void rds_ib_recv_free_caches(struct rds_ib_connection *ic) list_del(&inc->ii_cache_entry); WARN_ON(!list_empty(&inc->ii_frags)); kmem_cache_free(rds_ib_incoming_slab, inc); + atomic_dec(&rds_ib_allocation); } rds_ib_cache_xfer_to_ready(&ic->i_cache_frags); @@ -1057,6 +1058,8 @@ out: void rds_ib_recv_exit(void) { + WARN_ON(atomic_read(&rds_ib_allocation)); + kmem_cache_destroy(rds_ib_incoming_slab); kmem_cache_destroy(rds_ib_frag_slab); } -- cgit v1.2.3 From 0a8dd9f67cd0da7dc284f48b032ce00db1a68791 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 3 Jun 2019 16:32:59 -0400 Subject: Fix memory leak in sctp_process_init syzbot found the following leak in sctp_process_init BUG: memory leak unreferenced object 0xffff88810ef68400 (size 1024): comm "syz-executor273", pid 7046, jiffies 4294945598 (age 28.770s) hex dump (first 32 bytes): 1d de 28 8d de 0b 1b e3 b5 c2 f9 68 fd 1a 97 25 ..(........h...% 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000a02cebbd>] kmemleak_alloc_recursive include/linux/kmemleak.h:55 [inline] [<00000000a02cebbd>] slab_post_alloc_hook mm/slab.h:439 [inline] [<00000000a02cebbd>] slab_alloc mm/slab.c:3326 [inline] [<00000000a02cebbd>] __do_kmalloc mm/slab.c:3658 [inline] [<00000000a02cebbd>] __kmalloc_track_caller+0x15d/0x2c0 mm/slab.c:3675 [<000000009e6245e6>] kmemdup+0x27/0x60 mm/util.c:119 [<00000000dfdc5d2d>] kmemdup include/linux/string.h:432 [inline] [<00000000dfdc5d2d>] sctp_process_init+0xa7e/0xc20 net/sctp/sm_make_chunk.c:2437 [<00000000b58b62f8>] sctp_cmd_process_init net/sctp/sm_sideeffect.c:682 [inline] [<00000000b58b62f8>] sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1384 [inline] [<00000000b58b62f8>] sctp_side_effects net/sctp/sm_sideeffect.c:1194 [inline] [<00000000b58b62f8>] sctp_do_sm+0xbdc/0x1d60 net/sctp/sm_sideeffect.c:1165 [<0000000044e11f96>] sctp_assoc_bh_rcv+0x13c/0x200 net/sctp/associola.c:1074 [<00000000ec43804d>] sctp_inq_push+0x7f/0xb0 net/sctp/inqueue.c:95 [<00000000726aa954>] sctp_backlog_rcv+0x5e/0x2a0 net/sctp/input.c:354 [<00000000d9e249a8>] sk_backlog_rcv include/net/sock.h:950 [inline] [<00000000d9e249a8>] __release_sock+0xab/0x110 net/core/sock.c:2418 [<00000000acae44fa>] release_sock+0x37/0xd0 net/core/sock.c:2934 [<00000000963cc9ae>] sctp_sendmsg+0x2c0/0x990 net/sctp/socket.c:2122 [<00000000a7fc7565>] inet_sendmsg+0x64/0x120 net/ipv4/af_inet.c:802 [<00000000b732cbd3>] sock_sendmsg_nosec net/socket.c:652 [inline] [<00000000b732cbd3>] sock_sendmsg+0x54/0x70 net/socket.c:671 [<00000000274c57ab>] ___sys_sendmsg+0x393/0x3c0 net/socket.c:2292 [<000000008252aedb>] __sys_sendmsg+0x80/0xf0 net/socket.c:2330 [<00000000f7bf23d1>] __do_sys_sendmsg net/socket.c:2339 [inline] [<00000000f7bf23d1>] __se_sys_sendmsg net/socket.c:2337 [inline] [<00000000f7bf23d1>] __x64_sys_sendmsg+0x23/0x30 net/socket.c:2337 [<00000000a8b4131f>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:3 The problem was that the peer.cookie value points to an skb allocated area on the first pass through this function, at which point it is overwritten with a heap allocated value, but in certain cases, where a COOKIE_ECHO chunk is included in the packet, a second pass through sctp_process_init is made, where the cookie value is re-allocated, leaking the first allocation. Fix is to always allocate the cookie value, and free it when we are done using it. Signed-off-by: Neil Horman Reported-by: syzbot+f7e9153b037eac9b1df8@syzkaller.appspotmail.com CC: Marcelo Ricardo Leitner CC: "David S. Miller" CC: netdev@vger.kernel.org Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 13 +++---------- net/sctp/sm_sideeffect.c | 5 +++++ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 92331e1195c1..f17908f5c4f3 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2312,7 +2312,6 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, union sctp_addr addr; struct sctp_af *af; int src_match = 0; - char *cookie; /* We must include the address that the INIT packet came from. * This is the only address that matters for an INIT packet. @@ -2416,14 +2415,6 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, /* Peer Rwnd : Current calculated value of the peer's rwnd. */ asoc->peer.rwnd = asoc->peer.i.a_rwnd; - /* Copy cookie in case we need to resend COOKIE-ECHO. */ - cookie = asoc->peer.cookie; - if (cookie) { - asoc->peer.cookie = kmemdup(cookie, asoc->peer.cookie_len, gfp); - if (!asoc->peer.cookie) - goto clean_up; - } - /* RFC 2960 7.2.1 The initial value of ssthresh MAY be arbitrarily * high (for example, implementations MAY use the size of the receiver * advertised window). @@ -2592,7 +2583,9 @@ do_addr_param: case SCTP_PARAM_STATE_COOKIE: asoc->peer.cookie_len = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); - asoc->peer.cookie = param.cookie->body; + asoc->peer.cookie = kmemdup(param.cookie->body, asoc->peer.cookie_len, gfp); + if (!asoc->peer.cookie) + retval = 0; break; case SCTP_PARAM_HEARTBEAT_INFO: diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 9b50da548db2..a554d6d15d1b 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -883,6 +883,11 @@ static void sctp_cmd_new_state(struct sctp_cmd_seq *cmds, asoc->rto_initial; } + if (sctp_state(asoc, ESTABLISHED)) { + kfree(asoc->peer.cookie); + asoc->peer.cookie = NULL; + } + if (sctp_state(asoc, ESTABLISHED) || sctp_state(asoc, CLOSED) || sctp_state(asoc, SHUTDOWN_RECEIVED)) { -- cgit v1.2.3 From 0ee4e76937d69128a6a66861ba393ebdc2ffc8a2 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 3 Jun 2019 16:57:13 -0400 Subject: ethtool: fix potential userspace buffer overflow ethtool_get_regs() allocates a buffer of size ops->get_regs_len(), and pass it to the kernel driver via ops->get_regs() for filling. There is no restriction about what the kernel drivers can or cannot do with the open ethtool_regs structure. They usually set regs->version and ignore regs->len or set it to the same size as ops->get_regs_len(). But if userspace allocates a smaller buffer for the registers dump, we would cause a userspace buffer overflow in the final copy_to_user() call, which uses the regs.len value potentially reset by the driver. To fix this, make this case obvious and store regs.len before calling ops->get_regs(), to only copy as much data as requested by userspace, up to the value returned by ops->get_regs_len(). While at it, remove the redundant check for non-null regbuf. Signed-off-by: Vivien Didelot Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- net/core/ethtool.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 43e9add58340..1a0196fbb49c 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1359,13 +1359,16 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) if (!regbuf) return -ENOMEM; + if (regs.len < reglen) + reglen = regs.len; + ops->get_regs(dev, ®s, regbuf); ret = -EFAULT; if (copy_to_user(useraddr, ®s, sizeof(regs))) goto out; useraddr += offsetof(struct ethtool_regs, data); - if (regbuf && copy_to_user(useraddr, regbuf, regs.len)) + if (copy_to_user(useraddr, regbuf, reglen)) goto out; ret = 0; -- cgit v1.2.3 From 930b9a0543385d4eb8ef887e88cf84d95a844577 Mon Sep 17 00:00:00 2001 From: Nikita Danilov Date: Tue, 4 Jun 2019 13:23:49 +0000 Subject: net: aquantia: fix wol configuration not applied sometimes WoL magic packet configuration sometimes does not work due to couple of leakages found. Mainly there was a regression introduced during readx_poll refactoring. Next, fw request waiting time was too small. Sometimes that caused sleep proxy config function to return with an error and to skip WoL configuration. At last, WoL data were passed to FW from not clean buffer. That could cause FW to accept garbage as a random configuration data. Fixes: 6a7f2277313b ("net: aquantia: replace AQ_HW_WAIT_FOR with readx_poll_timeout_atomic") Signed-off-by: Nikita Danilov Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- .../net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c | 14 +++++++------- .../ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c | 4 +++- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c index 1208f7ecdd76..3fc41da39a0a 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c @@ -335,13 +335,13 @@ static int hw_atl_utils_fw_upload_dwords(struct aq_hw_s *self, u32 a, u32 *p, { u32 val; int err = 0; - bool is_locked; - is_locked = hw_atl_sem_ram_get(self); - if (!is_locked) { - err = -ETIME; + err = readx_poll_timeout_atomic(hw_atl_sem_ram_get, self, + val, val == 1U, + 10U, 100000U); + if (err < 0) goto err_exit; - } + if (IS_CHIP_FEATURE(REVISION_B1)) { u32 offset = 0; @@ -353,8 +353,8 @@ static int hw_atl_utils_fw_upload_dwords(struct aq_hw_s *self, u32 a, u32 *p, /* 1000 times by 10us = 10ms */ err = readx_poll_timeout_atomic(hw_atl_scrpad12_get, self, val, - (val & 0xF0000000) == - 0x80000000, + (val & 0xF0000000) != + 0x80000000, 10U, 10000U); } } else { diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c index fbc9d6ac841f..9c16d85fb104 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c @@ -384,7 +384,7 @@ static int aq_fw2x_set_sleep_proxy(struct aq_hw_s *self, u8 *mac) err = readx_poll_timeout_atomic(aq_fw2x_state2_get, self, val, val & HW_ATL_FW2X_CTRL_SLEEP_PROXY, - 1U, 10000U); + 1U, 100000U); err_exit: return err; @@ -404,6 +404,8 @@ static int aq_fw2x_set_wol_params(struct aq_hw_s *self, u8 *mac) msg = (struct fw2x_msg_wol *)rpc; + memset(msg, 0, sizeof(*msg)); + msg->msg_id = HAL_ATLANTIC_UTILS_FW2X_MSG_WOL; msg->magic_packet_enabled = true; memcpy(msg->hw_addr, mac, ETH_ALEN); -- cgit v1.2.3 From 4970b42d5c362bf873982db7d93245c5281e58f4 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 5 Jun 2019 12:27:14 +0800 Subject: Revert "fib_rules: return 0 directly if an exactly same rule exists when NLM_F_EXCL not supplied" This reverts commit e9919a24d3022f72bcadc407e73a6ef17093a849. Nathan reported the new behaviour breaks Android, as Android just add new rules and delete old ones. If we return 0 without adding dup rules, Android will remove the new added rules and causing system to soft-reboot. Fixes: e9919a24d302 ("fib_rules: return 0 directly if an exactly same rule exists when NLM_F_EXCL not supplied") Reported-by: Nathan Chancellor Reported-by: Yaro Slav Reported-by: Maciej Żenczykowski Signed-off-by: Hangbin Liu Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Signed-off-by: David S. Miller --- net/core/fib_rules.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 43f0115cce9c..18f8dd8329ed 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -757,9 +757,9 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) goto errout; - if (rule_exists(ops, frh, tb, rule)) { - if (nlh->nlmsg_flags & NLM_F_EXCL) - err = -EEXIST; + if ((nlh->nlmsg_flags & NLM_F_EXCL) && + rule_exists(ops, frh, tb, rule)) { + err = -EEXIST; goto errout_free; } -- cgit v1.2.3 From 59e3e4b52663a9d97efbce7307f62e4bc5c9ce91 Mon Sep 17 00:00:00 2001 From: Olivier Matz Date: Thu, 6 Jun 2019 09:15:18 +0200 Subject: ipv6: use READ_ONCE() for inet->hdrincl as in ipv4 As it was done in commit 8f659a03a0ba ("net: ipv4: fix for a race condition in raw_sendmsg") and commit 20b50d79974e ("net: ipv4: emulate READ_ONCE() on ->hdrincl bit-field in raw_sendmsg()") for ipv4, copy the value of inet->hdrincl in a local variable, to avoid introducing a race condition in the next commit. Signed-off-by: Olivier Matz Signed-off-by: David S. Miller --- net/ipv6/raw.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 96a3559f2a09..af2f9a833c4e 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -783,6 +783,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct flowi6 fl6; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; + int hdrincl; u16 proto; int err; @@ -796,6 +797,13 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; + /* hdrincl should be READ_ONCE(inet->hdrincl) + * but READ_ONCE() doesn't work with bit fields. + * Doing this indirectly yields the same result. + */ + hdrincl = inet->hdrincl; + hdrincl = READ_ONCE(hdrincl); + /* * Get and verify the address. */ @@ -908,7 +916,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_oif = np->ucast_oif; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - if (inet->hdrincl) + if (hdrincl) fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; if (ipc6.tclass < 0) @@ -931,7 +939,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto do_confirm; back_from_confirm: - if (inet->hdrincl) + if (hdrincl) err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, msg->msg_flags, &ipc6.sockc); else { -- cgit v1.2.3 From b9aa52c4cb457e7416cc0c95f475e72ef4a61336 Mon Sep 17 00:00:00 2001 From: Olivier Matz Date: Thu, 6 Jun 2019 09:15:19 +0200 Subject: ipv6: fix EFAULT on sendto with icmpv6 and hdrincl The following code returns EFAULT (Bad address): s = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6); setsockopt(s, SOL_IPV6, IPV6_HDRINCL, 1); sendto(ipv6_icmp6_packet, addr); /* returns -1, errno = EFAULT */ The IPv4 equivalent code works. A workaround is to use IPPROTO_RAW instead of IPPROTO_ICMPV6. The failure happens because 2 bytes are eaten from the msghdr by rawv6_probe_proto_opt() starting from commit 19e3c66b52ca ("ipv6 equivalent of "ipv4: Avoid reading user iov twice after raw_probe_proto_opt""), but at that time it was not a problem because IPV6_HDRINCL was not yet introduced. Only eat these 2 bytes if hdrincl == 0. Fixes: 715f504b1189 ("ipv6: add IPV6_HDRINCL option for raw sockets") Signed-off-by: Olivier Matz Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv6/raw.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index af2f9a833c4e..1bb88b4b677b 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -895,11 +895,14 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) opt = ipv6_fixup_options(&opt_space, opt); fl6.flowi6_proto = proto; - rfv.msg = msg; - rfv.hlen = 0; - err = rawv6_probe_proto_opt(&rfv, &fl6); - if (err) - goto out; + + if (!hdrincl) { + rfv.msg = msg; + rfv.hlen = 0; + err = rawv6_probe_proto_opt(&rfv, &fl6); + if (err) + goto out; + } if (!ipv6_addr_any(daddr)) fl6.daddr = *daddr; -- cgit v1.2.3 From 85cb928787eab6a2f4ca9d2a798b6f3bed53ced1 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Thu, 6 Jun 2019 04:00:03 -0400 Subject: net: rds: fix memory leak in rds_ib_flush_mr_pool When the following tests last for several hours, the problem will occur. Server: rds-stress -r 1.1.1.16 -D 1M Client: rds-stress -r 1.1.1.14 -s 1.1.1.16 -D 1M -T 30 The following will occur. " Starting up.... tsks tx/s rx/s tx+rx K/s mbi K/s mbo K/s tx us/c rtt us cpu % 1 0 0 0.00 0.00 0.00 0.00 0.00 -1.00 1 0 0 0.00 0.00 0.00 0.00 0.00 -1.00 1 0 0 0.00 0.00 0.00 0.00 0.00 -1.00 1 0 0 0.00 0.00 0.00 0.00 0.00 -1.00 " >From vmcore, we can find that clean_list is NULL. >From the source code, rds_mr_flushd calls rds_ib_mr_pool_flush_worker. Then rds_ib_mr_pool_flush_worker calls " rds_ib_flush_mr_pool(pool, 0, NULL); " Then in function " int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **ibmr_ret) " ibmr_ret is NULL. In the source code, " ... list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail); if (ibmr_ret) *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode); /* more than one entry in llist nodes */ if (clean_nodes->next) llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list); ... " When ibmr_ret is NULL, llist_entry is not executed. clean_nodes->next instead of clean_nodes is added in clean_list. So clean_nodes is discarded. It can not be used again. The workqueue is executed periodically. So more and more clean_nodes are discarded. Finally the clean_list is NULL. Then this problem will occur. Fixes: 1bc144b62524 ("net, rds, Replace xlist in net/rds/xlist.h with llist") Signed-off-by: Zhu Yanjun Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index d664e9ade74d..0b347f46b2f4 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -428,12 +428,14 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, wait_clean_list_grace(); list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail); - if (ibmr_ret) + if (ibmr_ret) { *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode); - + clean_nodes = clean_nodes->next; + } /* more than one entry in llist nodes */ - if (clean_nodes->next) - llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list); + if (clean_nodes) + llist_add_batch(clean_nodes, clean_tail, + &pool->clean_list); } -- cgit v1.2.3 From d37acd5aa99c57505b64913e0e2624ec3daed8c5 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 6 Jun 2019 10:42:56 +0200 Subject: net: mvpp2: Use strscpy to handle stat strings Use a safe strscpy call to copy the ethtool stat strings into the relevant buffers, instead of a memcpy that will be accessing out-of-bound data. Fixes: 118d6298f6f0 ("net: mvpp2: add ethtool GOP statistics") Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 7a67e23a2c2b..d8e5241097a9 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -1304,8 +1304,8 @@ static void mvpp2_ethtool_get_strings(struct net_device *netdev, u32 sset, int i; for (i = 0; i < ARRAY_SIZE(mvpp2_ethtool_regs); i++) - memcpy(data + i * ETH_GSTRING_LEN, - &mvpp2_ethtool_regs[i].string, ETH_GSTRING_LEN); + strscpy(data + i * ETH_GSTRING_LEN, + mvpp2_ethtool_regs[i].string, ETH_GSTRING_LEN); } } -- cgit v1.2.3 From 720f1de4021f09898b8c8443f3b3e995991b6e3a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 6 Jun 2019 15:45:03 +0200 Subject: pktgen: do not sleep with the thread lock held. Currently, the process issuing a "start" command on the pktgen procfs interface, acquires the pktgen thread lock and never release it, until all pktgen threads are completed. The above can blocks indefinitely any other pktgen command and any (even unrelated) netdevice removal - as the pktgen netdev notifier acquires the same lock. The issue is demonstrated by the following script, reported by Matteo: ip -b - <<'EOF' link add type dummy link add type veth link set dummy0 up EOF modprobe pktgen echo reset >/proc/net/pktgen/pgctrl { echo rem_device_all echo add_device dummy0 } >/proc/net/pktgen/kpktgend_0 echo count 0 >/proc/net/pktgen/dummy0 echo start >/proc/net/pktgen/pgctrl & sleep 1 rmmod veth Fix the above releasing the thread lock around the sleep call. Additionally we must prevent racing with forcefull rmmod - as the thread lock no more protects from them. Instead, acquire a self-reference before waiting for any thread. As a side effect, running rmmod pktgen while some thread is running now fails with "module in use" error, before this patch such command hanged indefinitely. Note: the issue predates the commit reported in the fixes tag, but this fix can't be applied before the mentioned commit. v1 -> v2: - no need to check for thread existence after flipping the lock, pktgen threads are freed only at net exit time - Fixes: 6146e6a43b35 ("[PKTGEN]: Removes thread_{un,}lock() macros.") Reported-and-tested-by: Matteo Croce Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/core/pktgen.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 319ad5490fb3..a33d03c05ed6 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3066,7 +3066,13 @@ static int pktgen_wait_thread_run(struct pktgen_thread *t) { while (thread_is_running(t)) { + /* note: 't' will still be around even after the unlock/lock + * cycle because pktgen_thread threads are only cleared at + * net exit + */ + mutex_unlock(&pktgen_thread_lock); msleep_interruptible(100); + mutex_lock(&pktgen_thread_lock); if (signal_pending(current)) goto signal; @@ -3081,6 +3087,10 @@ static int pktgen_wait_all_threads_run(struct pktgen_net *pn) struct pktgen_thread *t; int sig = 1; + /* prevent from racing with rmmod */ + if (!try_module_get(THIS_MODULE)) + return sig; + mutex_lock(&pktgen_thread_lock); list_for_each_entry(t, &pn->pktgen_threads, th_list) { @@ -3094,6 +3104,7 @@ static int pktgen_wait_all_threads_run(struct pktgen_net *pn) t->control |= (T_STOP); mutex_unlock(&pktgen_thread_lock); + module_put(THIS_MODULE); return sig; } -- cgit v1.2.3