Diffstat (limited to 'net')
228 files changed, 6007 insertions, 3570 deletions
diff --git a/net/6lowpan/6lowpan_i.h b/net/6lowpan/6lowpan_i.h index d16bb4b14aa1..97ecc27aeca6 100644 --- a/net/6lowpan/6lowpan_i.h +++ b/net/6lowpan/6lowpan_i.h @@ -3,6 +3,15 @@ #include <linux/netdevice.h> +#include <net/6lowpan.h> + +/* caller need to be sure it's dev->type is ARPHRD_6LOWPAN */ +static inline bool lowpan_is_ll(const struct net_device *dev, + enum lowpan_lltypes lltype) +{ + return lowpan_dev(dev)->lltype == lltype; +} + #ifdef CONFIG_6LOWPAN_DEBUGFS int lowpan_dev_debugfs_init(struct net_device *dev); void lowpan_dev_debugfs_exit(struct net_device *dev); diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index 34e44c0c0836..7a240b3eaed1 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -27,11 +27,11 @@ int lowpan_register_netdevice(struct net_device *dev, dev->mtu = IPV6_MIN_MTU; dev->priv_flags |= IFF_NO_QUEUE; - lowpan_priv(dev)->lltype = lltype; + lowpan_dev(dev)->lltype = lltype; - spin_lock_init(&lowpan_priv(dev)->ctx.lock); + spin_lock_init(&lowpan_dev(dev)->ctx.lock); for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) - lowpan_priv(dev)->ctx.table[i].id = i; + lowpan_dev(dev)->ctx.table[i].id = i; ret = register_netdevice(dev); if (ret < 0) @@ -85,7 +85,7 @@ static int lowpan_event(struct notifier_block *unused, case NETDEV_DOWN: for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, - &lowpan_priv(dev)->ctx.table[i].flags); + &lowpan_dev(dev)->ctx.table[i].flags); break; default: return NOTIFY_DONE; diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c index 0793a8157472..acbaa3db493b 100644 --- a/net/6lowpan/debugfs.c +++ b/net/6lowpan/debugfs.c @@ -172,7 +172,7 @@ static const struct file_operations lowpan_ctx_pfx_fops = { static int lowpan_dev_debugfs_ctx_init(struct net_device *dev, struct dentry *ctx, u8 id) { - struct lowpan_priv *lpriv = lowpan_priv(dev); + struct lowpan_dev *ldev = lowpan_dev(dev); struct dentry *dentry, *root; char buf[32]; @@ -185,25 +185,25 @@ static int lowpan_dev_debugfs_ctx_init(struct net_device *dev, return -EINVAL; dentry = debugfs_create_file("active", 0644, root, - &lpriv->ctx.table[id], + &ldev->ctx.table[id], &lowpan_ctx_flag_active_fops); if (!dentry) return -EINVAL; dentry = debugfs_create_file("compression", 0644, root, - &lpriv->ctx.table[id], + &ldev->ctx.table[id], &lowpan_ctx_flag_c_fops); if (!dentry) return -EINVAL; dentry = debugfs_create_file("prefix", 0644, root, - &lpriv->ctx.table[id], + &ldev->ctx.table[id], &lowpan_ctx_pfx_fops); if (!dentry) return -EINVAL; dentry = debugfs_create_file("prefix_len", 0644, root, - &lpriv->ctx.table[id], + &ldev->ctx.table[id], &lowpan_ctx_plen_fops); if (!dentry) return -EINVAL; @@ -247,21 +247,21 @@ static const struct file_operations lowpan_context_fops = { int lowpan_dev_debugfs_init(struct net_device *dev) { - struct lowpan_priv *lpriv = lowpan_priv(dev); + struct lowpan_dev *ldev = lowpan_dev(dev); struct dentry *contexts, *dentry; int ret, i; /* creating the root */ - lpriv->iface_debugfs = debugfs_create_dir(dev->name, lowpan_debugfs); - if (!lpriv->iface_debugfs) + ldev->iface_debugfs = debugfs_create_dir(dev->name, lowpan_debugfs); + if (!ldev->iface_debugfs) goto fail; - contexts = debugfs_create_dir("contexts", lpriv->iface_debugfs); + contexts = debugfs_create_dir("contexts", ldev->iface_debugfs); if (!contexts) goto remove_root; dentry = debugfs_create_file("show", 0644, contexts, - &lowpan_priv(dev)->ctx, + &lowpan_dev(dev)->ctx, &lowpan_context_fops); if (!dentry) goto remove_root; @@ -282,7 +282,7 @@ fail: void 
lowpan_dev_debugfs_exit(struct net_device *dev) { - debugfs_remove_recursive(lowpan_priv(dev)->iface_debugfs); + debugfs_remove_recursive(lowpan_dev(dev)->iface_debugfs); } int __init lowpan_debugfs_init(void) diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c index 68c80f3c9add..8501dd532fe1 100644 --- a/net/6lowpan/iphc.c +++ b/net/6lowpan/iphc.c @@ -53,9 +53,6 @@ #include <net/6lowpan.h> #include <net/ipv6.h> -/* special link-layer handling */ -#include <net/mac802154.h> - #include "6lowpan_i.h" #include "nhc.h" @@ -156,32 +153,17 @@ #define LOWPAN_IPHC_CID_DCI(cid) (cid & 0x0f) #define LOWPAN_IPHC_CID_SCI(cid) ((cid & 0xf0) >> 4) -static inline void iphc_uncompress_eui64_lladdr(struct in6_addr *ipaddr, - const void *lladdr) -{ - /* fe:80::XXXX:XXXX:XXXX:XXXX - * \_________________/ - * hwaddr - */ - ipaddr->s6_addr[0] = 0xFE; - ipaddr->s6_addr[1] = 0x80; - memcpy(&ipaddr->s6_addr[8], lladdr, EUI64_ADDR_LEN); - /* second bit-flip (Universe/Local) - * is done according RFC2464 - */ - ipaddr->s6_addr[8] ^= 0x02; -} - -static inline void iphc_uncompress_802154_lladdr(struct in6_addr *ipaddr, - const void *lladdr) +static inline void +lowpan_iphc_uncompress_802154_lladdr(struct in6_addr *ipaddr, + const void *lladdr) { const struct ieee802154_addr *addr = lladdr; - u8 eui64[EUI64_ADDR_LEN] = { }; + u8 eui64[EUI64_ADDR_LEN]; switch (addr->mode) { case IEEE802154_ADDR_LONG: ieee802154_le64_to_be64(eui64, &addr->extended_addr); - iphc_uncompress_eui64_lladdr(ipaddr, eui64); + lowpan_iphc_uncompress_eui64_lladdr(ipaddr, eui64); break; case IEEE802154_ADDR_SHORT: /* fe:80::ff:fe00:XXXX @@ -207,7 +189,7 @@ static inline void iphc_uncompress_802154_lladdr(struct in6_addr *ipaddr, static struct lowpan_iphc_ctx * lowpan_iphc_ctx_get_by_id(const struct net_device *dev, u8 id) { - struct lowpan_iphc_ctx *ret = &lowpan_priv(dev)->ctx.table[id]; + struct lowpan_iphc_ctx *ret = &lowpan_dev(dev)->ctx.table[id]; if (!lowpan_iphc_ctx_is_active(ret)) return NULL; @@ -219,7 +201,7 @@ static struct lowpan_iphc_ctx * lowpan_iphc_ctx_get_by_addr(const struct net_device *dev, const struct in6_addr *addr) { - struct lowpan_iphc_ctx *table = lowpan_priv(dev)->ctx.table; + struct lowpan_iphc_ctx *table = lowpan_dev(dev)->ctx.table; struct lowpan_iphc_ctx *ret = NULL; struct in6_addr addr_pfx; u8 addr_plen; @@ -263,7 +245,7 @@ static struct lowpan_iphc_ctx * lowpan_iphc_ctx_get_by_mcast_addr(const struct net_device *dev, const struct in6_addr *addr) { - struct lowpan_iphc_ctx *table = lowpan_priv(dev)->ctx.table; + struct lowpan_iphc_ctx *table = lowpan_dev(dev)->ctx.table; struct lowpan_iphc_ctx *ret = NULL; struct in6_addr addr_mcast, network_pfx = {}; int i; @@ -301,9 +283,10 @@ lowpan_iphc_ctx_get_by_mcast_addr(const struct net_device *dev, * * address_mode is the masked value for sam or dam value */ -static int uncompress_addr(struct sk_buff *skb, const struct net_device *dev, - struct in6_addr *ipaddr, u8 address_mode, - const void *lladdr) +static int lowpan_iphc_uncompress_addr(struct sk_buff *skb, + const struct net_device *dev, + struct in6_addr *ipaddr, + u8 address_mode, const void *lladdr) { bool fail; @@ -332,12 +315,12 @@ static int uncompress_addr(struct sk_buff *skb, const struct net_device *dev, case LOWPAN_IPHC_SAM_11: case LOWPAN_IPHC_DAM_11: fail = false; - switch (lowpan_priv(dev)->lltype) { + switch (lowpan_dev(dev)->lltype) { case LOWPAN_LLTYPE_IEEE802154: - iphc_uncompress_802154_lladdr(ipaddr, lladdr); + lowpan_iphc_uncompress_802154_lladdr(ipaddr, lladdr); break; default: - 
iphc_uncompress_eui64_lladdr(ipaddr, lladdr); + lowpan_iphc_uncompress_eui64_lladdr(ipaddr, lladdr); break; } break; @@ -360,11 +343,11 @@ static int uncompress_addr(struct sk_buff *skb, const struct net_device *dev, /* Uncompress address function for source context * based address(non-multicast). */ -static int uncompress_ctx_addr(struct sk_buff *skb, - const struct net_device *dev, - const struct lowpan_iphc_ctx *ctx, - struct in6_addr *ipaddr, u8 address_mode, - const void *lladdr) +static int lowpan_iphc_uncompress_ctx_addr(struct sk_buff *skb, + const struct net_device *dev, + const struct lowpan_iphc_ctx *ctx, + struct in6_addr *ipaddr, + u8 address_mode, const void *lladdr) { bool fail; @@ -393,12 +376,12 @@ static int uncompress_ctx_addr(struct sk_buff *skb, case LOWPAN_IPHC_SAM_11: case LOWPAN_IPHC_DAM_11: fail = false; - switch (lowpan_priv(dev)->lltype) { + switch (lowpan_dev(dev)->lltype) { case LOWPAN_LLTYPE_IEEE802154: - iphc_uncompress_802154_lladdr(ipaddr, lladdr); + lowpan_iphc_uncompress_802154_lladdr(ipaddr, lladdr); break; default: - iphc_uncompress_eui64_lladdr(ipaddr, lladdr); + lowpan_iphc_uncompress_eui64_lladdr(ipaddr, lladdr); break; } ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen); @@ -657,22 +640,24 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, } if (iphc1 & LOWPAN_IPHC_SAC) { - spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + spin_lock_bh(&lowpan_dev(dev)->ctx.lock); ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_SCI(cid)); if (!ci) { - spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + spin_unlock_bh(&lowpan_dev(dev)->ctx.lock); return -EINVAL; } pr_debug("SAC bit is set. Handle context based source address.\n"); - err = uncompress_ctx_addr(skb, dev, ci, &hdr.saddr, - iphc1 & LOWPAN_IPHC_SAM_MASK, saddr); - spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + err = lowpan_iphc_uncompress_ctx_addr(skb, dev, ci, &hdr.saddr, + iphc1 & LOWPAN_IPHC_SAM_MASK, + saddr); + spin_unlock_bh(&lowpan_dev(dev)->ctx.lock); } else { /* Source address uncompression */ pr_debug("source address stateless compression\n"); - err = uncompress_addr(skb, dev, &hdr.saddr, - iphc1 & LOWPAN_IPHC_SAM_MASK, saddr); + err = lowpan_iphc_uncompress_addr(skb, dev, &hdr.saddr, + iphc1 & LOWPAN_IPHC_SAM_MASK, + saddr); } /* Check on error of previous branch */ @@ -681,10 +666,10 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, switch (iphc1 & (LOWPAN_IPHC_M | LOWPAN_IPHC_DAC)) { case LOWPAN_IPHC_M | LOWPAN_IPHC_DAC: - spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + spin_lock_bh(&lowpan_dev(dev)->ctx.lock); ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid)); if (!ci) { - spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + spin_unlock_bh(&lowpan_dev(dev)->ctx.lock); return -EINVAL; } @@ -693,7 +678,7 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, err = lowpan_uncompress_multicast_ctx_daddr(skb, ci, &hdr.daddr, iphc1 & LOWPAN_IPHC_DAM_MASK); - spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + spin_unlock_bh(&lowpan_dev(dev)->ctx.lock); break; case LOWPAN_IPHC_M: /* multicast */ @@ -701,22 +686,24 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, iphc1 & LOWPAN_IPHC_DAM_MASK); break; case LOWPAN_IPHC_DAC: - spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + spin_lock_bh(&lowpan_dev(dev)->ctx.lock); ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid)); if (!ci) { - spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + spin_unlock_bh(&lowpan_dev(dev)->ctx.lock); return 
-EINVAL; } /* Destination address context based uncompression */ pr_debug("DAC bit is set. Handle context based destination address.\n"); - err = uncompress_ctx_addr(skb, dev, ci, &hdr.daddr, - iphc1 & LOWPAN_IPHC_DAM_MASK, daddr); - spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + err = lowpan_iphc_uncompress_ctx_addr(skb, dev, ci, &hdr.daddr, + iphc1 & LOWPAN_IPHC_DAM_MASK, + daddr); + spin_unlock_bh(&lowpan_dev(dev)->ctx.lock); break; default: - err = uncompress_addr(skb, dev, &hdr.daddr, - iphc1 & LOWPAN_IPHC_DAM_MASK, daddr); + err = lowpan_iphc_uncompress_addr(skb, dev, &hdr.daddr, + iphc1 & LOWPAN_IPHC_DAM_MASK, + daddr); pr_debug("dest: stateless compression mode %d dest %pI6c\n", iphc1 & LOWPAN_IPHC_DAM_MASK, &hdr.daddr); break; @@ -736,7 +723,7 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, return err; } - switch (lowpan_priv(dev)->lltype) { + switch (lowpan_dev(dev)->lltype) { case LOWPAN_LLTYPE_IEEE802154: if (lowpan_802154_cb(skb)->d_size) hdr.payload_len = htons(lowpan_802154_cb(skb)->d_size - @@ -1033,7 +1020,7 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, skb->data, skb->len); ipv6_daddr_type = ipv6_addr_type(&hdr->daddr); - spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + spin_lock_bh(&lowpan_dev(dev)->ctx.lock); if (ipv6_daddr_type & IPV6_ADDR_MULTICAST) dci = lowpan_iphc_ctx_get_by_mcast_addr(dev, &hdr->daddr); else @@ -1042,15 +1029,15 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, memcpy(&dci_entry, dci, sizeof(*dci)); cid |= dci->id; } - spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + spin_unlock_bh(&lowpan_dev(dev)->ctx.lock); - spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + spin_lock_bh(&lowpan_dev(dev)->ctx.lock); sci = lowpan_iphc_ctx_get_by_addr(dev, &hdr->saddr); if (sci) { memcpy(&sci_entry, sci, sizeof(*sci)); cid |= (sci->id << 4); } - spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + spin_unlock_bh(&lowpan_dev(dev)->ctx.lock); /* if cid is zero it will be compressed */ if (cid) { diff --git a/net/6lowpan/nhc_udp.c b/net/6lowpan/nhc_udp.c index 69537a2eaab1..225d91906dfa 100644 --- a/net/6lowpan/nhc_udp.c +++ b/net/6lowpan/nhc_udp.c @@ -91,7 +91,7 @@ static int udp_uncompress(struct sk_buff *skb, size_t needed) * here, we obtain the hint from the remaining size of the * frame */ - switch (lowpan_priv(skb->dev)->lltype) { + switch (lowpan_dev(skb->dev)->lltype) { case LOWPAN_LLTYPE_IEEE802154: if (lowpan_802154_cb(skb)->d_size) uh.len = htons(lowpan_802154_cb(skb)->d_size - diff --git a/net/Kconfig b/net/Kconfig index a8934d8c8fda..b841c42e5c9b 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -236,6 +236,7 @@ source "net/mpls/Kconfig" source "net/hsr/Kconfig" source "net/switchdev/Kconfig" source "net/l3mdev/Kconfig" +source "net/qrtr/Kconfig" config RPS bool diff --git a/net/Makefile b/net/Makefile index 81d14119eab5..bdd14553a774 100644 --- a/net/Makefile +++ b/net/Makefile @@ -78,3 +78,4 @@ endif ifneq ($(CONFIG_NET_L3_MASTER_DEV),) obj-y += l3mdev/ endif +obj-$(CONFIG_QRTR) += qrtr/ diff --git a/net/atm/lec.c b/net/atm/lec.c index cd3b37989057..e574a7e9db6f 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -194,7 +194,7 @@ lec_send(struct atm_vcc *vcc, struct sk_buff *skb) static void lec_tx_timeout(struct net_device *dev) { pr_info("%s\n", dev->name); - dev->trans_start = jiffies; + netif_trans_update(dev); netif_wake_queue(dev); } @@ -324,7 +324,7 @@ static netdev_tx_t lec_start_xmit(struct sk_buff *skb, out: if (entry) lec_arp_put(entry); - dev->trans_start = 
jiffies; + netif_trans_update(dev); return NETDEV_TX_OK; } diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index cb2d1b9b0340..7f98a9d39883 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -32,6 +32,7 @@ #include <linux/jiffies.h> #include <linux/list.h> #include <linux/kref.h> +#include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/pkt_sched.h> #include <linux/printk.h> @@ -175,71 +176,107 @@ unlock: } /** - * batadv_iv_ogm_orig_del_if - change the private structures of the orig_node to - * exclude the removed interface + * batadv_iv_ogm_drop_bcast_own_entry - drop section of bcast_own * @orig_node: the orig_node that has to be changed * @max_if_num: the current amount of interfaces * @del_if_num: the index of the interface being removed - * - * Return: 0 on success, a negative error code otherwise. */ -static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node, - int max_if_num, int del_if_num) +static void +batadv_iv_ogm_drop_bcast_own_entry(struct batadv_orig_node *orig_node, + int max_if_num, int del_if_num) { - int ret = -ENOMEM; - size_t chunk_size, if_offset; - void *data_ptr = NULL; - - spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); + size_t chunk_size; + size_t if_offset; + void *data_ptr; - /* last interface was removed */ - if (max_if_num == 0) - goto free_bcast_own; + lockdep_assert_held(&orig_node->bat_iv.ogm_cnt_lock); chunk_size = sizeof(unsigned long) * BATADV_NUM_WORDS; data_ptr = kmalloc_array(max_if_num, chunk_size, GFP_ATOMIC); if (!data_ptr) - goto unlock; + /* use old buffer when new one could not be allocated */ + data_ptr = orig_node->bat_iv.bcast_own; /* copy first part */ - memcpy(data_ptr, orig_node->bat_iv.bcast_own, del_if_num * chunk_size); + memmove(data_ptr, orig_node->bat_iv.bcast_own, del_if_num * chunk_size); /* copy second part */ if_offset = (del_if_num + 1) * chunk_size; - memcpy((char *)data_ptr + del_if_num * chunk_size, - (uint8_t *)orig_node->bat_iv.bcast_own + if_offset, - (max_if_num - del_if_num) * chunk_size); + memmove((char *)data_ptr + del_if_num * chunk_size, + (uint8_t *)orig_node->bat_iv.bcast_own + if_offset, + (max_if_num - del_if_num) * chunk_size); -free_bcast_own: - kfree(orig_node->bat_iv.bcast_own); - orig_node->bat_iv.bcast_own = data_ptr; + /* bcast_own was shrunk down in new buffer; free old one */ + if (orig_node->bat_iv.bcast_own != data_ptr) { + kfree(orig_node->bat_iv.bcast_own); + orig_node->bat_iv.bcast_own = data_ptr; + } +} + +/** + * batadv_iv_ogm_drop_bcast_own_sum_entry - drop section of bcast_own_sum + * @orig_node: the orig_node that has to be changed + * @max_if_num: the current amount of interfaces + * @del_if_num: the index of the interface being removed + */ +static void +batadv_iv_ogm_drop_bcast_own_sum_entry(struct batadv_orig_node *orig_node, + int max_if_num, int del_if_num) +{ + size_t if_offset; + void *data_ptr; - if (max_if_num == 0) - goto free_own_sum; + lockdep_assert_held(&orig_node->bat_iv.ogm_cnt_lock); data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC); - if (!data_ptr) { - kfree(orig_node->bat_iv.bcast_own); - goto unlock; - } + if (!data_ptr) + /* use old buffer when new one could not be allocated */ + data_ptr = orig_node->bat_iv.bcast_own_sum; - memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum, - del_if_num * sizeof(u8)); + memmove(data_ptr, orig_node->bat_iv.bcast_own_sum, + del_if_num * sizeof(u8)); if_offset = (del_if_num + 1) * sizeof(u8); - memcpy((char *)data_ptr + del_if_num * sizeof(u8), - 
orig_node->bat_iv.bcast_own_sum + if_offset, - (max_if_num - del_if_num) * sizeof(u8)); + memmove((char *)data_ptr + del_if_num * sizeof(u8), + orig_node->bat_iv.bcast_own_sum + if_offset, + (max_if_num - del_if_num) * sizeof(u8)); + + /* bcast_own_sum was shrunk down in new buffer; free old one */ + if (orig_node->bat_iv.bcast_own_sum != data_ptr) { + kfree(orig_node->bat_iv.bcast_own_sum); + orig_node->bat_iv.bcast_own_sum = data_ptr; + } +} -free_own_sum: - kfree(orig_node->bat_iv.bcast_own_sum); - orig_node->bat_iv.bcast_own_sum = data_ptr; +/** + * batadv_iv_ogm_orig_del_if - change the private structures of the orig_node to + * exclude the removed interface + * @orig_node: the orig_node that has to be changed + * @max_if_num: the current amount of interfaces + * @del_if_num: the index of the interface being removed + * + * Return: 0 on success, a negative error code otherwise. + */ +static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node, + int max_if_num, int del_if_num) +{ + spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); + + if (max_if_num == 0) { + kfree(orig_node->bat_iv.bcast_own); + kfree(orig_node->bat_iv.bcast_own_sum); + orig_node->bat_iv.bcast_own = NULL; + orig_node->bat_iv.bcast_own_sum = NULL; + } else { + batadv_iv_ogm_drop_bcast_own_entry(orig_node, max_if_num, + del_if_num); + batadv_iv_ogm_drop_bcast_own_sum_entry(orig_node, max_if_num, + del_if_num); + } - ret = 0; -unlock: spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); - return ret; + return 0; } /** @@ -644,18 +681,12 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, unsigned char *skb_buff; unsigned int skb_size; - if (!kref_get_unless_zero(&if_incoming->refcount)) - return; - - if (!kref_get_unless_zero(&if_outgoing->refcount)) - goto out_free_incoming; - /* own packet should always be scheduled */ if (!own_packet) { if (!batadv_atomic_dec_not_zero(&bat_priv->batman_queue_left)) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "batman packet queue full\n"); - goto out_free_outgoing; + return; } } @@ -681,6 +712,8 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, forw_packet_aggr->packet_len = packet_len; memcpy(skb_buff, packet_buff, packet_len); + kref_get(&if_incoming->refcount); + kref_get(&if_outgoing->refcount); forw_packet_aggr->own = own_packet; forw_packet_aggr->if_incoming = if_incoming; forw_packet_aggr->if_outgoing = if_outgoing; @@ -710,10 +743,6 @@ out_free_forw_packet: out_nomem: if (!own_packet) atomic_inc(&bat_priv->batman_queue_left); -out_free_outgoing: - batadv_hardif_put(if_outgoing); -out_free_incoming: - batadv_hardif_put(if_incoming); } /* aggregate a new packet into the existing ogm packet */ @@ -950,9 +979,15 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) list_for_each_entry_rcu(tmp_hard_iface, &batadv_hardif_list, list) { if (tmp_hard_iface->soft_iface != hard_iface->soft_iface) continue; + + if (!kref_get_unless_zero(&tmp_hard_iface->refcount)) + continue; + batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len, hard_iface, tmp_hard_iface, 1, send_time); + + batadv_hardif_put(tmp_hard_iface); } rcu_read_unlock(); @@ -1133,13 +1168,13 @@ out: * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * - * Return: 1 if the link can be considered bidirectional, 0 otherwise + * Return: true if the link can be considered bidirectional, false otherwise */ -static int batadv_iv_ogm_calc_tq(struct batadv_orig_node 
*orig_node, - struct batadv_orig_node *orig_neigh_node, - struct batadv_ogm_packet *batadv_ogm_packet, - struct batadv_hard_iface *if_incoming, - struct batadv_hard_iface *if_outgoing) +static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, + struct batadv_orig_node *orig_neigh_node, + struct batadv_ogm_packet *batadv_ogm_packet, + struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node; @@ -1147,9 +1182,10 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, u8 total_count; u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; - int tq_asym_penalty, inv_asym_penalty, if_num, ret = 0; + int tq_asym_penalty, inv_asym_penalty, if_num; unsigned int combined_tq; int tq_iface_penalty; + bool ret = false; /* find corresponding one hop neighbor */ rcu_read_lock(); @@ -1261,7 +1297,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, * consider it bidirectional */ if (batadv_ogm_packet->tq >= BATADV_TQ_TOTAL_BIDRECT_LIMIT) - ret = 1; + ret = true; out: if (neigh_node) @@ -1290,9 +1326,9 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, struct batadv_orig_ifinfo *orig_ifinfo = NULL; struct batadv_neigh_node *neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; - int is_dup; + bool is_dup; s32 seq_diff; - int need_update = 0; + bool need_update = false; int set_mark; enum batadv_dup_status ret = BATADV_NO_DUP; u32 seqno = ntohl(batadv_ogm_packet->seqno); @@ -1402,7 +1438,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, struct sk_buff *skb_priv; struct ethhdr *ethhdr; u8 *prev_sender; - int is_bidirect; + bool is_bidirect; /* create a private copy of the skb, as some functions change tq value * and/or flags. @@ -1730,8 +1766,13 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, if (hard_iface->soft_iface != bat_priv->soft_iface) continue; + if (!kref_get_unless_zero(&hard_iface->refcount)) + continue; + batadv_iv_ogm_process_per_outif(skb, ogm_offset, orig_node, if_incoming, hard_iface); + + batadv_hardif_put(hard_iface); } rcu_read_unlock(); @@ -1829,9 +1870,8 @@ static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv, int batman_count = 0; u32 i; - seq_printf(seq, " %-15s %s (%s/%i) %17s [%10s]: %20s ...\n", - "Originator", "last-seen", "#", BATADV_TQ_MAX_VALUE, - "Nexthop", "outgoingIF", "Potential nexthops"); + seq_puts(seq, + " Originator last-seen (#/255) Nexthop [outgoingIF]: Potential nexthops ...\n"); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -1911,8 +1951,7 @@ static void batadv_iv_neigh_print(struct batadv_priv *bat_priv, struct batadv_hard_iface *hard_iface; int batman_count = 0; - seq_printf(seq, " %10s %-13s %s\n", - "IF", "Neighbor", "last-seen"); + seq_puts(seq, " IF Neighbor last-seen\n"); rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 3315b9a598af..3ff8bd1b7bdc 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -32,10 +32,21 @@ #include "bat_v_elp.h" #include "bat_v_ogm.h" +#include "hard-interface.h" #include "hash.h" #include "originator.h" #include "packet.h" +static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface) +{ + /* B.A.T.M.A.N. 
V does not use any queuing mechanism, therefore it can + * set the interface as ACTIVE right away, without any risk of race + * condition + */ + if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED) + hard_iface->if_status = BATADV_IF_ACTIVE; +} + static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface) { int ret; @@ -151,8 +162,8 @@ static void batadv_v_neigh_print(struct batadv_priv *bat_priv, struct batadv_hard_iface *hard_iface; int batman_count = 0; - seq_printf(seq, " %-15s %s (%11s) [%10s]\n", "Neighbor", - "last-seen", "throughput", "IF"); + seq_puts(seq, + " Neighbor last-seen ( throughput) [ IF]\n"); rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { @@ -191,9 +202,8 @@ static void batadv_v_orig_print(struct batadv_priv *bat_priv, int batman_count = 0; u32 i; - seq_printf(seq, " %-15s %s (%11s) %17s [%10s]: %20s ...\n", - "Originator", "last-seen", "throughput", "Nexthop", - "outgoingIF", "Potential nexthops"); + seq_puts(seq, + " Originator last-seen ( throughput) Nexthop [outgoingIF]: Potential nexthops ...\n"); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -274,6 +284,7 @@ static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1, static struct batadv_algo_ops batadv_batman_v __read_mostly = { .name = "BATMAN_V", + .bat_iface_activate = batadv_v_iface_activate, .bat_iface_enable = batadv_v_iface_enable, .bat_iface_disable = batadv_v_iface_disable, .bat_iface_update_mac = batadv_v_iface_update_mac, diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index d9bcbe6e7d65..473ebb9a0e73 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -26,6 +26,7 @@ #include <linux/if_ether.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/random.h> @@ -176,6 +177,9 @@ static void batadv_v_ogm_send(struct work_struct *work) if (hard_iface->soft_iface != bat_priv->soft_iface) continue; + if (!kref_get_unless_zero(&hard_iface->refcount)) + continue; + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Sending own OGM2 packet (originator %pM, seqno %u, throughput %u, TTL %d) on interface %s [%pM]\n", ogm_packet->orig, ntohl(ogm_packet->seqno), @@ -185,10 +189,13 @@ static void batadv_v_ogm_send(struct work_struct *work) /* this skb gets consumed by batadv_v_ogm_send_to_if() */ skb_tmp = skb_clone(skb, GFP_ATOMIC); - if (!skb_tmp) + if (!skb_tmp) { + batadv_hardif_put(hard_iface); break; + } batadv_v_ogm_send_to_if(skb_tmp, hard_iface); + batadv_hardif_put(hard_iface); } rcu_read_unlock(); @@ -234,73 +241,6 @@ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface) } /** - * batadv_v_ogm_orig_update - update the originator status based on the received - * OGM - * @bat_priv: the bat priv with all the soft interface information - * @orig_node: the originator to update - * @neigh_node: the neighbour the OGM has been received from (to update) - * @ogm2: the received OGM - * @if_outgoing: the interface where this OGM is going to be forwarded through - */ -static void -batadv_v_ogm_orig_update(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_neigh_node *neigh_node, - const struct batadv_ogm2_packet *ogm2, - struct batadv_hard_iface *if_outgoing) -{ - struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL; - struct batadv_neigh_node *router = NULL; - s32 neigh_seq_diff; - u32 neigh_last_seqno; - u32 router_last_seqno; - u32 
router_throughput, neigh_throughput; - - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Searching and updating originator entry of received packet\n"); - - /* if this neighbor already is our next hop there is nothing - * to change - */ - router = batadv_orig_router_get(orig_node, if_outgoing); - if (router == neigh_node) - goto out; - - /* don't consider neighbours with worse throughput. - * also switch route if this seqno is BATADV_V_MAX_ORIGDIFF newer than - * the last received seqno from our best next hop. - */ - if (router) { - router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); - neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); - - /* if these are not allocated, something is wrong. */ - if (!router_ifinfo || !neigh_ifinfo) - goto out; - - neigh_last_seqno = neigh_ifinfo->bat_v.last_seqno; - router_last_seqno = router_ifinfo->bat_v.last_seqno; - neigh_seq_diff = neigh_last_seqno - router_last_seqno; - router_throughput = router_ifinfo->bat_v.throughput; - neigh_throughput = neigh_ifinfo->bat_v.throughput; - - if ((neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF) && - (router_throughput >= neigh_throughput)) - goto out; - } - - batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node); - -out: - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); - if (neigh_ifinfo) - batadv_neigh_ifinfo_put(neigh_ifinfo); - if (router) - batadv_neigh_node_put(router); -} - -/** * batadv_v_forward_penalty - apply a penalty to the throughput metric forwarded * with B.A.T.M.A.N. V OGMs * @bat_priv: the bat priv with all the soft interface information @@ -347,10 +287,12 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, } /** - * batadv_v_ogm_forward - forward an OGM to the given outgoing interface + * batadv_v_ogm_forward - check conditions and forward an OGM to the given + * outgoing interface * @bat_priv: the bat priv with all the soft interface information * @ogm_received: previously received OGM to be forwarded - * @throughput: throughput to announce, may vary per outgoing interface + * @orig_node: the originator which has been updated + * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface on which this OGM was received on * @if_outgoing: the interface to which the OGM has to be forwarded to * @@ -359,28 +301,57 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, */ static void batadv_v_ogm_forward(struct batadv_priv *bat_priv, const struct batadv_ogm2_packet *ogm_received, - u32 throughput, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { + struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; + struct batadv_orig_ifinfo *orig_ifinfo = NULL; + struct batadv_neigh_node *router = NULL; struct batadv_ogm2_packet *ogm_forward; unsigned char *skb_buff; struct sk_buff *skb; size_t packet_len; u16 tvlv_len; + /* only forward for specific interfaces, not for the default one. 
*/ + if (if_outgoing == BATADV_IF_DEFAULT) + goto out; + + orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); + if (!orig_ifinfo) + goto out; + + /* acquire possibly updated router */ + router = batadv_orig_router_get(orig_node, if_outgoing); + + /* strict rule: forward packets coming from the best next hop only */ + if (neigh_node != router) + goto out; + + /* don't forward the same seqno twice on one interface */ + if (orig_ifinfo->last_seqno_forwarded == ntohl(ogm_received->seqno)) + goto out; + + orig_ifinfo->last_seqno_forwarded = ntohl(ogm_received->seqno); + if (ogm_received->ttl <= 1) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n"); - return; + goto out; } + neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); + if (!neigh_ifinfo) + goto out; + tvlv_len = ntohs(ogm_received->tvlv_len); packet_len = BATADV_OGM2_HLEN + tvlv_len; skb = netdev_alloc_skb_ip_align(if_outgoing->net_dev, ETH_HLEN + packet_len); if (!skb) - return; + goto out; skb_reserve(skb, ETH_HLEN); skb_buff = skb_put(skb, packet_len); @@ -388,15 +359,23 @@ static void batadv_v_ogm_forward(struct batadv_priv *bat_priv, /* apply forward penalty */ ogm_forward = (struct batadv_ogm2_packet *)skb_buff; - ogm_forward->throughput = htonl(throughput); + ogm_forward->throughput = htonl(neigh_ifinfo->bat_v.throughput); ogm_forward->ttl--; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding OGM2 packet on %s: throughput %u, ttl %u, received via %s\n", - if_outgoing->net_dev->name, throughput, ogm_forward->ttl, - if_incoming->net_dev->name); + if_outgoing->net_dev->name, ntohl(ogm_forward->throughput), + ogm_forward->ttl, if_incoming->net_dev->name); batadv_v_ogm_send_to_if(skb, if_outgoing); + +out: + if (orig_ifinfo) + batadv_orig_ifinfo_put(orig_ifinfo); + if (router) + batadv_neigh_node_put(router); + if (neigh_ifinfo) + batadv_neigh_ifinfo_put(neigh_ifinfo); } /** @@ -493,8 +472,10 @@ out: * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface where this packet was received * @if_outgoing: the interface for which the packet should be considered + * + * Return: true if the packet should be forwarded, false otherwise */ -static void batadv_v_ogm_route_update(struct batadv_priv *bat_priv, +static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv, const struct ethhdr *ethhdr, const struct batadv_ogm2_packet *ogm2, struct batadv_orig_node *orig_node, @@ -503,14 +484,14 @@ static void batadv_v_ogm_route_update(struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_node *router = NULL; - struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; struct batadv_orig_node *orig_neigh_node = NULL; - struct batadv_orig_ifinfo *orig_ifinfo = NULL; struct batadv_neigh_node *orig_neigh_router = NULL; - - neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); - if (!neigh_ifinfo) - goto out; + struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL; + u32 router_throughput, neigh_throughput; + u32 router_last_seqno; + u32 neigh_last_seqno; + s32 neigh_seq_diff; + bool forward = false; orig_neigh_node = batadv_v_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_neigh_node) @@ -529,47 +510,57 @@ static void batadv_v_ogm_route_update(struct batadv_priv *bat_priv, goto out; } - if (router) - batadv_neigh_node_put(router); + /* Mark the OGM to be considered for forwarding, and update routes + * if needed. 
+ */ + forward = true; - /* Update routes, and check if the OGM is from the best next hop */ - batadv_v_ogm_orig_update(bat_priv, orig_node, neigh_node, ogm2, - if_outgoing); + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Searching and updating originator entry of received packet\n"); - orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); - if (!orig_ifinfo) + /* if this neighbor already is our next hop there is nothing + * to change + */ + if (router == neigh_node) goto out; - /* don't forward the same seqno twice on one interface */ - if (orig_ifinfo->last_seqno_forwarded == ntohl(ogm2->seqno)) - goto out; + /* don't consider neighbours with worse throughput. + * also switch route if this seqno is BATADV_V_MAX_ORIGDIFF newer than + * the last received seqno from our best next hop. + */ + if (router) { + router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); + neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); - /* acquire possibly updated router */ - router = batadv_orig_router_get(orig_node, if_outgoing); + /* if these are not allocated, something is wrong. */ + if (!router_ifinfo || !neigh_ifinfo) + goto out; - /* strict rule: forward packets coming from the best next hop only */ - if (neigh_node != router) - goto out; + neigh_last_seqno = neigh_ifinfo->bat_v.last_seqno; + router_last_seqno = router_ifinfo->bat_v.last_seqno; + neigh_seq_diff = neigh_last_seqno - router_last_seqno; + router_throughput = router_ifinfo->bat_v.throughput; + neigh_throughput = neigh_ifinfo->bat_v.throughput; - /* only forward for specific interface, not for the default one. */ - if (if_outgoing != BATADV_IF_DEFAULT) { - orig_ifinfo->last_seqno_forwarded = ntohl(ogm2->seqno); - batadv_v_ogm_forward(bat_priv, ogm2, - neigh_ifinfo->bat_v.throughput, - if_incoming, if_outgoing); + if ((neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF) && + (router_throughput >= neigh_throughput)) + goto out; } + batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node); out: - if (orig_ifinfo) - batadv_orig_ifinfo_put(orig_ifinfo); if (router) batadv_neigh_node_put(router); if (orig_neigh_router) batadv_neigh_node_put(orig_neigh_router); if (orig_neigh_node) batadv_orig_node_put(orig_neigh_node); + if (router_ifinfo) + batadv_neigh_ifinfo_put(router_ifinfo); if (neigh_ifinfo) batadv_neigh_ifinfo_put(neigh_ifinfo); + + return forward; } /** @@ -592,6 +583,7 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing) { int seqno_age; + bool forward; /* first, update the metric with according sanity checks */ seqno_age = batadv_v_ogm_metric_update(bat_priv, ogm2, orig_node, @@ -610,8 +602,14 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv, ntohs(ogm2->tvlv_len)); /* if the metric update went through, update routes if needed */ - batadv_v_ogm_route_update(bat_priv, ethhdr, ogm2, orig_node, - neigh_node, if_incoming, if_outgoing); + forward = batadv_v_ogm_route_update(bat_priv, ethhdr, ogm2, orig_node, + neigh_node, if_incoming, + if_outgoing); + + /* if the routes have been processed correctly, check and forward */ + if (forward) + batadv_v_ogm_forward(bat_priv, ogm2, orig_node, neigh_node, + if_incoming, if_outgoing); } /** @@ -713,9 +711,14 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, if (hard_iface->soft_iface != bat_priv->soft_iface) continue; + if (!kref_get_unless_zero(&hard_iface->refcount)) + continue; + batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet, orig_node, neigh_node, if_incoming, 
hard_iface); + + batadv_hardif_put(hard_iface); } rcu_read_unlock(); out: diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index b56bb000a0ab..a0c7913837a5 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -38,11 +38,11 @@ static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n) * the last sequence number * @set_mark: whether this packet should be marked in seq_bits * - * Return: 1 if the window was moved (either new or very old), - * 0 if the window was not moved/shifted. + * Return: true if the window was moved (either new or very old), + * false if the window was not moved/shifted. */ -int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, - int set_mark) +bool batadv_bit_get_packet(void *priv, unsigned long *seq_bits, + s32 seq_num_diff, int set_mark) { struct batadv_priv *bat_priv = priv; @@ -52,7 +52,7 @@ int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, if (seq_num_diff <= 0 && seq_num_diff > -BATADV_TQ_LOCAL_WINDOW_SIZE) { if (set_mark) batadv_set_bit(seq_bits, -seq_num_diff); - return 0; + return false; } /* sequence number is slightly newer, so we shift the window and @@ -63,7 +63,7 @@ int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, if (set_mark) batadv_set_bit(seq_bits, 0); - return 1; + return true; } /* sequence number is much newer, probably missed a lot of packets */ @@ -75,7 +75,7 @@ int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, bitmap_zero(seq_bits, BATADV_TQ_LOCAL_WINDOW_SIZE); if (set_mark) batadv_set_bit(seq_bits, 0); - return 1; + return true; } /* received a much older packet. The other host either restarted @@ -94,5 +94,5 @@ int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, if (set_mark) batadv_set_bit(seq_bits, 0); - return 1; + return true; } diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index 3e41bb80eb81..0e6e9d09078c 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -22,6 +22,7 @@ #include <linux/bitops.h> #include <linux/compiler.h> +#include <linux/stddef.h> #include <linux/types.h> /** @@ -31,17 +32,17 @@ * @last_seqno: latest sequence number in seq_bits * @curr_seqno: sequence number to test for * - * Return: 1 if the corresponding bit in the given seq_bits indicates true - * and curr_seqno is within range of last_seqno. Otherwise returns 0. + * Return: true if the corresponding bit in the given seq_bits indicates true + * and curr_seqno is within range of last_seqno. Otherwise returns false. 
*/ -static inline int batadv_test_bit(const unsigned long *seq_bits, - u32 last_seqno, u32 curr_seqno) +static inline bool batadv_test_bit(const unsigned long *seq_bits, + u32 last_seqno, u32 curr_seqno) { s32 diff; diff = last_seqno - curr_seqno; if (diff < 0 || diff >= BATADV_TQ_LOCAL_WINDOW_SIZE) - return 0; + return false; return test_bit(diff, seq_bits) != 0; } @@ -55,7 +56,7 @@ static inline void batadv_set_bit(unsigned long *seq_bits, s32 n) set_bit(n, seq_bits); /* turn the position on */ } -int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, - int set_mark); +bool batadv_bit_get_packet(void *priv, unsigned long *seq_bits, + s32 seq_num_diff, int set_mark); #endif /* _NET_BATMAN_ADV_BITARRAY_H_ */ diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 0a6c8b824a00..748a9ead7ce5 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -50,6 +50,7 @@ #include "hash.h" #include "originator.h" #include "packet.h" +#include "sysfs.h" #include "translation-table.h" static const u8 batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05}; @@ -100,10 +101,10 @@ static inline u32 batadv_choose_backbone_gw(const void *data, u32 size) * @node: list node of the first entry to compare * @data2: pointer to the second backbone gateway * - * Return: 1 if the backbones have the same data, 0 otherwise + * Return: true if the backbones have the same data, false otherwise */ -static int batadv_compare_backbone_gw(const struct hlist_node *node, - const void *data2) +static bool batadv_compare_backbone_gw(const struct hlist_node *node, + const void *data2) { const void *data1 = container_of(node, struct batadv_bla_backbone_gw, hash_entry); @@ -111,23 +112,23 @@ static int batadv_compare_backbone_gw(const struct hlist_node *node, const struct batadv_bla_backbone_gw *gw2 = data2; if (!batadv_compare_eth(gw1->orig, gw2->orig)) - return 0; + return false; if (gw1->vid != gw2->vid) - return 0; + return false; - return 1; + return true; } /** - * batadv_compare_backbone_gw - compare address and vid of two claims + * batadv_compare_claim - compare address and vid of two claims * @node: list node of the first entry to compare * @data2: pointer to the second claims * - * Return: 1 if the claim have the same data, 0 otherwise + * Return: true if the claim have the same data, 0 otherwise */ -static int batadv_compare_claim(const struct hlist_node *node, - const void *data2) +static bool batadv_compare_claim(const struct hlist_node *node, + const void *data2) { const void *data1 = container_of(node, struct batadv_bla_claim, hash_entry); @@ -135,12 +136,12 @@ static int batadv_compare_claim(const struct hlist_node *node, const struct batadv_bla_claim *cl2 = data2; if (!batadv_compare_eth(cl1->addr, cl2->addr)) - return 0; + return false; if (cl1->vid != cl2->vid) - return 0; + return false; - return 1; + return true; } /** @@ -200,9 +201,9 @@ static void batadv_claim_put(struct batadv_bla_claim *claim) * * Return: claim if found or NULL otherwise. 
*/ -static struct batadv_bla_claim -*batadv_claim_hash_find(struct batadv_priv *bat_priv, - struct batadv_bla_claim *data) +static struct batadv_bla_claim * +batadv_claim_hash_find(struct batadv_priv *bat_priv, + struct batadv_bla_claim *data) { struct batadv_hashtable *hash = bat_priv->bla.claim_hash; struct hlist_head *head; @@ -407,6 +408,14 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, ethhdr->h_source, ethhdr->h_dest, BATADV_PRINT_VID(vid)); break; + case BATADV_CLAIM_TYPE_LOOPDETECT: + ether_addr_copy(ethhdr->h_source, mac); + batadv_dbg(BATADV_DBG_BLA, bat_priv, + "bla_send_claim(): LOOPDETECT of %pM to %pM on vid %d\n", + ethhdr->h_source, ethhdr->h_dest, + BATADV_PRINT_VID(vid)); + + break; } if (vid & BATADV_VLAN_HAS_TAG) @@ -427,6 +436,36 @@ out: } /** + * batadv_bla_loopdetect_report - worker for reporting the loop + * @work: work queue item + * + * Throws an uevent, as the loopdetect check function can't do that itself + * since the kernel may sleep while throwing uevents. + */ +static void batadv_bla_loopdetect_report(struct work_struct *work) +{ + struct batadv_bla_backbone_gw *backbone_gw; + struct batadv_priv *bat_priv; + char vid_str[6] = { '\0' }; + + backbone_gw = container_of(work, struct batadv_bla_backbone_gw, + report_work); + bat_priv = backbone_gw->bat_priv; + + batadv_info(bat_priv->soft_iface, + "Possible loop on VLAN %d detected which can't be handled by BLA - please check your network setup!\n", + BATADV_PRINT_VID(backbone_gw->vid)); + snprintf(vid_str, sizeof(vid_str), "%d", + BATADV_PRINT_VID(backbone_gw->vid)); + vid_str[sizeof(vid_str) - 1] = 0; + + batadv_throw_uevent(bat_priv, BATADV_UEV_BLA, BATADV_UEV_LOOPDETECT, + vid_str); + + batadv_backbone_gw_put(backbone_gw); +} + +/** * batadv_bla_get_backbone_gw - finds or creates a backbone gateway * @bat_priv: the bat priv with all the soft interface information * @orig: the mac address of the originator @@ -464,6 +503,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, atomic_set(&entry->request_sent, 0); atomic_set(&entry->wait_periods, 0); ether_addr_copy(entry->orig, orig); + INIT_WORK(&entry->report_work, batadv_bla_loopdetect_report); /* one for the hash, one for returning */ kref_init(&entry->refcount); @@ -735,22 +775,22 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, * @backbone_addr: originator address of the sender (Ethernet source MAC) * @vid: the VLAN ID of the frame * - * Return: 1 if handled + * Return: true if handled */ -static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, - u8 *backbone_addr, unsigned short vid) +static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, + u8 *backbone_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; u16 backbone_crc, crc; if (memcmp(an_addr, batadv_announce_mac, 4) != 0) - return 0; + return false; backbone_gw = batadv_bla_get_backbone_gw(bat_priv, backbone_addr, vid, false); if (unlikely(!backbone_gw)) - return 1; + return true; /* handle as ANNOUNCE frame */ backbone_gw->lasttime = jiffies; @@ -783,7 +823,7 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, } batadv_backbone_gw_put(backbone_gw); - return 1; + return true; } /** @@ -794,29 +834,29 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, * @ethhdr: ethernet header of a packet * @vid: the VLAN ID of the frame * - * Return: 1 if handled + * Return: true if handled */ -static int batadv_handle_request(struct 
batadv_priv *bat_priv, - struct batadv_hard_iface *primary_if, - u8 *backbone_addr, struct ethhdr *ethhdr, - unsigned short vid) +static bool batadv_handle_request(struct batadv_priv *bat_priv, + struct batadv_hard_iface *primary_if, + u8 *backbone_addr, struct ethhdr *ethhdr, + unsigned short vid) { /* check for REQUEST frame */ if (!batadv_compare_eth(backbone_addr, ethhdr->h_dest)) - return 0; + return false; /* sanity check, this should not happen on a normal switch, * we ignore it in this case. */ if (!batadv_compare_eth(ethhdr->h_dest, primary_if->net_dev->dev_addr)) - return 1; + return true; batadv_dbg(BATADV_DBG_BLA, bat_priv, "handle_request(): REQUEST vid %d (sent by %pM)...\n", BATADV_PRINT_VID(vid), ethhdr->h_source); batadv_bla_answer_request(bat_priv, primary_if, vid); - return 1; + return true; } /** @@ -827,12 +867,12 @@ static int batadv_handle_request(struct batadv_priv *bat_priv, * @claim_addr: Client to be unclaimed (ARP sender HW MAC) * @vid: the VLAN ID of the frame * - * Return: 1 if handled + * Return: true if handled */ -static int batadv_handle_unclaim(struct batadv_priv *bat_priv, - struct batadv_hard_iface *primary_if, - u8 *backbone_addr, u8 *claim_addr, - unsigned short vid) +static bool batadv_handle_unclaim(struct batadv_priv *bat_priv, + struct batadv_hard_iface *primary_if, + u8 *backbone_addr, u8 *claim_addr, + unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; @@ -845,7 +885,7 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, backbone_gw = batadv_backbone_hash_find(bat_priv, backbone_addr, vid); if (!backbone_gw) - return 1; + return true; /* this must be an UNCLAIM frame */ batadv_dbg(BATADV_DBG_BLA, bat_priv, @@ -854,7 +894,7 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, batadv_bla_del_claim(bat_priv, claim_addr, vid); batadv_backbone_gw_put(backbone_gw); - return 1; + return true; } /** @@ -865,12 +905,12 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, * @claim_addr: client mac address to be claimed (ARP sender HW MAC) * @vid: the VLAN ID of the frame * - * Return: 1 if handled + * Return: true if handled */ -static int batadv_handle_claim(struct batadv_priv *bat_priv, - struct batadv_hard_iface *primary_if, - u8 *backbone_addr, u8 *claim_addr, - unsigned short vid) +static bool batadv_handle_claim(struct batadv_priv *bat_priv, + struct batadv_hard_iface *primary_if, + u8 *backbone_addr, u8 *claim_addr, + unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; @@ -880,7 +920,7 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv, false); if (unlikely(!backbone_gw)) - return 1; + return true; /* this must be a CLAIM frame */ batadv_bla_add_claim(bat_priv, claim_addr, vid, backbone_gw); @@ -891,7 +931,7 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv, /* TODO: we could call something like tt_local_del() here. */ batadv_backbone_gw_put(backbone_gw); - return 1; + return true; } /** @@ -975,12 +1015,12 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv, * @primary_if: the primary hard interface of this batman soft interface * @skb: the frame to be checked * - * Return: 1 if it was a claim frame, otherwise return 0 to + * Return: true if it was a claim frame, otherwise return false to * tell the callee that it can use the frame on its own. 
*/ -static int batadv_bla_process_claim(struct batadv_priv *bat_priv, - struct batadv_hard_iface *primary_if, - struct sk_buff *skb) +static bool batadv_bla_process_claim(struct batadv_priv *bat_priv, + struct batadv_hard_iface *primary_if, + struct sk_buff *skb) { struct batadv_bla_claim_dst *bla_dst, *bla_dst_own; u8 *hw_src, *hw_dst; @@ -1011,7 +1051,7 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, vhdr = skb_header_pointer(skb, headlen, VLAN_HLEN, &vhdr_buf); if (!vhdr) - return 0; + return false; proto = vhdr->h_vlan_encapsulated_proto; headlen += VLAN_HLEN; @@ -1020,12 +1060,12 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, } if (proto != htons(ETH_P_ARP)) - return 0; /* not a claim frame */ + return false; /* not a claim frame */ /* this must be a ARP frame. check if it is a claim. */ if (unlikely(!pskb_may_pull(skb, headlen + arp_hdr_len(skb->dev)))) - return 0; + return false; /* pskb_may_pull() may have modified the pointers, get ethhdr again */ ethhdr = eth_hdr(skb); @@ -1035,13 +1075,13 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, * IP information */ if (arphdr->ar_hrd != htons(ARPHRD_ETHER)) - return 0; + return false; if (arphdr->ar_pro != htons(ETH_P_IP)) - return 0; + return false; if (arphdr->ar_hln != ETH_ALEN) - return 0; + return false; if (arphdr->ar_pln != 4) - return 0; + return false; hw_src = (u8 *)arphdr + sizeof(struct arphdr); hw_dst = hw_src + ETH_ALEN + 4; @@ -1051,14 +1091,18 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, /* check if it is a claim frame in general */ if (memcmp(bla_dst->magic, bla_dst_own->magic, sizeof(bla_dst->magic)) != 0) - return 0; + return false; /* check if there is a claim frame encapsulated deeper in (QinQ) and * drop that, as this is not supported by BLA but should also not be * sent via the mesh. */ if (vlan_depth > 1) - return 1; + return true; + + /* Let the loopdetect frames on the mesh in any case. */ + if (bla_dst->type == BATADV_CLAIM_TYPE_LOOPDETECT) + return 0; /* check if it is a claim frame. */ ret = batadv_check_claim_group(bat_priv, primary_if, hw_src, hw_dst, @@ -1070,7 +1114,7 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, hw_dst); if (ret < 2) - return ret; + return !!ret; /* become a backbone gw ourselves on this vlan if not happened yet */ batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid); @@ -1080,30 +1124,30 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, case BATADV_CLAIM_TYPE_CLAIM: if (batadv_handle_claim(bat_priv, primary_if, hw_src, ethhdr->h_source, vid)) - return 1; + return true; break; case BATADV_CLAIM_TYPE_UNCLAIM: if (batadv_handle_unclaim(bat_priv, primary_if, ethhdr->h_source, hw_src, vid)) - return 1; + return true; break; case BATADV_CLAIM_TYPE_ANNOUNCE: if (batadv_handle_announce(bat_priv, hw_src, ethhdr->h_source, vid)) - return 1; + return true; break; case BATADV_CLAIM_TYPE_REQUEST: if (batadv_handle_request(bat_priv, primary_if, hw_src, ethhdr, vid)) - return 1; + return true; break; } batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_process_claim(): ERROR - this looks like a claim frame, but is useless. 
eth src %pM on vid %d ...(hw_src %pM, hw_dst %pM)\n", ethhdr->h_source, BATADV_PRINT_VID(vid), hw_src, hw_dst); - return 1; + return true; } /** @@ -1265,6 +1309,26 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, } /** + * batadv_bla_send_loopdetect - send a loopdetect frame + * @bat_priv: the bat priv with all the soft interface information + * @backbone_gw: the backbone gateway for which a loop should be detected + * + * To detect loops that the bridge loop avoidance can't handle, send a loop + * detection packet on the backbone. Unlike other BLA frames, this frame will + * be allowed on the mesh by other nodes. If it is received on the mesh, this + * indicates that there is a loop. + */ +static void +batadv_bla_send_loopdetect(struct batadv_priv *bat_priv, + struct batadv_bla_backbone_gw *backbone_gw) +{ + batadv_dbg(BATADV_DBG_BLA, bat_priv, "Send loopdetect frame for vid %d\n", + backbone_gw->vid); + batadv_bla_send_claim(bat_priv, bat_priv->bla.loopdetect_addr, + backbone_gw->vid, BATADV_CLAIM_TYPE_LOOPDETECT); +} + +/** * batadv_bla_status_update - purge bla interfaces if necessary * @net_dev: the soft interface net device */ @@ -1301,9 +1365,10 @@ static void batadv_bla_periodic_work(struct work_struct *work) struct batadv_bla_backbone_gw *backbone_gw; struct batadv_hashtable *hash; struct batadv_hard_iface *primary_if; + bool send_loopdetect = false; int i; - delayed_work = container_of(work, struct delayed_work, work); + delayed_work = to_delayed_work(work); priv_bla = container_of(delayed_work, struct batadv_priv_bla, work); bat_priv = container_of(priv_bla, struct batadv_priv, bla); primary_if = batadv_primary_if_get_selected(bat_priv); @@ -1316,6 +1381,22 @@ static void batadv_bla_periodic_work(struct work_struct *work) if (!atomic_read(&bat_priv->bridge_loop_avoidance)) goto out; + if (atomic_dec_and_test(&bat_priv->bla.loopdetect_next)) { + /* set a new random mac address for the next bridge loop + * detection frames. Set the locally administered bit to avoid + * collisions with users mac addresses. + */ + random_ether_addr(bat_priv->bla.loopdetect_addr); + bat_priv->bla.loopdetect_addr[0] = 0xba; + bat_priv->bla.loopdetect_addr[1] = 0xbe; + bat_priv->bla.loopdetect_lasttime = jiffies; + atomic_set(&bat_priv->bla.loopdetect_next, + BATADV_BLA_LOOPDETECT_PERIODS); + + /* mark for sending loop detect on all VLANs */ + send_loopdetect = true; + } + hash = bat_priv->bla.backbone_hash; if (!hash) goto out; @@ -1332,6 +1413,9 @@ static void batadv_bla_periodic_work(struct work_struct *work) backbone_gw->lasttime = jiffies; batadv_bla_send_announce(bat_priv, backbone_gw); + if (send_loopdetect) + batadv_bla_send_loopdetect(bat_priv, + backbone_gw); /* request_sent is only set after creation to avoid * problems when we are not yet known as backbone gw @@ -1405,6 +1489,9 @@ int batadv_bla_init(struct batadv_priv *bat_priv) bat_priv->bla.bcast_duplist[i].entrytime = entrytime; bat_priv->bla.bcast_duplist_curr = 0; + atomic_set(&bat_priv->bla.loopdetect_next, + BATADV_BLA_LOOPDETECT_PERIODS); + if (bat_priv->bla.claim_hash) return 0; @@ -1442,15 +1529,16 @@ int batadv_bla_init(struct batadv_priv *bat_priv) * sent by another host, drop it. We allow equal packets from * the same host however as this might be intended. * - * Return: 1 if a packet is in the duplicate list, 0 otherwise. + * Return: true if a packet is in the duplicate list, false otherwise. 
*/ -int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, - struct sk_buff *skb) +bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, + struct sk_buff *skb) { - int i, curr, ret = 0; + int i, curr; __be32 crc; struct batadv_bcast_packet *bcast_packet; struct batadv_bcast_duplist_entry *entry; + bool ret = false; bcast_packet = (struct batadv_bcast_packet *)skb->data; @@ -1478,9 +1566,9 @@ int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, continue; /* this entry seems to match: same crc, not too old, - * and from another gw. therefore return 1 to forbid it. + * and from another gw. therefore return true to forbid it. */ - ret = 1; + ret = true; goto out; } /* not found, add a new entry (overwrite the oldest entry) @@ -1546,21 +1634,21 @@ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, * @orig_node: the orig_node of the frame * @hdr_size: maximum length of the frame * - * Return: 1 if the orig_node is also a gateway on the soft interface, otherwise - * it returns 0. + * Return: true if the orig_node is also a gateway on the soft interface, + * otherwise it returns false. */ -int batadv_bla_is_backbone_gw(struct sk_buff *skb, - struct batadv_orig_node *orig_node, int hdr_size) +bool batadv_bla_is_backbone_gw(struct sk_buff *skb, + struct batadv_orig_node *orig_node, int hdr_size) { struct batadv_bla_backbone_gw *backbone_gw; unsigned short vid; if (!atomic_read(&orig_node->bat_priv->bridge_loop_avoidance)) - return 0; + return false; /* first, find out the vid. */ if (!pskb_may_pull(skb, hdr_size + ETH_HLEN)) - return 0; + return false; vid = batadv_get_vid(skb, hdr_size); @@ -1568,14 +1656,14 @@ int batadv_bla_is_backbone_gw(struct sk_buff *skb, backbone_gw = batadv_backbone_hash_find(orig_node->bat_priv, orig_node->orig, vid); if (!backbone_gw) - return 0; + return false; batadv_backbone_gw_put(backbone_gw); - return 1; + return true; } /** - * batadv_bla_init - free all bla structures + * batadv_bla_free - free all bla structures * @bat_priv: the bat priv with all the soft interface information * * for softinterface free or module unload @@ -1602,6 +1690,55 @@ void batadv_bla_free(struct batadv_priv *bat_priv) } /** + * batadv_bla_loopdetect_check - check and handle a detected loop + * @bat_priv: the bat priv with all the soft interface information + * @skb: the packet to check + * @primary_if: interface where the request came on + * @vid: the VLAN ID of the frame + * + * Checks if this packet is a loop detect frame which has been sent by us, + * throw an uevent and log the event if that is the case. + * + * Return: true if it is a loop detect frame which is to be dropped, false + * otherwise. + */ +static bool +batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, + struct batadv_hard_iface *primary_if, + unsigned short vid) +{ + struct batadv_bla_backbone_gw *backbone_gw; + struct ethhdr *ethhdr; + + ethhdr = eth_hdr(skb); + + /* Only check for the MAC address and skip more checks here for + * performance reasons - this function is on the hotpath, after all. + */ + if (!batadv_compare_eth(ethhdr->h_source, + bat_priv->bla.loopdetect_addr)) + return false; + + /* If the packet came too late, don't forward it on the mesh + * but don't consider that as loop. It might be a coincidence. 
+ */ + if (batadv_has_timed_out(bat_priv->bla.loopdetect_lasttime, + BATADV_BLA_LOOPDETECT_TIMEOUT)) + return true; + + backbone_gw = batadv_bla_get_backbone_gw(bat_priv, + primary_if->net_dev->dev_addr, + vid, true); + if (unlikely(!backbone_gw)) + return true; + + queue_work(batadv_event_workqueue, &backbone_gw->report_work); + /* backbone_gw is unreferenced in the report work function function */ + + return true; +} + +/** * batadv_bla_rx - check packets coming from the mesh. * @bat_priv: the bat priv with all the soft interface information * @skb: the frame to be checked @@ -1614,16 +1751,16 @@ void batadv_bla_free(struct batadv_priv *bat_priv) * * in these cases, the skb is further handled by this function * - * Return: 1 if handled, otherwise it returns 0 and the caller shall further - * process the skb. + * Return: true if handled, otherwise it returns false and the caller shall + * further process the skb. */ -int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, - unsigned short vid, bool is_bcast) +bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid, bool is_bcast) { struct ethhdr *ethhdr; struct batadv_bla_claim search_claim, *claim = NULL; struct batadv_hard_iface *primary_if; - int ret; + bool ret; ethhdr = eth_hdr(skb); @@ -1634,6 +1771,9 @@ int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, if (!atomic_read(&bat_priv->bridge_loop_avoidance)) goto allow; + if (batadv_bla_loopdetect_check(bat_priv, skb, primary_if, vid)) + goto handled; + if (unlikely(atomic_read(&bat_priv->bla.num_requests))) /* don't allow broadcasts while requests are in flight */ if (is_multicast_ether_addr(ethhdr->h_dest) && is_bcast) @@ -1682,12 +1822,12 @@ int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, } allow: batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid); - ret = 0; + ret = false; goto out; handled: kfree_skb(skb); - ret = 1; + ret = true; out: if (primary_if) @@ -1711,16 +1851,16 @@ out: * * This call might reallocate skb data. * - * Return: 1 if handled, otherwise it returns 0 and the caller shall further - * process the skb. + * Return: true if handled, otherwise it returns false and the caller shall + * further process the skb. 
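Because batadv_bla_tx() may reallocate skb data and now reports a frame it has taken over as true, a transmit-path caller is expected to stop processing on true and to refetch any cached header pointers afterwards. A minimal sketch, assuming bat_priv, skb, vid and ethhdr are in scope and that a local dropped label frees the skb (none of these are introduced here):

	if (batadv_bla_tx(bat_priv, skb, vid))
		goto dropped;	/* BLA decided the frame must not enter the mesh */

	/* skb->data might have been reallocated by batadv_bla_tx() */
	ethhdr = eth_hdr(skb);
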
*/ -int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, - unsigned short vid) +bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid) { struct ethhdr *ethhdr; struct batadv_bla_claim search_claim, *claim = NULL; struct batadv_hard_iface *primary_if; - int ret = 0; + bool ret = false; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) @@ -1774,10 +1914,10 @@ int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, } allow: batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid); - ret = 0; + ret = false; goto out; handled: - ret = 1; + ret = true; out: if (primary_if) batadv_hardif_put(primary_if); @@ -1815,8 +1955,8 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) "Claims announced for the mesh %s (orig %pM, group id %#.4x)\n", net_dev->name, primary_addr, ntohs(bat_priv->bla.claim_dest.group)); - seq_printf(seq, " %-17s %-5s %-17s [o] (%-6s)\n", - "Client", "VID", "Originator", "CRC"); + seq_puts(seq, + " Client VID Originator [o] (CRC )\n"); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -1873,8 +2013,7 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) "Backbones announced for the mesh %s (orig %pM, group id %#.4x)\n", net_dev->name, primary_addr, ntohs(bat_priv->bla.claim_dest.group)); - seq_printf(seq, " %-17s %-5s %-9s (%-6s)\n", - "Originator", "VID", "last seen", "CRC"); + seq_puts(seq, " Originator VID last seen (CRC )\n"); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 579f0fa6fe6a..0f01daeb359e 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -27,19 +27,20 @@ struct seq_file; struct sk_buff; #ifdef CONFIG_BATMAN_ADV_BLA -int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, - unsigned short vid, bool is_bcast); -int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, - unsigned short vid); -int batadv_bla_is_backbone_gw(struct sk_buff *skb, - struct batadv_orig_node *orig_node, int hdr_size); +bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid, bool is_bcast); +bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid); +bool batadv_bla_is_backbone_gw(struct sk_buff *skb, + struct batadv_orig_node *orig_node, + int hdr_size); int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset); int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset); bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid); -int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, - struct sk_buff *skb); +bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, + struct sk_buff *skb); void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, struct batadv_hard_iface *oldif); @@ -50,24 +51,24 @@ void batadv_bla_free(struct batadv_priv *bat_priv); #define BATADV_BLA_CRC_INIT 0 #else /* ifdef CONFIG_BATMAN_ADV_BLA */ -static inline int batadv_bla_rx(struct batadv_priv *bat_priv, - struct sk_buff *skb, unsigned short vid, - bool is_bcast) +static inline bool batadv_bla_rx(struct batadv_priv *bat_priv, + struct sk_buff *skb, unsigned short vid, + bool is_bcast) { - return 0; + return false; } -static inline int batadv_bla_tx(struct 
batadv_priv *bat_priv, - struct sk_buff *skb, unsigned short vid) +static inline bool batadv_bla_tx(struct batadv_priv *bat_priv, + struct sk_buff *skb, unsigned short vid) { - return 0; + return false; } -static inline int batadv_bla_is_backbone_gw(struct sk_buff *skb, - struct batadv_orig_node *orig_node, - int hdr_size) +static inline bool batadv_bla_is_backbone_gw(struct sk_buff *skb, + struct batadv_orig_node *orig_node, + int hdr_size) { - return 0; + return false; } static inline int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, @@ -88,11 +89,11 @@ static inline bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, return false; } -static inline int +static inline bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb) { - return 0; + return false; } static inline void diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 48253cf8341b..952900466d88 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -134,7 +134,7 @@ static int batadv_log_release(struct inode *inode, struct file *file) return 0; } -static int batadv_log_empty(struct batadv_priv_debug_log *debug_log) +static bool batadv_log_empty(struct batadv_priv_debug_log *debug_log) { return !(debug_log->log_start - debug_log->log_end); } @@ -365,14 +365,17 @@ static int batadv_nc_nodes_open(struct inode *inode, struct file *file) #define BATADV_DEBUGINFO(_name, _mode, _open) \ struct batadv_debuginfo batadv_debuginfo_##_name = { \ - .attr = { .name = __stringify(_name), \ - .mode = _mode, }, \ - .fops = { .owner = THIS_MODULE, \ - .open = _open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ - } \ + .attr = { \ + .name = __stringify(_name), \ + .mode = _mode, \ + }, \ + .fops = { \ + .owner = THIS_MODULE, \ + .open = _open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ + }, \ } /* the following attributes are general and therefore they will be directly diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index e96d7c745b4a..278800a99c69 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -152,7 +152,7 @@ static void batadv_dat_purge(struct work_struct *work) struct batadv_priv_dat *priv_dat; struct batadv_priv *bat_priv; - delayed_work = container_of(work, struct delayed_work, work); + delayed_work = to_delayed_work(work); priv_dat = container_of(delayed_work, struct batadv_priv_dat, work); bat_priv = container_of(priv_dat, struct batadv_priv, dat); @@ -165,14 +165,14 @@ static void batadv_dat_purge(struct work_struct *work) * @node: node in the local table * @data2: second object to compare the node to * - * Return: 1 if the two entries are the same, 0 otherwise. + * Return: true if the two entries are the same, false otherwise. */ -static int batadv_compare_dat(const struct hlist_node *node, const void *data2) +static bool batadv_compare_dat(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_dat_entry, hash_entry); - return memcmp(data1, data2, sizeof(__be32)) == 0 ? 
1 : 0; + return memcmp(data1, data2, sizeof(__be32)) == 0; } /** @@ -568,6 +568,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, * be sent to * @bat_priv: the bat priv with all the soft interface information * @ip_dst: ipv4 to look up in the DHT + * @vid: VLAN identifier * * An originator O is selected if and only if its DHT_ID value is one of three * closest values (from the LEFT, with wrap around if needed) then the hash @@ -576,7 +577,8 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, * Return: the candidate array of size BATADV_DAT_CANDIDATE_NUM. */ static struct batadv_dat_candidate * -batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) +batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst, + unsigned short vid) { int select; batadv_dat_addr_t last_max = BATADV_DAT_ADDR_MAX, ip_key; @@ -592,7 +594,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) return NULL; dat.ip = ip_dst; - dat.vid = 0; + dat.vid = vid; ip_key = (batadv_dat_addr_t)batadv_hash_dat(&dat, BATADV_DAT_ADDR_MAX); @@ -612,6 +614,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) * @bat_priv: the bat priv with all the soft interface information * @skb: payload to send * @ip: the DHT key + * @vid: VLAN identifier * @packet_subtype: unicast4addr packet subtype to use * * This function copies the skb with pskb_copy() and is sent as unicast packet @@ -622,7 +625,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst) */ static bool batadv_dat_send_data(struct batadv_priv *bat_priv, struct sk_buff *skb, __be32 ip, - int packet_subtype) + unsigned short vid, int packet_subtype) { int i; bool ret = false; @@ -631,7 +634,7 @@ static bool batadv_dat_send_data(struct batadv_priv *bat_priv, struct sk_buff *tmp_skb; struct batadv_dat_candidate *cand; - cand = batadv_dat_select_candidates(bat_priv, ip); + cand = batadv_dat_select_candidates(bat_priv, ip, vid); if (!cand) goto out; @@ -717,7 +720,7 @@ void batadv_dat_status_update(struct net_device *net_dev) } /** - * batadv_gw_tvlv_ogm_handler_v1 - process incoming dat tvlv container + * batadv_dat_tvlv_ogm_handler_v1 - process incoming dat tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) @@ -814,8 +817,8 @@ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset) goto out; seq_printf(seq, "Distributed ARP Table (%s):\n", net_dev->name); - seq_printf(seq, " %-7s %-9s %4s %11s\n", "IPv4", - "MAC", "VID", "last-seen"); + seq_puts(seq, + " IPv4 MAC VID last-seen\n"); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -1022,7 +1025,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, ret = true; } else { /* Send the request to the DHT */ - ret = batadv_dat_send_data(bat_priv, skb, ip_dst, + ret = batadv_dat_send_data(bat_priv, skb, ip_dst, vid, BATADV_P_DAT_DHT_GET); } out: @@ -1150,8 +1153,8 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv, /* Send the ARP reply to the candidates for both the IP addresses that * the node obtained from the ARP reply */ - batadv_dat_send_data(bat_priv, skb, ip_src, BATADV_P_DAT_DHT_PUT); - batadv_dat_send_data(bat_priv, skb, ip_dst, BATADV_P_DAT_DHT_PUT); + batadv_dat_send_data(bat_priv, skb, ip_src, vid, BATADV_P_DAT_DHT_PUT); + batadv_dat_send_data(bat_priv, skb, ip_dst, 
vid, BATADV_P_DAT_DHT_PUT); } /** diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index e6956d0746a2..65536db1bff7 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -407,8 +407,8 @@ static struct sk_buff *batadv_frag_create(struct sk_buff *skb, unsigned int mtu) { struct sk_buff *skb_fragment; - unsigned header_size = sizeof(*frag_head); - unsigned fragment_size = mtu - header_size; + unsigned int header_size = sizeof(*frag_head); + unsigned int fragment_size = mtu - header_size; skb_fragment = netdev_alloc_skb(NULL, mtu + ETH_HLEN); if (!skb_fragment) @@ -444,15 +444,15 @@ bool batadv_frag_send_packet(struct sk_buff *skb, struct batadv_hard_iface *primary_if = NULL; struct batadv_frag_packet frag_header; struct sk_buff *skb_fragment; - unsigned mtu = neigh_node->if_incoming->net_dev->mtu; - unsigned header_size = sizeof(frag_header); - unsigned max_fragment_size, max_packet_size; + unsigned int mtu = neigh_node->if_incoming->net_dev->mtu; + unsigned int header_size = sizeof(frag_header); + unsigned int max_fragment_size, max_packet_size; bool ret = false; /* To avoid merge and refragmentation at next-hops we never send * fragments larger than BATADV_FRAG_MAX_FRAG_SIZE */ - mtu = min_t(unsigned, mtu, BATADV_FRAG_MAX_FRAG_SIZE); + mtu = min_t(unsigned int, mtu, BATADV_FRAG_MAX_FRAG_SIZE); max_fragment_size = mtu - header_size; max_packet_size = max_fragment_size * BATADV_FRAG_MAX_FRAGMENTS; diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index c59aff5ccac8..5839c569f769 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -135,8 +135,8 @@ static void batadv_gw_select(struct batadv_priv *bat_priv, spin_lock_bh(&bat_priv->gw.list_lock); - if (new_gw_node && !kref_get_unless_zero(&new_gw_node->refcount)) - new_gw_node = NULL; + if (new_gw_node) + kref_get(&new_gw_node->refcount); curr_gw_node = rcu_dereference_protected(bat_priv->gw.curr_gw, 1); rcu_assign_pointer(bat_priv->gw.curr_gw, new_gw_node); @@ -440,15 +440,11 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, if (gateway->bandwidth_down == 0) return; - if (!kref_get_unless_zero(&orig_node->refcount)) - return; - gw_node = kzalloc(sizeof(*gw_node), GFP_ATOMIC); - if (!gw_node) { - batadv_orig_node_put(orig_node); + if (!gw_node) return; - } + kref_get(&orig_node->refcount); INIT_HLIST_NODE(&gw_node->list); gw_node->orig_node = orig_node; gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index b22b2775a0a5..8c2f39962fa5 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -36,7 +36,6 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/workqueue.h> -#include <net/net_namespace.h> #include "bridge_loop_avoidance.h" #include "debugfs.h" @@ -121,6 +120,7 @@ static bool batadv_mutual_parents(const struct net_device *dev1, static bool batadv_is_on_batman_iface(const struct net_device *net_dev) { struct net_device *parent_dev; + struct net *net = dev_net(net_dev); bool ret; /* check if this is a batman-adv mesh interface */ @@ -133,7 +133,7 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) return false; /* recurse over the parent device */ - parent_dev = __dev_get_by_index(&init_net, dev_get_iflink(net_dev)); + parent_dev = __dev_get_by_index(net, dev_get_iflink(net_dev)); /* if we got a NULL parent_dev there is something broken.. 
*/ if (WARN(!parent_dev, "Cannot find parent device")) return false; @@ -146,22 +146,22 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) return ret; } -static int batadv_is_valid_iface(const struct net_device *net_dev) +static bool batadv_is_valid_iface(const struct net_device *net_dev) { if (net_dev->flags & IFF_LOOPBACK) - return 0; + return false; if (net_dev->type != ARPHRD_ETHER) - return 0; + return false; if (net_dev->addr_len != ETH_ALEN) - return 0; + return false; /* no batman over batman */ if (batadv_is_on_batman_iface(net_dev)) - return 0; + return false; - return 1; + return true; } /** @@ -236,8 +236,8 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv, ASSERT_RTNL(); - if (new_hard_iface && !kref_get_unless_zero(&new_hard_iface->refcount)) - new_hard_iface = NULL; + if (new_hard_iface) + kref_get(&new_hard_iface->refcount); curr_hard_iface = rcu_dereference_protected(bat_priv->primary_if, 1); rcu_assign_pointer(bat_priv->primary_if, new_hard_iface); @@ -407,6 +407,9 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface) batadv_update_min_mtu(hard_iface->soft_iface); + if (bat_priv->bat_algo_ops->bat_iface_activate) + bat_priv->bat_algo_ops->bat_iface_activate(hard_iface); + out: if (primary_if) batadv_hardif_put(primary_if); @@ -453,7 +456,7 @@ static int batadv_master_del_slave(struct batadv_hard_iface *slave, } int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, - const char *iface_name) + struct net *net, const char *iface_name) { struct batadv_priv *bat_priv; struct net_device *soft_iface, *master; @@ -464,13 +467,12 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) goto out; - if (!kref_get_unless_zero(&hard_iface->refcount)) - goto out; + kref_get(&hard_iface->refcount); - soft_iface = dev_get_by_name(&init_net, iface_name); + soft_iface = dev_get_by_name(net, iface_name); if (!soft_iface) { - soft_iface = batadv_softif_create(iface_name); + soft_iface = batadv_softif_create(net, iface_name); if (!soft_iface) { ret = -ENOMEM; @@ -519,6 +521,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, goto err_upper; } + kref_get(&hard_iface->refcount); hard_iface->batman_adv_ptype.type = ethertype; hard_iface->batman_adv_ptype.func = batadv_batman_skb_recv; hard_iface->batman_adv_ptype.dev = hard_iface->net_dev; @@ -572,8 +575,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hard_iface *primary_if = NULL; - if (hard_iface->if_status == BATADV_IF_ACTIVE) - batadv_hardif_deactivate_interface(hard_iface); + batadv_hardif_deactivate_interface(hard_iface); if (hard_iface->if_status != BATADV_IF_INACTIVE) goto out; @@ -581,6 +583,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_info(hard_iface->soft_iface, "Removing interface: %s\n", hard_iface->net_dev->name); dev_remove_pack(&hard_iface->batman_adv_ptype); + batadv_hardif_put(hard_iface); bat_priv->num_ifaces--; batadv_orig_hash_del_if(hard_iface, bat_priv->num_ifaces); @@ -650,8 +653,7 @@ batadv_hardif_add_interface(struct net_device *net_dev) ASSERT_RTNL(); - ret = batadv_is_valid_iface(net_dev); - if (ret != 1) + if (!batadv_is_valid_iface(net_dev)) goto out; dev_hold(net_dev); diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index d74f1983f33e..a76724d369bf 100644 
--- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -28,6 +28,7 @@ #include <linux/types.h> struct net_device; +struct net; enum batadv_hard_if_state { BATADV_IF_NOT_IN_USE, @@ -55,7 +56,7 @@ bool batadv_is_wifi_iface(int ifindex); struct batadv_hard_iface* batadv_hardif_get_by_netdev(const struct net_device *net_dev); int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, - const char *iface_name); + struct net *net, const char *iface_name); void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, enum batadv_hard_if_cleanup autodel); void batadv_hardif_remove_interfaces(void); diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 9bb57b87447c..cbbf87075f06 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -32,10 +32,10 @@ struct lock_class_key; /* callback to a compare function. should compare 2 element datas for their * keys * - * Return: 0 if same and not 0 if not same + * Return: true if same and false if not same */ -typedef int (*batadv_hashdata_compare_cb)(const struct hlist_node *, - const void *); +typedef bool (*batadv_hashdata_compare_cb)(const struct hlist_node *, + const void *); /* the hashfunction * diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 14d0013b387e..777aea10cd8f 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -104,25 +104,21 @@ static int batadv_socket_open(struct inode *inode, struct file *file) static int batadv_socket_release(struct inode *inode, struct file *file) { - struct batadv_socket_client *socket_client = file->private_data; - struct batadv_socket_packet *socket_packet; - struct list_head *list_pos, *list_pos_tmp; + struct batadv_socket_client *client = file->private_data; + struct batadv_socket_packet *packet, *tmp; - spin_lock_bh(&socket_client->lock); + spin_lock_bh(&client->lock); /* for all packets in the queue ... */ - list_for_each_safe(list_pos, list_pos_tmp, &socket_client->queue_list) { - socket_packet = list_entry(list_pos, - struct batadv_socket_packet, list); - - list_del(list_pos); - kfree(socket_packet); + list_for_each_entry_safe(packet, tmp, &client->queue_list, list) { + list_del(&packet->list); + kfree(packet); } - batadv_socket_client_hash[socket_client->index] = NULL; - spin_unlock_bh(&socket_client->lock); + batadv_socket_client_hash[client->index] = NULL; + spin_unlock_bh(&client->lock); - kfree(socket_client); + kfree(client); module_put(THIS_MODULE); return 0; @@ -337,7 +333,7 @@ err: } /** - * batadv_socket_receive_packet - schedule an icmp packet to be sent to + * batadv_socket_add_packet - schedule an icmp packet to be sent to * userspace on an icmp socket. * @socket_client: the socket this packet belongs to * @icmph: pointer to the header of the icmp packet diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index d64ddb961979..5f2974bd1227 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -401,11 +401,19 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, hard_iface = container_of(ptype, struct batadv_hard_iface, batman_adv_ptype); + + /* Prevent processing a packet received on an interface which is getting + * shut down otherwise the packet may trigger de-reference errors + * further down in the receive path. 
+ */ + if (!kref_get_unless_zero(&hard_iface->refcount)) + goto err_out; + skb = skb_share_check(skb, GFP_ATOMIC); /* skb was released by skb_share_check() */ if (!skb) - goto err_out; + goto err_put; /* packet should hold at least type and version */ if (unlikely(!pskb_may_pull(skb, 2))) @@ -448,6 +456,8 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, if (ret == NET_RX_DROP) kfree_skb(skb); + batadv_hardif_put(hard_iface); + /* return NET_RX_SUCCESS in any case as we * most probably dropped the packet for * routing-logical reasons. @@ -456,6 +466,8 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, err_free: kfree_skb(skb); +err_put: + batadv_hardif_put(hard_iface); err_out: return NET_RX_DROP; } @@ -663,8 +675,8 @@ static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler) * * Return: tvlv handler if found or NULL otherwise. */ -static struct batadv_tvlv_handler -*batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version) +static struct batadv_tvlv_handler * +batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_handler *tvlv_handler_tmp, *tvlv_handler = NULL; @@ -722,8 +734,8 @@ static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv) * * Return: tvlv container if found or NULL otherwise. */ -static struct batadv_tvlv_container -*batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) +static struct batadv_tvlv_container * +batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL; @@ -736,9 +748,7 @@ static struct batadv_tvlv_container if (tvlv_tmp->tvlv_hdr.version != version) continue; - if (!kref_get_unless_zero(&tvlv_tmp->refcount)) - continue; - + kref_get(&tvlv_tmp->refcount); tvlv = tvlv_tmp; break; } diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index db4533631834..76925266deed 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -24,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2016.1" +#define BATADV_SOURCE_VERSION "2016.2" #endif /* B.A.T.M.A.N. 
parameters */ @@ -120,6 +120,8 @@ #define BATADV_BLA_BACKBONE_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 6) #define BATADV_BLA_CLAIM_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 10) #define BATADV_BLA_WAIT_PERIODS 3 +#define BATADV_BLA_LOOPDETECT_PERIODS 6 +#define BATADV_BLA_LOOPDETECT_TIMEOUT 3000 /* 3 seconds */ #define BATADV_DUPLIST_SIZE 16 #define BATADV_DUPLIST_TIMEOUT 500 /* 500 ms */ @@ -142,10 +144,12 @@ enum batadv_uev_action { BATADV_UEV_ADD = 0, BATADV_UEV_DEL, BATADV_UEV_CHANGE, + BATADV_UEV_LOOPDETECT, }; enum batadv_uev_type { BATADV_UEV_GW = 0, + BATADV_UEV_BLA, }; #define BATADV_GW_THRESHOLD 50 @@ -288,7 +292,7 @@ static inline void _batadv_dbg(int type __always_unused, * * note: can't use ether_addr_equal() as it requires aligned memory * - * Return: 1 if they are the same ethernet addr + * Return: true if they are the same ethernet addr */ static inline bool batadv_compare_eth(const void *data1, const void *data2) { @@ -296,7 +300,8 @@ static inline bool batadv_compare_eth(const void *data1, const void *data2) } /** - * has_timed_out - compares current time (jiffies) and timestamp + timeout + * batadv_has_timed_out - compares current time (jiffies) and timestamp + + * timeout * @timestamp: base value to compare with (in jiffies) * @timeout: added to base value before comparing (in milliseconds) * diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 8caa2c72efa3..c32f24fafe67 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -394,7 +394,8 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, } /** - * batadv_mcast_want_all_ip_count - count nodes with unspecific mcast interest + * batadv_mcast_forw_want_all_ip_count - count nodes with unspecific mcast + * interest * @bat_priv: the bat priv with all the soft interface information * @ethhdr: ethernet header of a packet * @@ -433,7 +434,7 @@ batadv_mcast_forw_tt_node_get(struct batadv_priv *bat_priv, } /** - * batadv_mcast_want_forw_ipv4_node_get - get a node with an ipv4 flag + * batadv_mcast_forw_ipv4_node_get - get a node with an ipv4 flag * @bat_priv: the bat priv with all the soft interface information * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and @@ -460,7 +461,7 @@ batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv) } /** - * batadv_mcast_want_forw_ipv6_node_get - get a node with an ipv6 flag + * batadv_mcast_forw_ipv6_node_get - get a node with an ipv6 flag * @bat_priv: the bat priv with all the soft interface information * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set @@ -487,7 +488,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) } /** - * batadv_mcast_want_forw_ip_node_get - get a node with an ipv4/ipv6 flag + * batadv_mcast_forw_ip_node_get - get a node with an ipv4/ipv6 flag * @bat_priv: the bat priv with all the soft interface information * @ethhdr: an ethernet header to determine the protocol family from * @@ -511,7 +512,7 @@ batadv_mcast_forw_ip_node_get(struct batadv_priv *bat_priv, } /** - * batadv_mcast_want_forw_unsnoop_node_get - get a node with an unsnoopable flag + * batadv_mcast_forw_unsnoop_node_get - get a node with an unsnoopable flag * @bat_priv: the bat priv with all the soft interface information * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index b41719b6487a..678f06865312 100644 --- a/net/batman-adv/network-coding.c +++ 
b/net/batman-adv/network-coding.c @@ -510,10 +510,10 @@ static u32 batadv_nc_hash_choose(const void *data, u32 size) * @node: node in the local table * @data2: second object to compare the node to * - * Return: 1 if the two entry are the same, 0 otherwise + * Return: true if the two entry are the same, false otherwise */ -static int batadv_nc_hash_compare(const struct hlist_node *node, - const void *data2) +static bool batadv_nc_hash_compare(const struct hlist_node *node, + const void *data2) { const struct batadv_nc_path *nc_path1, *nc_path2; @@ -521,15 +521,13 @@ static int batadv_nc_hash_compare(const struct hlist_node *node, nc_path2 = data2; /* Return 1 if the two keys are identical */ - if (memcmp(nc_path1->prev_hop, nc_path2->prev_hop, - sizeof(nc_path1->prev_hop)) != 0) - return 0; + if (!batadv_compare_eth(nc_path1->prev_hop, nc_path2->prev_hop)) + return false; - if (memcmp(nc_path1->next_hop, nc_path2->next_hop, - sizeof(nc_path1->next_hop)) != 0) - return 0; + if (!batadv_compare_eth(nc_path1->next_hop, nc_path2->next_hop)) + return false; - return 1; + return true; } /** @@ -714,7 +712,7 @@ static void batadv_nc_worker(struct work_struct *work) struct batadv_priv *bat_priv; unsigned long timeout; - delayed_work = container_of(work, struct delayed_work, work); + delayed_work = to_delayed_work(work); priv_nc = container_of(delayed_work, struct batadv_priv_nc, work); bat_priv = container_of(priv_nc, struct batadv_priv, nc); @@ -793,10 +791,10 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv, * * Return: the nc_node if found, NULL otherwise. */ -static struct batadv_nc_node -*batadv_nc_find_nc_node(struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - bool in_coding) +static struct batadv_nc_node * +batadv_nc_find_nc_node(struct batadv_orig_node *orig_node, + struct batadv_orig_node *orig_neigh_node, + bool in_coding) { struct batadv_nc_node *nc_node, *nc_node_out = NULL; struct list_head *list; @@ -835,11 +833,11 @@ static struct batadv_nc_node * * Return: the nc_node if found or created, NULL in case of an error. 
*/ -static struct batadv_nc_node -*batadv_nc_get_nc_node(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - bool in_coding) +static struct batadv_nc_node * +batadv_nc_get_nc_node(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node, + struct batadv_orig_node *orig_neigh_node, + bool in_coding) { struct batadv_nc_node *nc_node; spinlock_t *lock; /* Used to lock list selected by "int in_coding" */ @@ -856,8 +854,7 @@ static struct batadv_nc_node if (!nc_node) return NULL; - if (!kref_get_unless_zero(&orig_neigh_node->refcount)) - goto free; + kref_get(&orig_neigh_node->refcount); /* Initialize nc_node */ INIT_LIST_HEAD(&nc_node->list); @@ -884,10 +881,6 @@ static struct batadv_nc_node spin_unlock_bh(lock); return nc_node; - -free: - kfree(nc_node); - return NULL; } /** diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index e4cbb0753e37..1ff4ee473966 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -54,9 +54,9 @@ static void batadv_purge_orig(struct work_struct *work); * @node: node in the local table * @data2: second object to compare the node to * - * Return: 1 if they are the same originator + * Return: true if they are the same originator */ -int batadv_compare_orig(const struct hlist_node *node, const void *data2) +bool batadv_compare_orig(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_orig_node, hash_entry); @@ -250,7 +250,6 @@ static void batadv_neigh_node_release(struct kref *ref) { struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; - struct batadv_hardif_neigh_node *hardif_neigh; struct batadv_neigh_ifinfo *neigh_ifinfo; struct batadv_algo_ops *bao; @@ -262,13 +261,7 @@ static void batadv_neigh_node_release(struct kref *ref) batadv_neigh_ifinfo_put(neigh_ifinfo); } - hardif_neigh = batadv_hardif_neigh_get(neigh_node->if_incoming, - neigh_node->addr); - if (hardif_neigh) { - /* batadv_hardif_neigh_get() increases refcount too */ - batadv_hardif_neigh_put(hardif_neigh); - batadv_hardif_neigh_put(hardif_neigh); - } + batadv_hardif_neigh_put(neigh_node->hardif_neigh); if (bao->bat_neigh_free) bao->bat_neigh_free(neigh_node); @@ -289,7 +282,7 @@ void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) } /** - * batadv_orig_node_get_router - router to the originator depending on iface + * batadv_orig_router_get - router to the originator depending on iface * @orig_node: the orig node for the router * @if_outgoing: the interface where the payload packet has been received or * the OGM should be sent to @@ -381,12 +374,8 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, if (!orig_ifinfo) goto out; - if (if_outgoing != BATADV_IF_DEFAULT && - !kref_get_unless_zero(&if_outgoing->refcount)) { - kfree(orig_ifinfo); - orig_ifinfo = NULL; - goto out; - } + if (if_outgoing != BATADV_IF_DEFAULT) + kref_get(&if_outgoing->refcount); reset_time = jiffies - 1; reset_time -= msecs_to_jiffies(BATADV_RESET_PROTECTION_MS); @@ -462,11 +451,8 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, if (!neigh_ifinfo) goto out; - if (if_outgoing && !kref_get_unless_zero(&if_outgoing->refcount)) { - kfree(neigh_ifinfo); - neigh_ifinfo = NULL; - goto out; - } + if (if_outgoing) + kref_get(&if_outgoing->refcount); INIT_HLIST_NODE(&neigh_ifinfo->list); kref_init(&neigh_ifinfo->refcount); @@ -539,15 +525,11 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, if (hardif_neigh) goto 
out; - if (!kref_get_unless_zero(&hard_iface->refcount)) - goto out; - hardif_neigh = kzalloc(sizeof(*hardif_neigh), GFP_ATOMIC); - if (!hardif_neigh) { - batadv_hardif_put(hard_iface); + if (!hardif_neigh) goto out; - } + kref_get(&hard_iface->refcount); INIT_HLIST_NODE(&hardif_neigh->list); ether_addr_copy(hardif_neigh->addr, neigh_addr); hardif_neigh->if_incoming = hard_iface; @@ -650,19 +632,19 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, if (!neigh_node) goto out; - if (!kref_get_unless_zero(&hard_iface->refcount)) { - kfree(neigh_node); - neigh_node = NULL; - goto out; - } - INIT_HLIST_NODE(&neigh_node->list); INIT_HLIST_HEAD(&neigh_node->ifinfo_list); spin_lock_init(&neigh_node->ifinfo_lock); + kref_get(&hard_iface->refcount); ether_addr_copy(neigh_node->addr, neigh_addr); neigh_node->if_incoming = hard_iface; neigh_node->orig_node = orig_node; + neigh_node->last_seen = jiffies; + + /* increment unique neighbor refcount */ + kref_get(&hardif_neigh->refcount); + neigh_node->hardif_neigh = hardif_neigh; /* extra reference for return */ kref_init(&neigh_node->refcount); @@ -672,9 +654,6 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); spin_unlock_bh(&orig_node->neigh_list_lock); - /* increment unique neighbor refcount */ - kref_get(&hardif_neigh->refcount); - batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv, "Creating new neighbor %pM for orig_node %pM on interface %s\n", neigh_addr, orig_node->orig, hard_iface->net_dev->name); @@ -1165,6 +1144,9 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, if (hard_iface->soft_iface != bat_priv->soft_iface) continue; + if (!kref_get_unless_zero(&hard_iface->refcount)) + continue; + best_neigh_node = batadv_find_best_neighbor(bat_priv, orig_node, hard_iface); @@ -1172,6 +1154,8 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, best_neigh_node); if (best_neigh_node) batadv_neigh_node_put(best_neigh_node); + + batadv_hardif_put(hard_iface); } rcu_read_unlock(); @@ -1222,7 +1206,7 @@ static void batadv_purge_orig(struct work_struct *work) struct delayed_work *delayed_work; struct batadv_priv *bat_priv; - delayed_work = container_of(work, struct delayed_work, work); + delayed_work = to_delayed_work(work); bat_priv = container_of(delayed_work, struct batadv_priv, orig_work); _batadv_purge_orig(bat_priv); queue_delayed_work(batadv_event_workqueue, diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 4e8b67f11051..64a8951e5844 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -33,7 +33,7 @@ struct seq_file; -int batadv_compare_orig(const struct hlist_node *node, const void *data2); +bool batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); void batadv_purge_orig_ref(struct batadv_priv *bat_priv); diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index 8a8d7ca1a5cf..372128ddb474 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -175,6 +175,7 @@ enum batadv_bla_claimframe { BATADV_CLAIM_TYPE_UNCLAIM = 0x01, BATADV_CLAIM_TYPE_ANNOUNCE = 0x02, BATADV_CLAIM_TYPE_REQUEST = 0x03, + BATADV_CLAIM_TYPE_LOOPDETECT = 0x04, }; /** @@ -501,7 +502,7 @@ struct batadv_coded_packet { #pragma pack() /** - * struct batadv_unicast_tvlv - generic unicast packet with tvlv payload + * struct batadv_unicast_tvlv_packet - generic unicast 
 packet with tvlv payload * @packet_type: batman-adv packet type, part of the general header * @version: batman-adv protocol version, part of the general header * @ttl: time to live for this packet, part of the general header diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 4dd646a52f1a..ae850f2d11cb 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -100,11 +100,20 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, if (curr_router) batadv_neigh_node_put(curr_router); + spin_lock_bh(&orig_node->neigh_list_lock); + /* curr_router used earlier may not be the current orig_ifinfo->router + * anymore because it was dereferenced outside of the neigh_list_lock + * protected region. After the new best neighbor has replaced the current + * best neighbor, the reference counter needs to decrease. Consequently, + * the code needs to ensure the curr_router variable contains a pointer + * to the replaced best neighbor. + */ + curr_router = rcu_dereference_protected(orig_ifinfo->router, true); + /* increase refcount of new best neighbor */ - if (neigh_node && !kref_get_unless_zero(&neigh_node->refcount)) - neigh_node = NULL; + if (neigh_node) + kref_get(&neigh_node->refcount); - spin_lock_bh(&orig_node->neigh_list_lock); rcu_assign_pointer(orig_ifinfo->router, neigh_node); spin_unlock_bh(&orig_node->neigh_list_lock); batadv_orig_ifinfo_put(orig_ifinfo); @@ -154,18 +163,18 @@ out: * doesn't change otherwise. * * Return: - * 0 if the packet is to be accepted. - * 1 if the packet is to be ignored. + * false if the packet is to be accepted. + * true if the packet is to be ignored. */ -int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, - s32 seq_old_max_diff, unsigned long *last_reset, - bool *protection_started) +bool batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, + s32 seq_old_max_diff, unsigned long *last_reset, + bool *protection_started) { if (seq_num_diff <= -seq_old_max_diff || seq_num_diff >= BATADV_EXPECTED_SEQNO_RANGE) { if (!batadv_has_timed_out(*last_reset, BATADV_RESET_PROTECTION_MS)) - return 1; + return true; *last_reset = jiffies; if (protection_started) @@ -174,7 +183,7 @@ int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, "old packet received, start protection\n"); } - return 0; + return false; } bool batadv_check_management_packet(struct sk_buff *skb, @@ -709,8 +718,9 @@ out: return ret; } -static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, - struct sk_buff *skb, int hdr_len) { +static bool batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, + struct sk_buff *skb, int hdr_len) +{ struct batadv_unicast_packet *unicast_packet; struct batadv_hard_iface *primary_if; struct batadv_orig_node *orig_node; @@ -721,11 +731,11 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, /* check if there is enough data before accessing it */ if (!pskb_may_pull(skb, hdr_len + ETH_HLEN)) - return 0; + return false; /* create a copy of the skb (in case of re-routing) to modify it. */ if (skb_cow(skb, sizeof(*unicast_packet)) < 0) - return 0; + return false; unicast_packet = (struct batadv_unicast_packet *)skb->data; vid = batadv_get_vid(skb, hdr_len); @@ -749,7 +759,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, * table. If not, let the packet go untouched anyway because * there is nothing the node can do */ - return 1; + return true; } /* retrieve the TTVN known by this node for the packet destination. 
This @@ -765,7 +775,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, * not be possible to deliver it */ if (!orig_node) - return 0; + return false; curr_ttvn = (u8)atomic_read(&orig_node->last_ttvn); batadv_orig_node_put(orig_node); @@ -776,7 +786,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, */ is_old_ttvn = batadv_seq_before(unicast_packet->ttvn, curr_ttvn); if (!is_old_ttvn) - return 1; + return true; old_ttvn = unicast_packet->ttvn; /* the packet was forged based on outdated network information. Its @@ -789,7 +799,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, "Rerouting unicast packet to %pM (dst=%pM): TTVN mismatch old_ttvn=%u new_ttvn=%u\n", unicast_packet->dest, ethhdr->h_dest, old_ttvn, curr_ttvn); - return 1; + return true; } /* the packet has not been re-routed: either the destination is @@ -797,14 +807,14 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, * it is possible to drop the packet */ if (!batadv_is_my_client(bat_priv, ethhdr->h_dest, vid)) - return 0; + return false; /* update the header in order to let the packet be delivered to this * node's soft interface */ primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) - return 0; + return false; ether_addr_copy(unicast_packet->dest, primary_if->net_dev->dev_addr); @@ -812,7 +822,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, unicast_packet->ttvn = curr_ttvn; - return 1; + return true; } /** @@ -903,7 +913,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, hdr_size)) goto rx_success; - batadv_interface_rx(recv_if->soft_iface, skb, recv_if, hdr_size, + batadv_interface_rx(recv_if->soft_iface, skb, hdr_size, orig_node); rx_success: @@ -1113,8 +1123,7 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, goto rx_success; /* broadcast for me */ - batadv_interface_rx(recv_if->soft_iface, skb, recv_if, hdr_size, - orig_node); + batadv_interface_rx(recv_if->soft_iface, skb, hdr_size, orig_node); rx_success: ret = NET_RX_SUCCESS; diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 02a5caa84127..05c3ff42e181 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -51,8 +51,8 @@ struct batadv_neigh_node * batadv_find_router(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if); -int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, - s32 seq_old_max_diff, unsigned long *last_reset, - bool *protection_started); +bool batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, + s32 seq_old_max_diff, unsigned long *last_reset, + bool *protection_started); #endif /* _NET_BATMAN_ADV_ROUTING_H_ */ diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 3ce06e0a91b1..f2f125684ed9 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -26,6 +26,7 @@ #include <linux/if.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/printk.h> @@ -552,7 +553,7 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) struct net_device *soft_iface; struct batadv_priv *bat_priv; - delayed_work = container_of(work, struct delayed_work, work); + delayed_work = to_delayed_work(work); forw_packet = container_of(delayed_work, struct batadv_forw_packet, delayed_work); soft_iface = forw_packet->if_incoming->soft_iface; @@ -577,10 +578,15 @@ static void 
 batadv_send_outstanding_bcast_packet(struct work_struct *work) if (forw_packet->num_packets >= hard_iface->num_bcasts) continue; + if (!kref_get_unless_zero(&hard_iface->refcount)) + continue; + /* send a copy of the saved skb */ skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC); if (skb1) batadv_send_broadcast_skb(skb1, hard_iface); + + batadv_hardif_put(hard_iface); } rcu_read_unlock(); @@ -604,7 +610,7 @@ void batadv_send_outstanding_bat_ogm_packet(struct work_struct *work) struct batadv_forw_packet *forw_packet; struct batadv_priv *bat_priv; - delayed_work = container_of(work, struct delayed_work, work); + delayed_work = to_delayed_work(work); forw_packet = container_of(delayed_work, struct batadv_forw_packet, delayed_work); bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface); @@ -675,6 +681,9 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, if (pending) { hlist_del(&forw_packet->list); + if (!forw_packet->own) + atomic_inc(&bat_priv->bcast_queue_left); + batadv_forw_packet_free(forw_packet); } } @@ -702,6 +711,9 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, if (pending) { hlist_del(&forw_packet->list); + if (!forw_packet->own) + atomic_inc(&bat_priv->batman_queue_left); + batadv_forw_packet_free(forw_packet); } } diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 0710379491bf..343d2c904399 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -186,7 +186,6 @@ static int batadv_interface_tx(struct sk_buff *skb, struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_hard_iface *primary_if = NULL; struct batadv_bcast_packet *bcast_packet; - __be16 ethertype = htons(ETH_P_BATMAN); static const u8 stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00, 0x00, 0x00}; static const u8 ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00, @@ -208,7 +207,7 @@ static int batadv_interface_tx(struct sk_buff *skb, if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) goto dropped; - soft_iface->trans_start = jiffies; + netif_trans_update(soft_iface); vid = batadv_get_vid(skb, 0); ethhdr = eth_hdr(skb); @@ -216,7 +215,8 @@ static int batadv_interface_tx(struct sk_buff *skb, case ETH_P_8021Q: vhdr = vlan_eth_hdr(skb); - if (vhdr->h_vlan_encapsulated_proto != ethertype) { + /* drop batman-in-batman packets to prevent loops */ + if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN)) { network_offset += VLAN_HLEN; break; } @@ -381,13 +381,29 @@ end: return NETDEV_TX_OK; } +/** + * batadv_interface_rx - receive ethernet frame on local batman-adv interface + * @soft_iface: local interface which will receive the ethernet frame + * @skb: ethernet frame for @soft_iface + * @hdr_size: size of already parsed batman-adv header + * @orig_node: originator from which the batman-adv packet was sent + * + * Sends an ethernet frame to the receive path of the local @soft_iface. + * skb->data still points to the batman-adv header with the size @hdr_size. + * The caller has to have parsed this header already and made sure that at least + * @hdr_size bytes are still available for pull in @skb. + * + * The packet may still get dropped. This can happen when the encapsulated + * ethernet frame is invalid or again contains a batman-adv packet. Unicast + * packets are also dropped directly when they were sent between two + * isolated clients. 
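The new kernel-doc pins down the calling convention: the routing path parses the batman-adv header first, guarantees that hdr_size bytes can still be pulled, and only then hands the frame to the soft interface. A minimal sketch of a conforming caller, assuming recv_if, orig_node, skb and hdr_size are in scope as in the receive handlers of this diff and that an out label covers the error case:

	/* the caller, not batadv_interface_rx(), validates the header area */
	if (unlikely(!pskb_may_pull(skb, hdr_size)))
		goto out;

	batadv_interface_rx(recv_if->soft_iface, skb, hdr_size, orig_node);
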
+ */ void batadv_interface_rx(struct net_device *soft_iface, - struct sk_buff *skb, struct batadv_hard_iface *recv_if, - int hdr_size, struct batadv_orig_node *orig_node) + struct sk_buff *skb, int hdr_size, + struct batadv_orig_node *orig_node) { struct batadv_bcast_packet *batadv_bcast_packet; struct batadv_priv *bat_priv = netdev_priv(soft_iface); - __be16 ethertype = htons(ETH_P_BATMAN); struct vlan_ethhdr *vhdr; struct ethhdr *ethhdr; unsigned short vid; @@ -396,10 +412,6 @@ void batadv_interface_rx(struct net_device *soft_iface, batadv_bcast_packet = (struct batadv_bcast_packet *)skb->data; is_bcast = (batadv_bcast_packet->packet_type == BATADV_BCAST); - /* check if enough space is available for pulling, and pull */ - if (!pskb_may_pull(skb, hdr_size)) - goto dropped; - skb_pull_rcsum(skb, hdr_size); skb_reset_mac_header(skb); @@ -408,14 +420,21 @@ void batadv_interface_rx(struct net_device *soft_iface, */ nf_reset(skb); + if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) + goto dropped; + vid = batadv_get_vid(skb, 0); ethhdr = eth_hdr(skb); switch (ntohs(ethhdr->h_proto)) { case ETH_P_8021Q: + if (!pskb_may_pull(skb, VLAN_ETH_HLEN)) + goto dropped; + vhdr = (struct vlan_ethhdr *)skb->data; - if (vhdr->h_vlan_encapsulated_proto != ethertype) + /* drop batman-in-batman packets to prevent loops */ + if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN)) break; /* fall through */ @@ -424,8 +443,6 @@ void batadv_interface_rx(struct net_device *soft_iface, } /* skb->dev & skb->pkt_type are set here */ - if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) - goto dropped; skb->protocol = eth_type_trans(skb, soft_iface); /* should not be necessary anymore as we use skb_pull_rcsum() @@ -539,7 +556,7 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, } /** - * batadv_create_vlan - allocate the needed resources for a new vlan + * batadv_softif_create_vlan - allocate the needed resources for a new vlan * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier * @@ -868,13 +885,14 @@ static int batadv_softif_slave_add(struct net_device *dev, struct net_device *slave_dev) { struct batadv_hard_iface *hard_iface; + struct net *net = dev_net(dev); int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); if (!hard_iface || hard_iface->soft_iface) goto out; - ret = batadv_hardif_enable_interface(hard_iface, dev->name); + ret = batadv_hardif_enable_interface(hard_iface, net, dev->name); out: if (hard_iface) @@ -955,7 +973,7 @@ static void batadv_softif_init_early(struct net_device *dev) dev->netdev_ops = &batadv_netdev_ops; dev->destructor = batadv_softif_free; - dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; + dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL; dev->priv_flags |= IFF_NO_QUEUE; /* can't call min_mtu, because the needed variables @@ -971,7 +989,7 @@ static void batadv_softif_init_early(struct net_device *dev) memset(priv, 0, sizeof(*priv)); } -struct net_device *batadv_softif_create(const char *name) +struct net_device *batadv_softif_create(struct net *net, const char *name) { struct net_device *soft_iface; int ret; @@ -981,6 +999,8 @@ struct net_device *batadv_softif_create(const char *name) if (!soft_iface) return NULL; + dev_net_set(soft_iface, net); + soft_iface->rtnl_link_ops = &batadv_link_ops; ret = register_netdevice(soft_iface); @@ -1025,12 +1045,12 @@ static void batadv_softif_destroy_netlink(struct net_device *soft_iface, unregister_netdevice_queue(soft_iface, head); } -int 
batadv_softif_is_valid(const struct net_device *net_dev) +bool batadv_softif_is_valid(const struct net_device *net_dev) { if (net_dev->netdev_ops->ndo_start_xmit == batadv_interface_tx) - return 1; + return true; - return 0; + return false; } struct rtnl_link_ops batadv_link_ops __read_mostly = { diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 9ae265703d23..ec303ddbf647 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -20,18 +20,20 @@ #include "main.h" +#include <linux/types.h> #include <net/rtnetlink.h> struct net_device; +struct net; struct sk_buff; int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); void batadv_interface_rx(struct net_device *soft_iface, - struct sk_buff *skb, struct batadv_hard_iface *recv_if, - int hdr_size, struct batadv_orig_node *orig_node); -struct net_device *batadv_softif_create(const char *name); + struct sk_buff *skb, int hdr_size, + struct batadv_orig_node *orig_node); +struct net_device *batadv_softif_create(struct net *net, const char *name); void batadv_softif_destroy_sysfs(struct net_device *soft_iface); -int batadv_softif_is_valid(const struct net_device *net_dev); +bool batadv_softif_is_valid(const struct net_device *net_dev); extern struct rtnl_link_ops batadv_link_ops; int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid); void batadv_softif_vlan_put(struct batadv_softif_vlan *softif_vlan); diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index e7cf51333a36..414b2074165f 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -116,11 +116,13 @@ batadv_kobj_to_vlan(struct batadv_priv *bat_priv, struct kobject *obj) static char *batadv_uev_action_str[] = { "add", "del", - "change" + "change", + "loopdetect", }; static char *batadv_uev_type_str[] = { - "gw" + "gw", + "bla", }; /* Use this, if you have customized show and store functions for vlan attrs */ @@ -830,6 +832,7 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj, size_t count) { struct net_device *net_dev = batadv_kobj_to_netdev(kobj); + struct net *net = dev_net(net_dev); struct batadv_hard_iface *hard_iface; int status_tmp = -1; int ret = count; @@ -873,7 +876,7 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj, batadv_hardif_disable_interface(hard_iface, BATADV_IF_CLEANUP_AUTO); - ret = batadv_hardif_enable_interface(hard_iface, buff); + ret = batadv_hardif_enable_interface(hard_iface, net, buff); unlock: rtnl_unlock(); diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 0b43e86328a5..feaf492b01ca 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -43,7 +43,6 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/workqueue.h> -#include <net/net_namespace.h> #include "bridge_loop_avoidance.h" #include "hard-interface.h" @@ -76,9 +75,9 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv, * * Compare the MAC address and the VLAN ID of the two TT entries and check if * they are the same TT client. 
- * Return: 1 if the two TT clients are the same, 0 otherwise + * Return: true if the two TT clients are the same, false otherwise */ -static int batadv_compare_tt(const struct hlist_node *node, const void *data2) +static bool batadv_compare_tt(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_tt_common_entry, hash_entry); @@ -215,6 +214,8 @@ static void batadv_tt_local_entry_release(struct kref *ref) tt_local_entry = container_of(ref, struct batadv_tt_local_entry, common.refcount); + batadv_softif_vlan_put(tt_local_entry->vlan); + kfree_rcu(tt_local_entry, common.rcu); } @@ -583,6 +584,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_tt_local_entry *tt_local; struct batadv_tt_global_entry *tt_global = NULL; + struct net *net = dev_net(soft_iface); struct batadv_softif_vlan *vlan; struct net_device *in_dev = NULL; struct hlist_head *head; @@ -594,7 +596,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, u32 match_mark; if (ifindex != BATADV_NULL_IFINDEX) - in_dev = dev_get_by_index(&init_net, ifindex); + in_dev = dev_get_by_index(net, ifindex); tt_local = batadv_tt_local_hash_find(bat_priv, addr, vid); @@ -673,6 +675,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, kref_get(&tt_local->common.refcount); tt_local->last_seen = jiffies; tt_local->common.added_at = tt_local->last_seen; + tt_local->vlan = vlan; /* the batman interface mac and multicast addresses should never be * purged @@ -991,7 +994,6 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_local_entry *tt_local; struct batadv_hard_iface *primary_if; - struct batadv_softif_vlan *vlan; struct hlist_head *head; unsigned short vid; u32 i; @@ -1008,8 +1010,8 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) seq_printf(seq, "Locally retrieved addresses (from %s) announced via TT (TTVN: %u):\n", net_dev->name, (u8)atomic_read(&bat_priv->tt.vn)); - seq_printf(seq, " %-13s %s %-8s %-9s (%-10s)\n", "Client", "VID", - "Flags", "Last seen", "CRC"); + seq_puts(seq, + " Client VID Flags Last seen (CRC )\n"); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -1027,14 +1029,6 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) last_seen_msecs = last_seen_msecs % 1000; no_purge = tt_common_entry->flags & np_flag; - - vlan = batadv_softif_vlan_get(bat_priv, vid); - if (!vlan) { - seq_printf(seq, "Cannot retrieve VLAN %d\n", - BATADV_PRINT_VID(vid)); - continue; - } - seq_printf(seq, " * %pM %4i [%c%c%c%c%c%c] %3u.%03u (%#.8x)\n", tt_common_entry->addr, @@ -1052,9 +1046,7 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'), no_purge ? 0 : last_seen_secs, no_purge ? 
0 : last_seen_msecs, - vlan->tt.crc); - - batadv_softif_vlan_put(vlan); + tt_local->vlan->tt.crc); } rcu_read_unlock(); } @@ -1099,7 +1091,6 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, { struct batadv_tt_local_entry *tt_local_entry; u16 flags, curr_flags = BATADV_NO_FLAGS; - struct batadv_softif_vlan *vlan; void *tt_entry_exists; tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid); @@ -1139,14 +1130,6 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, /* extra call to free the local tt entry */ batadv_tt_local_entry_put(tt_local_entry); - /* decrease the reference held for this vlan */ - vlan = batadv_softif_vlan_get(bat_priv, vid); - if (!vlan) - goto out; - - batadv_softif_vlan_put(vlan); - batadv_softif_vlan_put(vlan); - out: if (tt_local_entry) batadv_tt_local_entry_put(tt_local_entry); @@ -1219,7 +1202,6 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) spinlock_t *list_lock; /* protects write access to the hash lists */ struct batadv_tt_common_entry *tt_common_entry; struct batadv_tt_local_entry *tt_local; - struct batadv_softif_vlan *vlan; struct hlist_node *node_tmp; struct hlist_head *head; u32 i; @@ -1241,14 +1223,6 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) struct batadv_tt_local_entry, common); - /* decrease the reference held for this vlan */ - vlan = batadv_softif_vlan_get(bat_priv, - tt_common_entry->vid); - if (vlan) { - batadv_softif_vlan_put(vlan); - batadv_softif_vlan_put(vlan); - } - batadv_tt_local_entry_put(tt_local); } spin_unlock_bh(list_lock); @@ -1706,9 +1680,8 @@ int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset) seq_printf(seq, "Globally announced TT entries received via the mesh %s\n", net_dev->name); - seq_printf(seq, " %-13s %s %s %-15s %s (%-10s) %s\n", - "Client", "VID", "(TTVN)", "Originator", "(Curr TTVN)", - "CRC", "Flags"); + seq_puts(seq, + " Client VID (TTVN) Originator (Curr TTVN) (CRC ) Flags\n"); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -2388,19 +2361,19 @@ unlock: * @entry_ptr: to be checked local tt entry * @data_ptr: not used but definition required to satisfy the callback prototype * - * Return: 1 if the entry is a valid, 0 otherwise. + * Return: true if the entry is a valid, false otherwise. 
*/ -static int batadv_tt_local_valid(const void *entry_ptr, const void *data_ptr) +static bool batadv_tt_local_valid(const void *entry_ptr, const void *data_ptr) { const struct batadv_tt_common_entry *tt_common_entry = entry_ptr; if (tt_common_entry->flags & BATADV_TT_CLIENT_NEW) - return 0; - return 1; + return false; + return true; } -static int batadv_tt_global_valid(const void *entry_ptr, - const void *data_ptr) +static bool batadv_tt_global_valid(const void *entry_ptr, + const void *data_ptr) { const struct batadv_tt_common_entry *tt_common_entry = entry_ptr; const struct batadv_tt_global_entry *tt_global_entry; @@ -2408,7 +2381,7 @@ static int batadv_tt_global_valid(const void *entry_ptr, if (tt_common_entry->flags & BATADV_TT_CLIENT_ROAM || tt_common_entry->flags & BATADV_TT_CLIENT_TEMP) - return 0; + return false; tt_global_entry = container_of(tt_common_entry, struct batadv_tt_global_entry, @@ -2430,7 +2403,8 @@ static int batadv_tt_global_valid(const void *entry_ptr, static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, struct batadv_hashtable *hash, void *tvlv_buff, u16 tt_len, - int (*valid_cb)(const void *, const void *), + bool (*valid_cb)(const void *, + const void *), void *cb_data) { struct batadv_tt_common_entry *tt_common_entry; @@ -2579,11 +2553,11 @@ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv, * * Return: true if the TT Request was sent, false otherwise */ -static int batadv_send_tt_request(struct batadv_priv *bat_priv, - struct batadv_orig_node *dst_orig_node, - u8 ttvn, - struct batadv_tvlv_tt_vlan_data *tt_vlan, - u16 num_vlan, bool full_table) +static bool batadv_send_tt_request(struct batadv_priv *bat_priv, + struct batadv_orig_node *dst_orig_node, + u8 ttvn, + struct batadv_tvlv_tt_vlan_data *tt_vlan, + u16 num_vlan, bool full_table) { struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; struct batadv_tt_req_node *tt_req_node = NULL; @@ -3227,7 +3201,7 @@ static void batadv_tt_purge(struct work_struct *work) struct batadv_priv_tt *priv_tt; struct batadv_priv *bat_priv; - delayed_work = container_of(work, struct delayed_work, work); + delayed_work = to_delayed_work(work); priv_tt = container_of(delayed_work, struct batadv_priv_tt, work); bat_priv = container_of(priv_tt, struct batadv_priv, tt); @@ -3309,7 +3283,6 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common; struct batadv_tt_local_entry *tt_local; - struct batadv_softif_vlan *vlan; struct hlist_node *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ @@ -3339,13 +3312,6 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) struct batadv_tt_local_entry, common); - /* decrease the reference held for this vlan */ - vlan = batadv_softif_vlan_get(bat_priv, tt_common->vid); - if (vlan) { - batadv_softif_vlan_put(vlan); - batadv_softif_vlan_put(vlan); - } - batadv_tt_local_entry_put(tt_local); } spin_unlock_bh(list_lock); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 9abfb3e73c34..6a577f4f8ba7 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -433,6 +433,7 @@ struct batadv_hardif_neigh_node { * @ifinfo_lock: lock protecting private ifinfo members and list * @if_incoming: pointer to incoming hard-interface * @last_seen: when last packet via this neighbor was received + * @hardif_neigh: hardif_neigh of this neighbor * @refcount: number 
of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner */ @@ -444,6 +445,7 @@ struct batadv_neigh_node { spinlock_t ifinfo_lock; /* protects ifinfo_list and its members */ struct batadv_hard_iface *if_incoming; unsigned long last_seen; + struct batadv_hardif_neigh_node *hardif_neigh; struct kref refcount; struct rcu_head rcu; }; @@ -655,6 +657,9 @@ struct batadv_priv_tt { * @num_requests: number of bla requests in flight * @claim_hash: hash table containing mesh nodes this host has claimed * @backbone_hash: hash table containing all detected backbone gateways + * @loopdetect_addr: MAC address used for own loopdetection frames + * @loopdetect_lasttime: time when the loopdetection frames were sent + * @loopdetect_next: how many periods to wait for the next loopdetect process * @bcast_duplist: recently received broadcast packets array (for broadcast * duplicate suppression) * @bcast_duplist_curr: index of last broadcast packet added to bcast_duplist @@ -666,6 +671,9 @@ struct batadv_priv_bla { atomic_t num_requests; struct batadv_hashtable *claim_hash; struct batadv_hashtable *backbone_hash; + u8 loopdetect_addr[ETH_ALEN]; + unsigned long loopdetect_lasttime; + atomic_t loopdetect_next; struct batadv_bcast_duplist_entry bcast_duplist[BATADV_DUPLIST_SIZE]; int bcast_duplist_curr; /* protects bcast_duplist & bcast_duplist_curr */ @@ -1010,6 +1018,7 @@ struct batadv_socket_packet { * resolved * @crc: crc16 checksum over all claims * @crc_lock: lock protecting crc + * @report_work: work struct for reporting detected loops * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner */ @@ -1023,6 +1032,7 @@ struct batadv_bla_backbone_gw { atomic_t request_sent; u16 crc; spinlock_t crc_lock; /* protects crc */ + struct work_struct report_work; struct kref refcount; struct rcu_head rcu; }; @@ -1073,10 +1083,12 @@ struct batadv_tt_common_entry { * struct batadv_tt_local_entry - translation table local entry data * @common: general translation table data * @last_seen: timestamp used for purging stale tt local entries + * @vlan: soft-interface vlan of the entry */ struct batadv_tt_local_entry { struct batadv_tt_common_entry common; unsigned long last_seen; + struct batadv_softif_vlan *vlan; }; /** @@ -1250,6 +1262,8 @@ struct batadv_forw_packet { * struct batadv_algo_ops - mesh algorithm callbacks * @list: list node for the batadv_algo_list * @name: name of the algorithm + * @bat_iface_activate: start routing mechanisms when hard-interface is brought + * up * @bat_iface_enable: init routing info when hard-interface is enabled * @bat_iface_disable: de-init routing info when hard-interface is disabled * @bat_iface_update_mac: (re-)init mac addresses of the protocol information @@ -1277,6 +1291,7 @@ struct batadv_forw_packet { struct batadv_algo_ops { struct hlist_node list; char *name; + void (*bat_iface_activate)(struct batadv_hard_iface *hard_iface); int (*bat_iface_enable)(struct batadv_hard_iface *hard_iface); void (*bat_iface_disable)(struct batadv_hard_iface *hard_iface); void (*bat_iface_update_mac)(struct batadv_hard_iface *hard_iface); diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 8a4cc2f7f0db..780089d75915 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -68,7 +68,7 @@ struct lowpan_peer { struct in6_addr peer_addr; }; -struct lowpan_dev { +struct lowpan_btle_dev { struct list_head list; struct hci_dev *hdev; @@ -80,18 +80,21 @@ struct lowpan_dev { struct delayed_work 
notify_peers; }; -static inline struct lowpan_dev *lowpan_dev(const struct net_device *netdev) +static inline struct lowpan_btle_dev * +lowpan_btle_dev(const struct net_device *netdev) { - return (struct lowpan_dev *)lowpan_priv(netdev)->priv; + return (struct lowpan_btle_dev *)lowpan_dev(netdev)->priv; } -static inline void peer_add(struct lowpan_dev *dev, struct lowpan_peer *peer) +static inline void peer_add(struct lowpan_btle_dev *dev, + struct lowpan_peer *peer) { list_add_rcu(&peer->list, &dev->peers); atomic_inc(&dev->peer_count); } -static inline bool peer_del(struct lowpan_dev *dev, struct lowpan_peer *peer) +static inline bool peer_del(struct lowpan_btle_dev *dev, + struct lowpan_peer *peer) { list_del_rcu(&peer->list); kfree_rcu(peer, rcu); @@ -106,7 +109,7 @@ static inline bool peer_del(struct lowpan_dev *dev, struct lowpan_peer *peer) return false; } -static inline struct lowpan_peer *peer_lookup_ba(struct lowpan_dev *dev, +static inline struct lowpan_peer *peer_lookup_ba(struct lowpan_btle_dev *dev, bdaddr_t *ba, __u8 type) { struct lowpan_peer *peer; @@ -134,8 +137,8 @@ static inline struct lowpan_peer *peer_lookup_ba(struct lowpan_dev *dev, return NULL; } -static inline struct lowpan_peer *__peer_lookup_chan(struct lowpan_dev *dev, - struct l2cap_chan *chan) +static inline struct lowpan_peer * +__peer_lookup_chan(struct lowpan_btle_dev *dev, struct l2cap_chan *chan) { struct lowpan_peer *peer; @@ -147,8 +150,8 @@ static inline struct lowpan_peer *__peer_lookup_chan(struct lowpan_dev *dev, return NULL; } -static inline struct lowpan_peer *__peer_lookup_conn(struct lowpan_dev *dev, - struct l2cap_conn *conn) +static inline struct lowpan_peer * +__peer_lookup_conn(struct lowpan_btle_dev *dev, struct l2cap_conn *conn) { struct lowpan_peer *peer; @@ -160,7 +163,7 @@ static inline struct lowpan_peer *__peer_lookup_conn(struct lowpan_dev *dev, return NULL; } -static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_dev *dev, +static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev, struct in6_addr *daddr, struct sk_buff *skb) { @@ -220,7 +223,7 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_dev *dev, static struct lowpan_peer *lookup_peer(struct l2cap_conn *conn) { - struct lowpan_dev *entry; + struct lowpan_btle_dev *entry; struct lowpan_peer *peer = NULL; rcu_read_lock(); @@ -236,10 +239,10 @@ static struct lowpan_peer *lookup_peer(struct l2cap_conn *conn) return peer; } -static struct lowpan_dev *lookup_dev(struct l2cap_conn *conn) +static struct lowpan_btle_dev *lookup_dev(struct l2cap_conn *conn) { - struct lowpan_dev *entry; - struct lowpan_dev *dev = NULL; + struct lowpan_btle_dev *entry; + struct lowpan_btle_dev *dev = NULL; rcu_read_lock(); @@ -270,10 +273,10 @@ static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev, struct l2cap_chan *chan) { const u8 *saddr, *daddr; - struct lowpan_dev *dev; + struct lowpan_btle_dev *dev; struct lowpan_peer *peer; - dev = lowpan_dev(netdev); + dev = lowpan_btle_dev(netdev); rcu_read_lock(); peer = __peer_lookup_chan(dev, chan); @@ -375,7 +378,7 @@ drop: /* Packet from BT LE device */ static int chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) { - struct lowpan_dev *dev; + struct lowpan_btle_dev *dev; struct lowpan_peer *peer; int err; @@ -431,15 +434,18 @@ static int setup_header(struct sk_buff *skb, struct net_device *netdev, bdaddr_t *peer_addr, u8 *peer_addr_type) { struct in6_addr ipv6_daddr; - struct lowpan_dev *dev; + struct ipv6hdr *hdr; + struct 
lowpan_btle_dev *dev; struct lowpan_peer *peer; bdaddr_t addr, *any = BDADDR_ANY; u8 *daddr = any->b; int err, status = 0; - dev = lowpan_dev(netdev); + hdr = ipv6_hdr(skb); + + dev = lowpan_btle_dev(netdev); - memcpy(&ipv6_daddr, &lowpan_cb(skb)->addr, sizeof(ipv6_daddr)); + memcpy(&ipv6_daddr, &hdr->daddr, sizeof(ipv6_daddr)); if (ipv6_addr_is_multicast(&ipv6_daddr)) { lowpan_cb(skb)->chan = NULL; @@ -489,15 +495,9 @@ static int header_create(struct sk_buff *skb, struct net_device *netdev, unsigned short type, const void *_daddr, const void *_saddr, unsigned int len) { - struct ipv6hdr *hdr; - if (type != ETH_P_IPV6) return -EINVAL; - hdr = ipv6_hdr(skb); - - memcpy(&lowpan_cb(skb)->addr, &hdr->daddr, sizeof(struct in6_addr)); - return 0; } @@ -543,19 +543,19 @@ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb, static int send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) { struct sk_buff *local_skb; - struct lowpan_dev *entry; + struct lowpan_btle_dev *entry; int err = 0; rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { struct lowpan_peer *pentry; - struct lowpan_dev *dev; + struct lowpan_btle_dev *dev; if (entry->netdev != netdev) continue; - dev = lowpan_dev(entry->netdev); + dev = lowpan_btle_dev(entry->netdev); list_for_each_entry_rcu(pentry, &dev->peers, list) { int ret; @@ -723,8 +723,8 @@ static void ifdown(struct net_device *netdev) static void do_notify_peers(struct work_struct *work) { - struct lowpan_dev *dev = container_of(work, struct lowpan_dev, - notify_peers.work); + struct lowpan_btle_dev *dev = container_of(work, struct lowpan_btle_dev, + notify_peers.work); netdev_notify_peers(dev->netdev); /* send neighbour adv at startup */ } @@ -766,7 +766,7 @@ static void set_ip_addr_bits(u8 addr_type, u8 *addr) } static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, - struct lowpan_dev *dev) + struct lowpan_btle_dev *dev) { struct lowpan_peer *peer; @@ -803,12 +803,12 @@ static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, return peer->chan; } -static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev) +static int setup_netdev(struct l2cap_chan *chan, struct lowpan_btle_dev **dev) { struct net_device *netdev; int err = 0; - netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_dev)), + netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_btle_dev)), IFACE_NAME_TEMPLATE, NET_NAME_UNKNOWN, netdev_setup); if (!netdev) @@ -820,7 +820,7 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev) SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev); SET_NETDEV_DEVTYPE(netdev, &bt_type); - *dev = lowpan_dev(netdev); + *dev = lowpan_btle_dev(netdev); (*dev)->netdev = netdev; (*dev)->hdev = chan->conn->hcon->hdev; INIT_LIST_HEAD(&(*dev)->peers); @@ -853,7 +853,7 @@ out: static inline void chan_ready_cb(struct l2cap_chan *chan) { - struct lowpan_dev *dev; + struct lowpan_btle_dev *dev; dev = lookup_dev(chan->conn); @@ -890,8 +890,9 @@ static inline struct l2cap_chan *chan_new_conn_cb(struct l2cap_chan *pchan) static void delete_netdev(struct work_struct *work) { - struct lowpan_dev *entry = container_of(work, struct lowpan_dev, - delete_netdev); + struct lowpan_btle_dev *entry = container_of(work, + struct lowpan_btle_dev, + delete_netdev); lowpan_unregister_netdev(entry->netdev); @@ -900,8 +901,8 @@ static void delete_netdev(struct work_struct *work) static void chan_close_cb(struct l2cap_chan *chan) { - struct lowpan_dev *entry; - struct lowpan_dev *dev = NULL; + 
struct lowpan_btle_dev *entry; + struct lowpan_btle_dev *dev = NULL; struct lowpan_peer *peer; int err = -ENOENT; bool last = false, remove = true; @@ -921,7 +922,7 @@ static void chan_close_cb(struct l2cap_chan *chan) spin_lock(&devices_lock); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { - dev = lowpan_dev(entry->netdev); + dev = lowpan_btle_dev(entry->netdev); peer = __peer_lookup_chan(dev, chan); if (peer) { last = peer_del(dev, peer); @@ -1131,7 +1132,7 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, static void disconnect_all_peers(void) { - struct lowpan_dev *entry; + struct lowpan_btle_dev *entry; struct lowpan_peer *peer, *tmp_peer, *new_peer; struct list_head peers; @@ -1291,7 +1292,7 @@ static ssize_t lowpan_control_write(struct file *fp, static int lowpan_control_show(struct seq_file *f, void *ptr) { - struct lowpan_dev *entry; + struct lowpan_btle_dev *entry; struct lowpan_peer *peer; spin_lock(&devices_lock); @@ -1322,7 +1323,7 @@ static const struct file_operations lowpan_control_fops = { static void disconnect_devices(void) { - struct lowpan_dev *entry, *tmp, *new_dev; + struct lowpan_btle_dev *entry, *tmp, *new_dev; struct list_head devices; INIT_LIST_HEAD(&devices); @@ -1360,7 +1361,7 @@ static int device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); - struct lowpan_dev *entry; + struct lowpan_btle_dev *entry; if (netdev->type != ARPHRD_6LOWPAN) return NOTIFY_DONE; diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c index 6ceb5d36a32b..f4fcb4a9d5c1 100644 --- a/net/bluetooth/bnep/netdev.c +++ b/net/bluetooth/bnep/netdev.c @@ -188,7 +188,7 @@ static netdev_tx_t bnep_net_xmit(struct sk_buff *skb, * So we have to queue them and wake up session thread which is sleeping * on the sk_sleep(sk). 
*/ - dev->trans_start = jiffies; + netif_trans_update(dev); skb_queue_tail(&sk->sk_write_queue, skb); wake_up_interruptible(sk_sleep(sk)); diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index f8fc6241469a..d99b2009771a 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -21,18 +21,19 @@ #include <asm/uaccess.h> #include "br_private.h" -/* called with RTNL */ static int get_bridge_ifindices(struct net *net, int *indices, int num) { struct net_device *dev; int i = 0; - for_each_netdev(net, dev) { + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { if (i >= num) break; if (dev->priv_flags & IFF_EBRIDGE) indices[i++] = dev->ifindex; } + rcu_read_unlock(); return i; } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 253bc77eda3b..7dbc80d01eb0 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -61,6 +61,19 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) e->flags |= MDB_FLAGS_OFFLOAD; } +static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip) +{ + memset(ip, 0, sizeof(struct br_ip)); + ip->vid = entry->vid; + ip->proto = entry->addr.proto; + if (ip->proto == htons(ETH_P_IP)) + ip->u.ip4 = entry->addr.u.ip4; +#if IS_ENABLED(CONFIG_IPV6) + else + ip->u.ip6 = entry->addr.u.ip6; +#endif +} + static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev) { @@ -243,9 +256,45 @@ static inline size_t rtnl_mdb_nlmsg_size(void) + nla_total_size(sizeof(struct br_mdb_entry)); } -static void __br_mdb_notify(struct net_device *dev, struct br_mdb_entry *entry, - int type, struct net_bridge_port_group *pg) +struct br_mdb_complete_info { + struct net_bridge_port *port; + struct br_ip ip; +}; + +static void br_mdb_complete(struct net_device *dev, int err, void *priv) { + struct br_mdb_complete_info *data = priv; + struct net_bridge_port_group __rcu **pp; + struct net_bridge_port_group *p; + struct net_bridge_mdb_htable *mdb; + struct net_bridge_mdb_entry *mp; + struct net_bridge_port *port = data->port; + struct net_bridge *br = port->br; + + if (err) + goto err; + + spin_lock_bh(&br->multicast_lock); + mdb = mlock_dereference(br->mdb, br); + mp = br_mdb_ip_get(mdb, &data->ip); + if (!mp) + goto out; + for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; + pp = &p->next) { + if (p->port != port) + continue; + p->flags |= MDB_PG_FLAGS_OFFLOAD; + } +out: + spin_unlock_bh(&br->multicast_lock); +err: + kfree(priv); +} + +static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p, + struct br_mdb_entry *entry, int type) +{ + struct br_mdb_complete_info *complete_info; struct switchdev_obj_port_mdb mdb = { .obj = { .id = SWITCHDEV_OBJ_ID_PORT_MDB, @@ -268,9 +317,14 @@ static void __br_mdb_notify(struct net_device *dev, struct br_mdb_entry *entry, mdb.obj.orig_dev = port_dev; if (port_dev && type == RTM_NEWMDB) { - err = switchdev_port_obj_add(port_dev, &mdb.obj); - if (!err && pg) - pg->flags |= MDB_PG_FLAGS_OFFLOAD; + complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC); + if (complete_info) { + complete_info->port = p; + __mdb_entry_to_br_ip(entry, &complete_info->ip); + mdb.obj.complete_priv = complete_info; + mdb.obj.complete = br_mdb_complete; + switchdev_port_obj_add(port_dev, &mdb.obj); + } } else if (port_dev && type == RTM_DELMDB) { switchdev_port_obj_del(port_dev, &mdb.obj); } @@ -291,21 +345,21 @@ errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } -void br_mdb_notify(struct net_device *dev, struct net_bridge_port_group *pg, - int 
type) +void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, + struct br_ip *group, int type, u8 flags) { struct br_mdb_entry entry; memset(&entry, 0, sizeof(entry)); - entry.ifindex = pg->port->dev->ifindex; - entry.addr.proto = pg->addr.proto; - entry.addr.u.ip4 = pg->addr.u.ip4; + entry.ifindex = port->dev->ifindex; + entry.addr.proto = group->proto; + entry.addr.u.ip4 = group->u.ip4; #if IS_ENABLED(CONFIG_IPV6) - entry.addr.u.ip6 = pg->addr.u.ip6; + entry.addr.u.ip6 = group->u.ip6; #endif - entry.vid = pg->addr.vid; - __mdb_entry_fill_flags(&entry, pg->flags); - __br_mdb_notify(dev, &entry, type, pg); + entry.vid = group->vid; + __mdb_entry_fill_flags(&entry, flags); + __br_mdb_notify(dev, port, &entry, type); } static int nlmsg_populate_rtr_fill(struct sk_buff *skb, @@ -450,8 +504,7 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, } static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, - struct br_ip *group, unsigned char state, - struct net_bridge_port_group **pg) + struct br_ip *group, unsigned char state) { struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; @@ -482,7 +535,6 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, if (unlikely(!p)) return -ENOMEM; rcu_assign_pointer(*pp, p); - *pg = p; if (state == MDB_TEMPORARY) mod_timer(&p->timer, now + br->multicast_membership_interval); @@ -490,8 +542,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, } static int __br_mdb_add(struct net *net, struct net_bridge *br, - struct br_mdb_entry *entry, - struct net_bridge_port_group **pg) + struct br_mdb_entry *entry) { struct br_ip ip; struct net_device *dev; @@ -509,18 +560,10 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, if (!p || p->br != br || p->state == BR_STATE_DISABLED) return -EINVAL; - memset(&ip, 0, sizeof(ip)); - ip.vid = entry->vid; - ip.proto = entry->addr.proto; - if (ip.proto == htons(ETH_P_IP)) - ip.u.ip4 = entry->addr.u.ip4; -#if IS_ENABLED(CONFIG_IPV6) - else - ip.u.ip6 = entry->addr.u.ip6; -#endif + __mdb_entry_to_br_ip(entry, &ip); spin_lock_bh(&br->multicast_lock); - ret = br_mdb_add_group(br, p, &ip, entry->state, pg); + ret = br_mdb_add_group(br, p, &ip, entry->state); spin_unlock_bh(&br->multicast_lock); return ret; } @@ -528,7 +571,6 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); - struct net_bridge_port_group *pg; struct net_bridge_vlan_group *vg; struct net_device *dev, *pdev; struct br_mdb_entry *entry; @@ -558,15 +600,15 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) if (br_vlan_enabled(br) && vg && entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { entry->vid = v->vid; - err = __br_mdb_add(net, br, entry, &pg); + err = __br_mdb_add(net, br, entry); if (err) break; - __br_mdb_notify(dev, entry, RTM_NEWMDB, pg); + __br_mdb_notify(dev, p, entry, RTM_NEWMDB); } } else { - err = __br_mdb_add(net, br, entry, &pg); + err = __br_mdb_add(net, br, entry); if (!err) - __br_mdb_notify(dev, entry, RTM_NEWMDB, pg); + __br_mdb_notify(dev, p, entry, RTM_NEWMDB); } return err; @@ -584,15 +626,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) if (!netif_running(br->dev) || br->multicast_disabled) return -EINVAL; - memset(&ip, 0, sizeof(ip)); - ip.vid = entry->vid; - ip.proto = entry->addr.proto; - if (ip.proto == 
htons(ETH_P_IP)) - ip.u.ip4 = entry->addr.u.ip4; -#if IS_ENABLED(CONFIG_IPV6) - else - ip.u.ip6 = entry->addr.u.ip6; -#endif + __mdb_entry_to_br_ip(entry, &ip); spin_lock_bh(&br->multicast_lock); mdb = mlock_dereference(br->mdb, br); @@ -662,12 +696,12 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) entry->vid = v->vid; err = __br_mdb_del(br, entry); if (!err) - __br_mdb_notify(dev, entry, RTM_DELMDB, NULL); + __br_mdb_notify(dev, p, entry, RTM_DELMDB); } } else { err = __br_mdb_del(br, entry); if (!err) - __br_mdb_notify(dev, entry, RTM_DELMDB, NULL); + __br_mdb_notify(dev, p, entry, RTM_DELMDB); } return err; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index a4c15df2b792..6852f3c7009c 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -283,7 +283,8 @@ static void br_multicast_del_pg(struct net_bridge *br, rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); - br_mdb_notify(br->dev, p, RTM_DELMDB); + br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB, + p->flags); call_rcu_bh(&p->rcu, br_multicast_free_pg); if (!mp->ports && !mp->mglist && @@ -705,7 +706,7 @@ static int br_multicast_add_group(struct net_bridge *br, if (unlikely(!p)) goto err; rcu_assign_pointer(*pp, p); - br_mdb_notify(br->dev, p, RTM_NEWMDB); + br_mdb_notify(br->dev, port, group, RTM_NEWMDB, 0); found: mod_timer(&p->timer, now + br->multicast_membership_interval); @@ -1278,6 +1279,7 @@ static int br_ip4_multicast_query(struct net_bridge *br, struct br_ip saddr; unsigned long max_delay; unsigned long now = jiffies; + unsigned int offset = skb_transport_offset(skb); __be32 group; int err = 0; @@ -1288,14 +1290,14 @@ static int br_ip4_multicast_query(struct net_bridge *br, group = ih->group; - if (skb->len == sizeof(*ih)) { + if (skb->len == offset + sizeof(*ih)) { max_delay = ih->code * (HZ / IGMP_TIMER_SCALE); if (!max_delay) { max_delay = 10 * HZ; group = 0; } - } else if (skb->len >= sizeof(*ih3)) { + } else if (skb->len >= offset + sizeof(*ih3)) { ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs) goto out; @@ -1356,6 +1358,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, struct br_ip saddr; unsigned long max_delay; unsigned long now = jiffies; + unsigned int offset = skb_transport_offset(skb); const struct in6_addr *group = NULL; bool is_general_query; int err = 0; @@ -1365,8 +1368,8 @@ static int br_ip6_multicast_query(struct net_bridge *br, (port && port->state == BR_STATE_DISABLED)) goto out; - if (skb->len == sizeof(*mld)) { - if (!pskb_may_pull(skb, sizeof(*mld))) { + if (skb->len == offset + sizeof(*mld)) { + if (!pskb_may_pull(skb, offset + sizeof(*mld))) { err = -EINVAL; goto out; } @@ -1375,7 +1378,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, if (max_delay) group = &mld->mld_mca; } else { - if (!pskb_may_pull(skb, sizeof(*mld2q))) { + if (!pskb_may_pull(skb, offset + sizeof(*mld2q))) { err = -EINVAL; goto out; } @@ -1461,7 +1464,8 @@ br_multicast_leave_group(struct net_bridge *br, hlist_del_init(&p->mglist); del_timer(&p->timer); call_rcu_bh(&p->rcu, br_multicast_free_pg); - br_mdb_notify(br->dev, p, RTM_DELMDB); + br_mdb_notify(br->dev, port, group, RTM_DELMDB, + p->flags); if (!mp->ports && !mp->mglist && netif_running(br->dev)) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 44114a94c576..2d25979273a6 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -217,13 +217,13 @@ static int br_validate_ipv4(struct net *net, 
struct sk_buff *skb) len = ntohs(iph->tot_len); if (skb->len < len) { - IP_INC_STATS_BH(net, IPSTATS_MIB_INTRUNCATEDPKTS); + __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; if (pskb_trim_rcsum(skb, len)) { - IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS); + __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto drop; } @@ -236,7 +236,7 @@ static int br_validate_ipv4(struct net *net, struct sk_buff *skb) return 0; inhdr_error: - IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS); + __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: return -1; } diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index d61f56efc8dc..5e59a8457e7b 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -122,13 +122,13 @@ int br_validate_ipv6(struct net *net, struct sk_buff *skb) if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { if (pkt_len + ip6h_len > skb->len) { - IP6_INC_STATS_BH(net, idev, - IPSTATS_MIB_INTRUNCATEDPKTS); + __IP6_INC_STATS(net, idev, + IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) { - IP6_INC_STATS_BH(net, idev, - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, idev, + IPSTATS_MIB_INDISCARDS); goto drop; } } @@ -142,7 +142,7 @@ int br_validate_ipv6(struct net *net, struct sk_buff *skb) return 0; inhdr_error: - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); drop: return -1; } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 6bae1125e36d..a5343c7232bf 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -850,6 +850,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_NF_CALL_IP6TABLES] = { .type = NLA_U8 }, [IFLA_BR_NF_CALL_ARPTABLES] = { .type = NLA_U8 }, [IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 }, + [IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -921,6 +922,14 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (err) return err; } + + if (data[IFLA_BR_VLAN_STATS_ENABLED]) { + __u8 vlan_stats = nla_get_u8(data[IFLA_BR_VLAN_STATS_ENABLED]); + + err = br_vlan_set_stats(br, vlan_stats); + if (err) + return err; + } #endif if (data[IFLA_BR_GROUP_FWD_MASK]) { @@ -1082,6 +1091,7 @@ static size_t br_get_size(const struct net_device *brdev) #ifdef CONFIG_BRIDGE_VLAN_FILTERING nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */ nla_total_size(sizeof(u16)) + /* IFLA_BR_VLAN_DEFAULT_PVID */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_ENABLED */ #endif nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */ @@ -1167,7 +1177,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) #ifdef CONFIG_BRIDGE_VLAN_FILTERING if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) || - nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid)) + nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid) || + nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED, br->vlan_stats_enabled)) return -EMSGSIZE; #endif #ifdef CONFIG_BRIDGE_IGMP_SNOOPING @@ -1223,6 +1234,69 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) return 0; } +static size_t br_get_linkxstats_size(const struct net_device *dev) +{ + struct net_bridge *br = netdev_priv(dev); + struct net_bridge_vlan_group *vg; + struct 
net_bridge_vlan *v; + int numvls = 0; + + vg = br_vlan_group(br); + if (!vg) + return 0; + + /* we need to count all, even placeholder entries */ + list_for_each_entry(v, &vg->vlan_list, vlist) + numvls++; + + /* account for the vlans and the link xstats type nest attribute */ + return numvls * nla_total_size(sizeof(struct bridge_vlan_xstats)) + + nla_total_size(0); +} + +static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev, + int *prividx) +{ + struct net_bridge *br = netdev_priv(dev); + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct nlattr *nest; + int vl_idx = 0; + + vg = br_vlan_group(br); + if (!vg) + goto out; + nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE); + if (!nest) + return -EMSGSIZE; + list_for_each_entry(v, &vg->vlan_list, vlist) { + struct bridge_vlan_xstats vxi; + struct br_vlan_stats stats; + + if (vl_idx++ < *prividx) + continue; + memset(&vxi, 0, sizeof(vxi)); + vxi.vid = v->vid; + br_vlan_get_stats(v, &stats); + vxi.rx_bytes = stats.rx_bytes; + vxi.rx_packets = stats.rx_packets; + vxi.tx_bytes = stats.tx_bytes; + vxi.tx_packets = stats.tx_packets; + + if (nla_put(skb, BRIDGE_XSTATS_VLAN, sizeof(vxi), &vxi)) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + *prividx = 0; +out: + return 0; + +nla_put_failure: + nla_nest_end(skb, nest); + *prividx = vl_idx; + + return -EMSGSIZE; +} static struct rtnl_af_ops br_af_ops __read_mostly = { .family = AF_BRIDGE, @@ -1241,6 +1315,8 @@ struct rtnl_link_ops br_link_ops __read_mostly = { .dellink = br_dev_delete, .get_size = br_get_size, .fill_info = br_fill_info, + .fill_linkxstats = br_fill_linkxstats, + .get_linkxstats_size = br_get_linkxstats_size, .slave_maxtype = IFLA_BRPORT_MAX, .slave_policy = br_port_policy, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1b5d145dfcbf..c7fb5d7a7218 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -77,12 +77,21 @@ struct bridge_mcast_querier { }; #endif +struct br_vlan_stats { + u64 rx_bytes; + u64 rx_packets; + u64 tx_bytes; + u64 tx_packets; + struct u64_stats_sync syncp; +}; + /** * struct net_bridge_vlan - per-vlan entry * * @vnode: rhashtable member * @vid: VLAN id * @flags: bridge vlan flags + * @stats: per-cpu VLAN statistics * @br: if MASTER flag set, this points to a bridge struct * @port: if MASTER flag unset, this points to a port struct * @refcnt: if MASTER flag set, this is bumped for each port referencing it @@ -100,6 +109,7 @@ struct net_bridge_vlan { struct rhash_head vnode; u16 vid; u16 flags; + struct br_vlan_stats __percpu *stats; union { struct net_bridge *br; struct net_bridge_port *port; @@ -342,6 +352,7 @@ struct net_bridge #ifdef CONFIG_BRIDGE_VLAN_FILTERING struct net_bridge_vlan_group __rcu *vlgrp; u8 vlan_enabled; + u8 vlan_stats_enabled; __be16 vlan_proto; u16 default_pvid; #endif @@ -560,8 +571,8 @@ br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group, unsigned char flags); void br_mdb_init(void); void br_mdb_uninit(void); -void br_mdb_notify(struct net_device *dev, struct net_bridge_port_group *pg, - int type); +void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, + struct br_ip *group, int type, u8 flags); void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, int type); @@ -691,6 +702,7 @@ int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int __br_vlan_set_proto(struct net_bridge *br, __be16 proto); int 
br_vlan_set_proto(struct net_bridge *br, unsigned long val); +int br_vlan_set_stats(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid); @@ -699,6 +711,8 @@ int nbp_vlan_delete(struct net_bridge_port *port, u16 vid); void nbp_vlan_flush(struct net_bridge_port *port); int nbp_vlan_init(struct net_bridge_port *port); int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask); +void br_vlan_get_stats(const struct net_bridge_vlan *v, + struct br_vlan_stats *stats); static inline struct net_bridge_vlan_group *br_vlan_group( const struct net_bridge *br) @@ -881,6 +895,10 @@ static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu( return NULL; } +static inline void br_vlan_get_stats(const struct net_bridge_vlan *v, + struct br_vlan_stats *stats) +{ +} #endif struct nf_br_ops { diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 70bddfd0f3e9..beb47071e38d 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -731,6 +731,22 @@ static ssize_t default_pvid_store(struct device *d, return store_bridge_parm(d, buf, len, br_vlan_set_default_pvid); } static DEVICE_ATTR_RW(default_pvid); + +static ssize_t vlan_stats_enabled_show(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%u\n", br->vlan_stats_enabled); +} + +static ssize_t vlan_stats_enabled_store(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, br_vlan_set_stats); +} +static DEVICE_ATTR_RW(vlan_stats_enabled); #endif static struct attribute *bridge_attrs[] = { @@ -778,6 +794,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_vlan_filtering.attr, &dev_attr_vlan_protocol.attr, &dev_attr_default_pvid.attr, + &dev_attr_vlan_stats_enabled.attr, #endif NULL }; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index e001152d6ad1..b6de4f457161 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -162,6 +162,17 @@ static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid return masterv; } +static void br_master_vlan_rcu_free(struct rcu_head *rcu) +{ + struct net_bridge_vlan *v; + + v = container_of(rcu, struct net_bridge_vlan, rcu); + WARN_ON(!br_vlan_is_master(v)); + free_percpu(v->stats); + v->stats = NULL; + kfree(v); +} + static void br_vlan_put_master(struct net_bridge_vlan *masterv) { struct net_bridge_vlan_group *vg; @@ -174,7 +185,7 @@ static void br_vlan_put_master(struct net_bridge_vlan *masterv) rhashtable_remove_fast(&vg->vlan_hash, &masterv->vnode, br_vlan_rht_params); __vlan_del_list(masterv); - kfree_rcu(masterv, rcu); + call_rcu(&masterv->rcu, br_master_vlan_rcu_free); } } @@ -230,6 +241,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) if (!masterv) goto out_filt; v->brvlan = masterv; + v->stats = masterv->stats; } /* Add the dev mac and count the vlan only if it's usable */ @@ -329,6 +341,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb) { + struct br_vlan_stats *stats; struct net_bridge_vlan *v; u16 vid; @@ -355,18 +368,27 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, return NULL; } } + if (br->vlan_stats_enabled) { + stats = this_cpu_ptr(v->stats); + u64_stats_update_begin(&stats->syncp); + stats->tx_bytes += skb->len; + 
stats->tx_packets++; + u64_stats_update_end(&stats->syncp); + } + if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) skb->vlan_tci = 0; - out: return skb; } /* Called under RCU */ -static bool __allowed_ingress(struct net_bridge_vlan_group *vg, __be16 proto, +static bool __allowed_ingress(const struct net_bridge *br, + struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid) { - const struct net_bridge_vlan *v; + struct br_vlan_stats *stats; + struct net_bridge_vlan *v; bool tagged; BR_INPUT_SKB_CB(skb)->vlan_filtered = true; @@ -375,7 +397,7 @@ static bool __allowed_ingress(struct net_bridge_vlan_group *vg, __be16 proto, * HW accelerated vlan tag. */ if (unlikely(!skb_vlan_tag_present(skb) && - skb->protocol == proto)) { + skb->protocol == br->vlan_proto)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) return false; @@ -383,7 +405,7 @@ static bool __allowed_ingress(struct net_bridge_vlan_group *vg, __be16 proto, if (!br_vlan_get_tag(skb, vid)) { /* Tagged frame */ - if (skb->vlan_proto != proto) { + if (skb->vlan_proto != br->vlan_proto) { /* Protocol-mismatch, empty out vlan_tci for new tag */ skb_push(skb, ETH_HLEN); skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, @@ -419,7 +441,7 @@ static bool __allowed_ingress(struct net_bridge_vlan_group *vg, __be16 proto, *vid = pvid; if (likely(!tagged)) /* Untagged Frame. */ - __vlan_hwaccel_put_tag(skb, proto, pvid); + __vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid); else /* Priority-tagged Frame. * At this point, We know that skb->vlan_tci had @@ -428,13 +450,24 @@ static bool __allowed_ingress(struct net_bridge_vlan_group *vg, __be16 proto, */ skb->vlan_tci |= pvid; - return true; + /* if stats are disabled we can avoid the lookup */ + if (!br->vlan_stats_enabled) + return true; } - - /* Frame had a valid vlan tag. See if vlan is allowed */ v = br_vlan_find(vg, *vid); - if (v && br_vlan_should_use(v)) - return true; + if (!v || !br_vlan_should_use(v)) + goto drop; + + if (br->vlan_stats_enabled) { + stats = this_cpu_ptr(v->stats); + u64_stats_update_begin(&stats->syncp); + stats->rx_bytes += skb->len; + stats->rx_packets++; + u64_stats_update_end(&stats->syncp); + } + + return true; + drop: kfree_skb(skb); return false; @@ -452,7 +485,7 @@ bool br_allowed_ingress(const struct net_bridge *br, return true; } - return __allowed_ingress(vg, br->vlan_proto, skb, vid); + return __allowed_ingress(br, vg, skb, vid); } /* Called under RCU. 
*/ @@ -542,6 +575,11 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags) if (!vlan) return -ENOMEM; + vlan->stats = netdev_alloc_pcpu_stats(struct br_vlan_stats); + if (!vlan->stats) { + kfree(vlan); + return -ENOMEM; + } vlan->vid = vid; vlan->flags = flags | BRIDGE_VLAN_INFO_MASTER; vlan->flags &= ~BRIDGE_VLAN_INFO_PVID; @@ -549,8 +587,10 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags) if (flags & BRIDGE_VLAN_INFO_BRENTRY) atomic_set(&vlan->refcnt, 1); ret = __vlan_add(vlan, flags); - if (ret) + if (ret) { + free_percpu(vlan->stats); kfree(vlan); + } return ret; } @@ -711,6 +751,20 @@ int br_vlan_set_proto(struct net_bridge *br, unsigned long val) return __br_vlan_set_proto(br, htons(val)); } +int br_vlan_set_stats(struct net_bridge *br, unsigned long val) +{ + switch (val) { + case 0: + case 1: + br->vlan_stats_enabled = val; + break; + default: + return -EINVAL; + } + + return 0; +} + static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid) { struct net_bridge_vlan *v; @@ -1000,3 +1054,30 @@ void nbp_vlan_flush(struct net_bridge_port *port) synchronize_rcu(); __vlan_group_free(vg); } + +void br_vlan_get_stats(const struct net_bridge_vlan *v, + struct br_vlan_stats *stats) +{ + int i; + + memset(stats, 0, sizeof(*stats)); + for_each_possible_cpu(i) { + u64 rxpackets, rxbytes, txpackets, txbytes; + struct br_vlan_stats *cpu_stats; + unsigned int start; + + cpu_stats = per_cpu_ptr(v->stats, i); + do { + start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + rxpackets = cpu_stats->rx_packets; + rxbytes = cpu_stats->rx_bytes; + txbytes = cpu_stats->tx_bytes; + txpackets = cpu_stats->tx_packets; + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + + stats->rx_packets += rxpackets; + stats->rx_bytes += rxbytes; + stats->tx_bytes += txbytes; + stats->tx_packets += txpackets; + } +} diff --git a/net/ceph/auth.c b/net/ceph/auth.c index 6b923bcaa2a4..2bc5965fdd1e 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -293,13 +293,9 @@ int ceph_auth_create_authorizer(struct ceph_auth_client *ac, } EXPORT_SYMBOL(ceph_auth_create_authorizer); -void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac, - struct ceph_authorizer *a) +void ceph_auth_destroy_authorizer(struct ceph_authorizer *a) { - mutex_lock(&ac->mutex); - if (ac->ops && ac->ops->destroy_authorizer) - ac->ops->destroy_authorizer(ac, a); - mutex_unlock(&ac->mutex); + a->destroy(a); } EXPORT_SYMBOL(ceph_auth_destroy_authorizer); diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index 8c93fa8d81bc..5f836f02ae36 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -16,7 +16,6 @@ static void reset(struct ceph_auth_client *ac) struct ceph_auth_none_info *xi = ac->private; xi->starting = true; - xi->built_authorizer = false; } static void destroy(struct ceph_auth_client *ac) @@ -39,6 +38,27 @@ static int should_authenticate(struct ceph_auth_client *ac) return xi->starting; } +static int ceph_auth_none_build_authorizer(struct ceph_auth_client *ac, + struct ceph_none_authorizer *au) +{ + void *p = au->buf; + void *const end = p + sizeof(au->buf); + int ret; + + ceph_encode_8_safe(&p, end, 1, e_range); + ret = ceph_entity_name_encode(ac->name, &p, end); + if (ret < 0) + return ret; + + ceph_encode_64_safe(&p, end, ac->global_id, e_range); + au->buf_len = p - (void *)au->buf; + dout("%s built authorizer len %d\n", __func__, au->buf_len); + return 0; + +e_range: + return -ERANGE; +} + static int build_request(struct ceph_auth_client *ac, void *buf, void *end) { return 0; 
@@ -57,32 +77,32 @@ static int handle_reply(struct ceph_auth_client *ac, int result, return result; } +static void ceph_auth_none_destroy_authorizer(struct ceph_authorizer *a) +{ + kfree(a); +} + /* - * build an 'authorizer' with our entity_name and global_id. we can - * reuse a single static copy since it is identical for all services - * we connect to. + * build an 'authorizer' with our entity_name and global_id. it is + * identical for all services we connect to. */ static int ceph_auth_none_create_authorizer( struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth) { - struct ceph_auth_none_info *ai = ac->private; - struct ceph_none_authorizer *au = &ai->au; - void *p, *end; + struct ceph_none_authorizer *au; int ret; - if (!ai->built_authorizer) { - p = au->buf; - end = p + sizeof(au->buf); - ceph_encode_8(&p, 1); - ret = ceph_entity_name_encode(ac->name, &p, end - 8); - if (ret < 0) - goto bad; - ceph_decode_need(&p, end, sizeof(u64), bad2); - ceph_encode_64(&p, ac->global_id); - au->buf_len = p - (void *)au->buf; - ai->built_authorizer = true; - dout("built authorizer len %d\n", au->buf_len); + au = kmalloc(sizeof(*au), GFP_NOFS); + if (!au) + return -ENOMEM; + + au->base.destroy = ceph_auth_none_destroy_authorizer; + + ret = ceph_auth_none_build_authorizer(ac, au); + if (ret) { + kfree(au); + return ret; } auth->authorizer = (struct ceph_authorizer *) au; @@ -92,17 +112,6 @@ static int ceph_auth_none_create_authorizer( auth->authorizer_reply_buf_len = sizeof (au->reply_buf); return 0; - -bad2: - ret = -ERANGE; -bad: - return ret; -} - -static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac, - struct ceph_authorizer *a) -{ - /* nothing to do */ } static const struct ceph_auth_client_ops ceph_auth_none_ops = { @@ -114,7 +123,6 @@ static const struct ceph_auth_client_ops ceph_auth_none_ops = { .build_request = build_request, .handle_reply = handle_reply, .create_authorizer = ceph_auth_none_create_authorizer, - .destroy_authorizer = ceph_auth_none_destroy_authorizer, }; int ceph_auth_none_init(struct ceph_auth_client *ac) @@ -127,7 +135,6 @@ int ceph_auth_none_init(struct ceph_auth_client *ac) return -ENOMEM; xi->starting = true; - xi->built_authorizer = false; ac->protocol = CEPH_AUTH_NONE; ac->private = xi; diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h index 059a3ce4b53f..62021535ae4a 100644 --- a/net/ceph/auth_none.h +++ b/net/ceph/auth_none.h @@ -12,6 +12,7 @@ */ struct ceph_none_authorizer { + struct ceph_authorizer base; char buf[128]; int buf_len; char reply_buf[0]; @@ -19,8 +20,6 @@ struct ceph_none_authorizer { struct ceph_auth_none_info { bool starting; - bool built_authorizer; - struct ceph_none_authorizer au; /* we only need one; it's static */ }; int ceph_auth_none_init(struct ceph_auth_client *ac); diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 9e43a315e662..a0905f04bd13 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -565,6 +565,14 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, return -EAGAIN; } +static void ceph_x_destroy_authorizer(struct ceph_authorizer *a) +{ + struct ceph_x_authorizer *au = (void *)a; + + ceph_x_authorizer_cleanup(au); + kfree(au); +} + static int ceph_x_create_authorizer( struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth) @@ -581,6 +589,8 @@ static int ceph_x_create_authorizer( if (!au) return -ENOMEM; + au->base.destroy = ceph_x_destroy_authorizer; + ret = ceph_x_build_authorizer(ac, th, au); if (ret) { kfree(au); @@ 
-643,16 +653,6 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, return ret; } -static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac, - struct ceph_authorizer *a) -{ - struct ceph_x_authorizer *au = (void *)a; - - ceph_x_authorizer_cleanup(au); - kfree(au); -} - - static void ceph_x_reset(struct ceph_auth_client *ac) { struct ceph_x_info *xi = ac->private; @@ -770,7 +770,6 @@ static const struct ceph_auth_client_ops ceph_x_ops = { .create_authorizer = ceph_x_create_authorizer, .update_authorizer = ceph_x_update_authorizer, .verify_authorizer_reply = ceph_x_verify_authorizer_reply, - .destroy_authorizer = ceph_x_destroy_authorizer, .invalidate_authorizer = ceph_x_invalidate_authorizer, .reset = ceph_x_reset, .destroy = ceph_x_destroy, diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h index 40b1a3cf7397..21a5af904bae 100644 --- a/net/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -26,6 +26,7 @@ struct ceph_x_ticket_handler { struct ceph_x_authorizer { + struct ceph_authorizer base; struct ceph_crypto_key session_key; struct ceph_buffer *buf; unsigned int service; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 32355d9d0103..40a53a70efdf 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1087,10 +1087,8 @@ static void put_osd(struct ceph_osd *osd) dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), atomic_read(&osd->o_ref) - 1); if (atomic_dec_and_test(&osd->o_ref)) { - struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; - if (osd->o_auth.authorizer) - ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer); + ceph_auth_destroy_authorizer(osd->o_auth.authorizer); kfree(osd); } } @@ -2984,7 +2982,7 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, struct ceph_auth_handshake *auth = &o->o_auth; if (force_new && auth->authorizer) { - ceph_auth_destroy_authorizer(ac, auth->authorizer); + ceph_auth_destroy_authorizer(auth->authorizer); auth->authorizer = NULL; } if (!auth->authorizer) { diff --git a/net/core/dev.c b/net/core/dev.c index 6324bc9267f7..12436d1312ca 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1741,7 +1741,7 @@ static inline void net_timestamp_set(struct sk_buff *skb) __net_timestamp(SKB); \ } \ -bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb) +bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { unsigned int len; @@ -1850,7 +1850,7 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) * taps currently in use. 
*/ -static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) +void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype; struct sk_buff *skb2 = NULL; @@ -1907,6 +1907,7 @@ out_unlock: pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); /** * netif_setup_tc - Handle tc mappings on real_num_tx_queues change @@ -2815,7 +2816,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb, if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, type)) { - features &= ~NETIF_F_CSUM_MASK; + features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; } @@ -3469,6 +3470,7 @@ u32 rps_cpu_mask __read_mostly; EXPORT_SYMBOL(rps_cpu_mask); struct static_key rps_needed __read_mostly; +EXPORT_SYMBOL(rps_needed); static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, @@ -3955,9 +3957,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, break; case TC_ACT_SHOT: qdisc_qstats_cpu_drop(cl->q); + kfree_skb(skb); + return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: - kfree_skb(skb); + consume_skb(skb); return NULL; case TC_ACT_REDIRECT: /* skb_mac_header check was done by cls/act_bpf, so @@ -4982,8 +4986,8 @@ bool sk_busy_loop(struct sock *sk, int nonblock) netpoll_poll_unlock(have); } if (rc > 0) - NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_BUSYPOLLRXPACKETS, rc); + __NET_ADD_STATS(sock_net(sk), + LINUX_MIB_BUSYPOLLRXPACKETS, rc); local_bh_enable(); if (rc == LL_FLUSH_FAILED) @@ -6720,6 +6724,10 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~NETIF_F_TSO6; } + /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ + if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) + features &= ~NETIF_F_TSO_MANGLEID; + /* TSO ECN requires that TSO is present as well. 
*/ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) features &= ~NETIF_F_TSO_ECN; diff --git a/net/core/filter.c b/net/core/filter.c index 218e5de8c402..71c2a1f473ad 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1344,6 +1344,21 @@ struct bpf_scratchpad { static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); +static inline int bpf_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + int err; + + if (!skb_cloned(skb)) + return 0; + if (skb_clone_writable(skb, write_len)) + return 0; + err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + if (!err) + bpf_compute_data_end(skb); + return err; +} + static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) { struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); @@ -1366,7 +1381,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) */ if (unlikely((u32) offset > 0xffff || len > sizeof(sp->buff))) return -EFAULT; - if (unlikely(skb_try_make_writable(skb, offset + len))) + if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; ptr = skb_header_pointer(skb, offset, len, sp->buff); @@ -1444,7 +1459,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; - if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum)))) + if (unlikely(bpf_try_make_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1499,7 +1514,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; - if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum)))) + if (unlikely(bpf_try_make_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1699,12 +1714,15 @@ static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1; __be16 vlan_proto = (__force __be16) r2; + int ret; if (unlikely(vlan_proto != htons(ETH_P_8021Q) && vlan_proto != htons(ETH_P_8021AD))) vlan_proto = htons(ETH_P_8021Q); - return skb_vlan_push(skb, vlan_proto, vlan_tci); + ret = skb_vlan_push(skb, vlan_proto, vlan_tci); + bpf_compute_data_end(skb); + return ret; } const struct bpf_func_proto bpf_skb_vlan_push_proto = { @@ -1720,8 +1738,11 @@ EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1; + int ret; - return skb_vlan_pop(skb); + ret = skb_vlan_pop(skb); + bpf_compute_data_end(skb); + return ret; } const struct bpf_func_proto bpf_skb_vlan_pop_proto = { @@ -2066,8 +2087,12 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type) static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type) { - if (off == offsetof(struct __sk_buff, tc_classid)) + switch (off) { + case offsetof(struct __sk_buff, tc_classid): + case offsetof(struct __sk_buff, data): + case offsetof(struct __sk_buff, data_end): return false; + } if (type == BPF_WRITE) { switch (off) { @@ -2215,6 +2240,20 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off); break; + case offsetof(struct __sk_buff, data): + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)), + dst_reg, src_reg, + offsetof(struct sk_buff, data)); + break; + + case 
offsetof(struct __sk_buff, data_end): + ctx_off -= offsetof(struct __sk_buff, data_end); + ctx_off += offsetof(struct sk_buff, cb); + ctx_off += offsetof(struct bpf_skb_data_end, data_end); + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)), + dst_reg, src_reg, ctx_off); + break; + case offsetof(struct __sk_buff, tc_index): #ifdef CONFIG_NET_SCHED BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); diff --git a/net/core/flow.c b/net/core/flow.c index 1033725be40b..3937b1b68d5b 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -92,8 +92,11 @@ static void flow_cache_gc_task(struct work_struct *work) list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list); spin_unlock_bh(&xfrm->flow_cache_gc_lock); - list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) + list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) { flow_entry_kill(fce, xfrm); + atomic_dec(&xfrm->flow_cache_gc_count); + WARN_ON(atomic_read(&xfrm->flow_cache_gc_count) < 0); + } } static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, @@ -101,6 +104,7 @@ static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, struct netns_xfrm *xfrm) { if (deleted) { + atomic_add(deleted, &xfrm->flow_cache_gc_count); fcp->hash_count -= deleted; spin_lock_bh(&xfrm->flow_cache_gc_lock); list_splice_tail(gc_list, &xfrm->flow_cache_gc_list); @@ -232,6 +236,13 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, if (fcp->hash_count > fc->high_watermark) flow_cache_shrink(fc, fcp); + if (fcp->hash_count > 2 * fc->high_watermark || + atomic_read(&net->xfrm.flow_cache_gc_count) > fc->high_watermark) { + atomic_inc(&net->xfrm.flow_cache_genid); + flo = ERR_PTR(-ENOBUFS); + goto ret_object; + } + fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC); if (fle) { fle->net = net; @@ -446,6 +457,7 @@ int flow_cache_init(struct net *net) INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task); INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task); mutex_init(&net->xfrm.flow_flush_sem); + atomic_set(&net->xfrm.flow_cache_gc_count, 0); fc->hash_shift = 10; fc->low_watermark = 2 * flow_cache_hash_size(fc); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index e640462ea8bf..f96ee8b9478d 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -25,9 +25,9 @@ static inline int -gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size) +gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr) { - if (nla_put(d->skb, type, size, buf)) + if (nla_put_64bit(d->skb, type, size, buf, padattr)) goto nla_put_failure; return 0; @@ -59,7 +59,8 @@ nla_put_failure: */ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, - int xstats_type, spinlock_t *lock, struct gnet_dump *d) + int xstats_type, spinlock_t *lock, + struct gnet_dump *d, int padattr) __acquires(lock) { memset(d, 0, sizeof(*d)); @@ -71,16 +72,17 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, d->skb = skb; d->compat_tc_stats = tc_stats_type; d->compat_xstats = xstats_type; + d->padattr = padattr; if (d->tail) - return gnet_stats_copy(d, type, NULL, 0); + return gnet_stats_copy(d, type, NULL, 0, padattr); return 0; } EXPORT_SYMBOL(gnet_stats_start_copy_compat); /** - * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode + * gnet_stats_start_copy - start dumping procedure in compatibility mode * @skb: socket buffer to put statistics TLVs into * @type: TLV type for top level statistic TLV * @lock: 
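The new __sk_buff data/data_end fields and the bpf_compute_data_end() calls in the filter.c hunks above exist to support bounds-checked direct packet access from cls/act BPF programs; whenever a helper may reallocate or reshape the linear data (vlan push/pop, skb_store_bytes), the cached data_end has to be refreshed. A minimal tc classifier fragment illustrating the pattern (not part of this patch; program name and layout are assumptions):

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/pkt_cls.h>

/* Drop frames too short to carry an Ethernet header; anything longer
 * may be parsed further once this verifier-mandated bounds check passes.
 */
int cls_drop_runts(struct __sk_buff *skb)
{
        void *data = (void *)(long)skb->data;
        void *data_end = (void *)(long)skb->data_end;

        if (data + ETH_HLEN > data_end)
                return TC_ACT_SHOT;

        return TC_ACT_OK;
}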
statistics lock @@ -94,9 +96,9 @@ EXPORT_SYMBOL(gnet_stats_start_copy_compat); */ int gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, - struct gnet_dump *d) + struct gnet_dump *d, int padattr) { - return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d); + return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d, padattr); } EXPORT_SYMBOL(gnet_stats_start_copy); @@ -169,7 +171,8 @@ gnet_stats_copy_basic(struct gnet_dump *d, memset(&sb, 0, sizeof(sb)); sb.bytes = bstats.bytes; sb.packets = bstats.packets; - return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb)); + return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb), + TCA_STATS_PAD); } return 0; } @@ -208,11 +211,13 @@ gnet_stats_copy_rate_est(struct gnet_dump *d, } if (d->tail) { - res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est)); + res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est), + TCA_STATS_PAD); if (res < 0 || est.bps == r->bps) return res; /* emit 64bit stats only if needed */ - return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r)); + return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r), + TCA_STATS_PAD); } return 0; @@ -286,7 +291,8 @@ gnet_stats_copy_queue(struct gnet_dump *d, if (d->tail) return gnet_stats_copy(d, TCA_STATS_QUEUE, - &qstats, sizeof(qstats)); + &qstats, sizeof(qstats), + TCA_STATS_PAD); return 0; } @@ -316,7 +322,8 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) } if (d->tail) - return gnet_stats_copy(d, TCA_STATS_APP, st, len); + return gnet_stats_copy(d, TCA_STATS_APP, st, len, + TCA_STATS_PAD); return 0; @@ -347,12 +354,12 @@ gnet_stats_finish_copy(struct gnet_dump *d) if (d->compat_tc_stats) if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats, - sizeof(d->tc_stats)) < 0) + sizeof(d->tc_stats), d->padattr) < 0) return -1; if (d->compat_xstats && d->xstats) { if (gnet_stats_copy(d, d->compat_xstats, d->xstats, - d->xstats_len) < 0) + d->xstats_len, d->padattr) < 0) return -1; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6a395d440228..29dd8cc22bbf 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1857,7 +1857,8 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, ndst.ndts_table_fulls += st->table_fulls; } - if (nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst)) + if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst, + NDTA_PAD)) goto nla_put_failure; } diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 2bf83299600a..14d09345f00d 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -162,7 +162,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v) "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", sd->processed, sd->dropped, sd->time_squeeze, 0, 0, 0, 0, 0, /* was fastroute */ - sd->cpu_collision, sd->received_rps, flow_limit_count); + 0, /* was cpu_collision */ + sd->received_rps, flow_limit_count); return 0; } diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 20999aa596dd..8604ae245960 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3472,7 +3472,6 @@ xmit_more: pkt_dev->odevname, ret); pkt_dev->errors++; /* fallthru */ - case NETDEV_TX_LOCKED: case NETDEV_TX_BUSY: /* Retry it next time */ atomic_dec(&(pkt_dev->skb->users)); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9efc1f34ef3b..d69c4644f8f2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -876,7 +876,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(IFNAMSIZ) 
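Every gnet_stats_copy_*() user now has to say which attribute type carries the alignment padding, so nla_put_64bit() can keep 64-bit counters naturally aligned on architectures that only guarantee 32-bit netlink alignment. A sketch of what a qdisc dump call site presumably looks like after this change (the lock helper, labels, and elided copy calls are assumptions; the added TCA_PAD argument is the point):

struct gnet_dump d;

if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
                                 qdisc_root_sleeping_lock(q), &d,
                                 TCA_PAD) < 0)
        goto nla_put_failure;

/* ... gnet_stats_copy_basic()/_queue()/_app() calls as before ... */

if (gnet_stats_finish_copy(&d) < 0)
        goto nla_put_failure;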
/* IFLA_IFNAME */ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ - + nla_total_size(sizeof(struct rtnl_link_ifmap)) + + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap)) + nla_total_size(sizeof(struct rtnl_link_stats)) + nla_total_size_64bit(sizeof(struct rtnl_link_stats64)) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ @@ -1173,15 +1173,17 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) { - struct rtnl_link_ifmap map = { - .mem_start = dev->mem_start, - .mem_end = dev->mem_end, - .base_addr = dev->base_addr, - .irq = dev->irq, - .dma = dev->dma, - .port = dev->if_port, - }; - if (nla_put(skb, IFLA_MAP, sizeof(map), &map)) + struct rtnl_link_ifmap map; + + memset(&map, 0, sizeof(map)); + map.mem_start = dev->mem_start; + map.mem_end = dev->mem_end; + map.base_addr = dev->base_addr; + map.irq = dev->irq; + map.dma = dev->dma; + map.port = dev->if_port; + + if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD)) return -EMSGSIZE; return 0; @@ -3444,13 +3446,21 @@ out: return err; } +static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) +{ + return (mask & IFLA_STATS_FILTER_BIT(attrid)) && + (!idxattr || idxattr == attrid); +} + static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, - unsigned int flags, unsigned int filter_mask) + unsigned int flags, unsigned int filter_mask, + int *idxattr, int *prividx) { struct if_stats_msg *ifsm; struct nlmsghdr *nlh; struct nlattr *attr; + int s_prividx = *prividx; ASSERT_RTNL(); @@ -3462,7 +3472,7 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, ifsm->ifindex = dev->ifindex; ifsm->filter_mask = filter_mask; - if (filter_mask & IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_64)) { + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, *idxattr)) { struct rtnl_link_stats64 *sp; attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64, @@ -3475,12 +3485,36 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, dev_get_stats(dev, sp); } + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, *idxattr)) { + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + + if (ops && ops->fill_linkxstats) { + int err; + + *idxattr = IFLA_STATS_LINK_XSTATS; + attr = nla_nest_start(skb, + IFLA_STATS_LINK_XSTATS); + if (!attr) + goto nla_put_failure; + + err = ops->fill_linkxstats(skb, dev, prividx); + nla_nest_end(skb, attr); + if (err) + goto nla_put_failure; + *idxattr = 0; + } + } + nlmsg_end(skb, nlh); return 0; nla_put_failure: - nlmsg_cancel(skb, nlh); + /* not a multi message or no progress mean a real error */ + if (!(flags & NLM_F_MULTI) || s_prividx == *prividx) + nlmsg_cancel(skb, nlh); + else + nlmsg_end(skb, nlh); return -EMSGSIZE; } @@ -3494,17 +3528,28 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, { size_t size = 0; - if (filter_mask & IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_64)) + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0)) size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) { + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + + if (ops && ops->get_linkxstats_size) { + size += nla_total_size(ops->get_linkxstats_size(dev)); + /* for IFLA_STATS_LINK_XSTATS */ + size += nla_total_size(0); + } + } + return size; } static int rtnl_stats_get(struct sk_buff *skb, struct 
nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); - struct if_stats_msg *ifsm; struct net_device *dev = NULL; + int idxattr = 0, prividx = 0; + struct if_stats_msg *ifsm; struct sk_buff *nskb; u32 filter_mask; int err; @@ -3528,7 +3573,7 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh) err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - 0, filter_mask); + 0, filter_mask, &idxattr, &prividx); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ WARN_ON(err == -EMSGSIZE); @@ -3542,18 +3587,19 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh) static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) { + int h, s_h, err, s_idx, s_idxattr, s_prividx; struct net *net = sock_net(skb->sk); + unsigned int flags = NLM_F_MULTI; struct if_stats_msg *ifsm; - int h, s_h; - int idx = 0, s_idx; - struct net_device *dev; struct hlist_head *head; - unsigned int flags = NLM_F_MULTI; + struct net_device *dev; u32 filter_mask = 0; - int err; + int idx = 0; s_h = cb->args[0]; s_idx = cb->args[1]; + s_idxattr = cb->args[2]; + s_prividx = cb->args[3]; cb->seq = net->dev_base_seq; @@ -3571,7 +3617,8 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, - flags, filter_mask); + flags, filter_mask, + &s_idxattr, &s_prividx); /* If we ran out of room on the first message, * we're in trouble */ @@ -3579,13 +3626,16 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) goto out; - + s_prividx = 0; + s_idxattr = 0; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } } out: + cb->args[3] = s_prividx; + cb->args[2] = s_idxattr; cb->args[1] = idx; cb->args[0] = h; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7ff7788b0151..f2b77e549c03 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3080,8 +3080,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, unsigned int headroom; unsigned int len = head_skb->len; __be16 proto; - bool csum; - int sg = !!(features & NETIF_F_SG); + bool csum, sg; int nfrags = skb_shinfo(head_skb)->nr_frags; int err = -ENOMEM; int i = 0; @@ -3093,15 +3092,19 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (unlikely(!proto)) return ERR_PTR(-EINVAL); + sg = !!(features & NETIF_F_SG); csum = !!can_checksum_protocol(features, proto); /* GSO partial only requires that we trim off any excess that * doesn't fit into an MSS sized block, so take care of that * now. 
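stats_attr_valid() above gates each attribute both on the bit the requester set in filter_mask and on which attribute a partial dump is being resumed in (idxattr). From the requesting side this corresponds to something like the following (ifindex is a placeholder):

struct if_stats_msg req = {
        .family      = AF_UNSPEC,
        .ifindex     = ifindex,
        .filter_mask = IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_64) |
                       IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_XSTATS),
};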
*/ - if (features & NETIF_F_GSO_PARTIAL) { + if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) { partial_segs = len / mss; - mss *= partial_segs; + if (partial_segs > 1) + mss *= partial_segs; + else + partial_segs = 0; } headroom = skb_headroom(head_skb); @@ -4622,3 +4625,239 @@ failure: return NULL; } EXPORT_SYMBOL(alloc_skb_with_frags); + +/* carve out the first off bytes from skb when off < headlen */ +static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, + const int headlen, gfp_t gfp_mask) +{ + int i; + int size = skb_end_offset(skb); + int new_hlen = headlen - off; + u8 *data; + + size = SKB_DATA_ALIGN(size); + + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + data = kmalloc_reserve(size + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), + gfp_mask, NUMA_NO_NODE, NULL); + if (!data) + return -ENOMEM; + + size = SKB_WITH_OVERHEAD(ksize(data)); + + /* Copy real data, and all frags */ + skb_copy_from_linear_data_offset(skb, off, data, new_hlen); + skb->len -= off; + + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), + offsetof(struct skb_shared_info, + frags[skb_shinfo(skb)->nr_frags])); + if (skb_cloned(skb)) { + /* drop the old head gracefully */ + if (skb_orphan_frags(skb, gfp_mask)) { + kfree(data); + return -ENOMEM; + } + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_frag_ref(skb, i); + if (skb_has_frag_list(skb)) + skb_clone_fraglist(skb); + skb_release_data(skb); + } else { + /* we can reuse existing recount- all we did was + * relocate values + */ + skb_free_head(skb); + } + + skb->head = data; + skb->data = data; + skb->head_frag = 0; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->end = size; +#else + skb->end = skb->head + size; +#endif + skb_set_tail_pointer(skb, skb_headlen(skb)); + skb_headers_offset_update(skb, 0); + skb->cloned = 0; + skb->hdr_len = 0; + skb->nohdr = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); + + return 0; +} + +static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); + +/* carve out the first eat bytes from skb's frag_list. May recurse into + * pskb_carve() + */ +static int pskb_carve_frag_list(struct sk_buff *skb, + struct skb_shared_info *shinfo, int eat, + gfp_t gfp_mask) +{ + struct sk_buff *list = shinfo->frag_list; + struct sk_buff *clone = NULL; + struct sk_buff *insp = NULL; + + do { + if (!list) { + pr_err("Not enough bytes to eat. Want %d\n", eat); + return -EFAULT; + } + if (list->len <= eat) { + /* Eaten as whole. */ + eat -= list->len; + list = list->next; + insp = list; + } else { + /* Eaten partially. */ + if (skb_shared(list)) { + clone = skb_clone(list, gfp_mask); + if (!clone) + return -ENOMEM; + insp = list->next; + list = clone; + } else { + /* This may be pulled without problems. */ + insp = list; + } + if (pskb_carve(list, eat, gfp_mask) < 0) { + kfree_skb(clone); + return -ENOMEM; + } + break; + } + } while (eat); + + /* Free pulled out fragments. */ + while ((list = shinfo->frag_list) != insp) { + shinfo->frag_list = list->next; + kfree_skb(list); + } + /* And insert new clone at head. */ + if (clone) { + clone->next = list; + shinfo->frag_list = clone; + } + return 0; +} + +/* carve off first len bytes from skb. 
Split line (off) is in the + * non-linear part of skb + */ +static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, + int pos, gfp_t gfp_mask) +{ + int i, k = 0; + int size = skb_end_offset(skb); + u8 *data; + const int nfrags = skb_shinfo(skb)->nr_frags; + struct skb_shared_info *shinfo; + + size = SKB_DATA_ALIGN(size); + + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + data = kmalloc_reserve(size + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), + gfp_mask, NUMA_NO_NODE, NULL); + if (!data) + return -ENOMEM; + + size = SKB_WITH_OVERHEAD(ksize(data)); + + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), offsetof(struct skb_shared_info, + frags[skb_shinfo(skb)->nr_frags])); + if (skb_orphan_frags(skb, gfp_mask)) { + kfree(data); + return -ENOMEM; + } + shinfo = (struct skb_shared_info *)(data + size); + for (i = 0; i < nfrags; i++) { + int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (pos + fsize > off) { + shinfo->frags[k] = skb_shinfo(skb)->frags[i]; + + if (pos < off) { + /* Split frag. + * We have two variants in this case: + * 1. Move all the frag to the second + * part, if it is possible. F.e. + * this approach is mandatory for TUX, + * where splitting is expensive. + * 2. Split is accurately. We make this. + */ + shinfo->frags[0].page_offset += off - pos; + skb_frag_size_sub(&shinfo->frags[0], off - pos); + } + skb_frag_ref(skb, i); + k++; + } + pos += fsize; + } + shinfo->nr_frags = k; + if (skb_has_frag_list(skb)) + skb_clone_fraglist(skb); + + if (k == 0) { + /* split line is in frag list */ + pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask); + } + skb_release_data(skb); + + skb->head = data; + skb->head_frag = 0; + skb->data = data; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->end = size; +#else + skb->end = skb->head + size; +#endif + skb_reset_tail_pointer(skb); + skb_headers_offset_update(skb, 0); + skb->cloned = 0; + skb->hdr_len = 0; + skb->nohdr = 0; + skb->len -= off; + skb->data_len = skb->len; + atomic_set(&skb_shinfo(skb)->dataref, 1); + return 0; +} + +/* remove len bytes from the beginning of the skb */ +static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) +{ + int headlen = skb_headlen(skb); + + if (len < headlen) + return pskb_carve_inside_header(skb, len, headlen, gfp); + else + return pskb_carve_inside_nonlinear(skb, len, headlen, gfp); +} + +/* Extract to_copy bytes starting at off from skb, and return this in + * a new skb + */ +struct sk_buff *pskb_extract(struct sk_buff *skb, int off, + int to_copy, gfp_t gfp) +{ + struct sk_buff *clone = skb_clone(skb, gfp); + + if (!clone) + return NULL; + + if (pskb_carve(clone, off, gfp) < 0 || + pskb_trim(clone, to_copy)) { + kfree_skb(clone); + return NULL; + } + return clone; +} +EXPORT_SYMBOL(pskb_extract); diff --git a/net/core/sock.c b/net/core/sock.c index e16a5db853c6..08bf97eceeb3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1655,6 +1655,17 @@ void sock_wfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_wfree); +/* This variant of sock_wfree() is used by TCP, + * since it sets SOCK_USE_WRITE_QUEUE. 
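pskb_extract(), added above, clones the skb and then carves off the leading bytes and trims the tail, so the original skb is left untouched. Typical use, with made-up caller names:

struct sk_buff *seg;

/* Pull bytes [offset, offset + len) of rx_skb into a new skb. */
seg = pskb_extract(rx_skb, offset, len, GFP_ATOMIC);
if (!seg)
        return -ENOMEM;
/* seg->len == len; rx_skb is unchanged and still owned by the caller. */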
+ */ +void __sock_wfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) + __sk_free(sk); +} + void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); @@ -1677,8 +1688,21 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) } EXPORT_SYMBOL(skb_set_owner_w); +/* This helper is used by netem, as it can hold packets in its + * delay queue. We want to allow the owner socket to send more + * packets, as if they were already TX completed by a typical driver. + * But we also want to keep skb->sk set because some packet schedulers + * rely on it (sch_fq for example). So we set skb->truesize to a small + * amount (1) and decrease sk_wmem_alloc accordingly. + */ void skb_orphan_partial(struct sk_buff *skb) { + /* If this skb is a TCP pure ACK or already went here, + * we have nothing to do. 2 is already a very small truesize. + */ + if (skb->truesize <= 2) + return; + /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, * so we do not completely orphan skb, but transfert all * accounted bytes but one, to avoid unexpected reorders. @@ -2019,33 +2043,27 @@ static void __release_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { - struct sk_buff *skb = sk->sk_backlog.head; + struct sk_buff *skb, *next; - do { + while ((skb = sk->sk_backlog.head) != NULL) { sk->sk_backlog.head = sk->sk_backlog.tail = NULL; - bh_unlock_sock(sk); - do { - struct sk_buff *next = skb->next; + spin_unlock_bh(&sk->sk_lock.slock); + do { + next = skb->next; prefetch(next); WARN_ON_ONCE(skb_dst_is_noref(skb)); skb->next = NULL; sk_backlog_rcv(sk, skb); - /* - * We are in process context here with softirqs - * disabled, use cond_resched_softirq() to preempt. 
- * This is safe to do because we've taken the backlog - * queue private: - */ - cond_resched_softirq(); + cond_resched(); skb = next; } while (skb != NULL); - bh_lock_sock(sk); - } while ((skb = sk->sk_backlog.head) != NULL); + spin_lock_bh(&sk->sk_lock.slock); + } /* * Doing the zeroing here guarantee we can not loop forever @@ -2054,6 +2072,13 @@ static void __release_sock(struct sock *sk) sk->sk_backlog.len = 0; } +void __sk_flush_backlog(struct sock *sk) +{ + spin_lock_bh(&sk->sk_lock.slock); + __release_sock(sk); + spin_unlock_bh(&sk->sk_lock.slock); +} + /** * sk_wait_data - wait for data to arrive at sk_receive_queue * @sk: sock to wait on diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index ca9e35bbe13c..6b10573cc9fa 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -120,7 +120,7 @@ static size_t sock_diag_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct inet_diag_msg) + nla_total_size(sizeof(u8)) /* INET_DIAG_PROTOCOL */ - + nla_total_size(sizeof(struct tcp_info))); /* INET_DIAG_INFO */ + + nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */ } static void sock_diag_broadcast_destroy_work(struct work_struct *work) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index b0e28d24e1a7..0c55ffb859bf 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -198,9 +198,9 @@ struct dccp_mib { }; DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics); -#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) -#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field) -#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) +#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) +#define __DCCP_INC_STATS(field) __SNMP_INC_STATS(dccp_statistics, field) +#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) /* * Checksumming routines diff --git a/net/dccp/input.c b/net/dccp/input.c index 3bd14e885396..ba347184bda9 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -359,7 +359,7 @@ send_sync: goto discard; } - DCCP_INC_STATS_BH(DCCP_MIB_INERRS); + DCCP_INC_STATS(DCCP_MIB_INERRS); discard: __kfree_skb(skb); return 0; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index f6d183f8f332..5c7e413a3ae4 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -205,7 +205,7 @@ void dccp_req_err(struct sock *sk, u64 seq) * socket here. */ if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); } else { /* * Still in RESPOND, just remove it silently. @@ -247,7 +247,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) if (skb->len < offset + sizeof(*dh) || skb->len < offset + __dccp_basic_hdr_len(dh)) { - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; } @@ -256,7 +256,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) iph->saddr, ntohs(dh->dccph_sport), inet_iif(skb)); if (!sk) { - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; } @@ -273,7 +273,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) * servers this needs to be solved differently. 
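The *_STATS_BH() to __*_STATS() renames in the DCCP (and later IPv4/IPv6) hunks follow the usual kernel convention: the plain macro is safe in any context, while the double-underscore variant assumes the caller already runs in a non-preemptible (typically BH) context. The underlying SNMP helpers live in include/net/snmp.h, outside this diffstat, so the following is only an approximation of their shape:

#define SNMP_INC_STATS(mib, field)      this_cpu_inc(mib->mibs[field])
#define __SNMP_INC_STATS(mib, field)    __this_cpu_inc(mib->mibs[field])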
*/ if (sock_owned_by_user(sk)) - NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); + __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == DCCP_CLOSED) goto out; @@ -281,7 +281,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) dp = dccp_sk(sk); if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && !between48(seq, dp->dccps_awl, dp->dccps_awh)) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -318,7 +318,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) case DCCP_REQUESTING: case DCCP_RESPOND: if (!sock_owned_by_user(sk)) { - DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); sk->sk_err = err; sk->sk_error_report(sk); @@ -431,11 +431,11 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, return newsk; exit_overflow: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); exit_nonewsk: dst_release(dst); exit: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; put_and_exit: inet_csk_prepare_forced_close(newsk); @@ -462,7 +462,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { - IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } @@ -533,8 +533,8 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) bh_unlock_sock(ctl_sk); if (net_xmit_eval(err) == 0) { - DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); - DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); + DCCP_INC_STATS(DCCP_MIB_OUTSEGS); + DCCP_INC_STATS(DCCP_MIB_OUTRSTS); } out: dst_release(dst); @@ -637,7 +637,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) drop_and_free: reqsk_free(req); drop: - DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); return -1; } EXPORT_SYMBOL_GPL(dccp_v4_conn_request); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 8ceb3cebcad4..d176f4e66369 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -80,8 +80,8 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (skb->len < offset + sizeof(*dh) || skb->len < offset + __dccp_basic_hdr_len(dh)) { - ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), - ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); return; } @@ -91,8 +91,8 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, inet6_iif(skb)); if (!sk) { - ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), - ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); return; } @@ -106,7 +106,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, bh_lock_sock(sk); if (sock_owned_by_user(sk)) - NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); + __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == DCCP_CLOSED) goto out; @@ -114,7 +114,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, dp = dccp_sk(sk); if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && !between48(seq, dp->dccps_awl, dp->dccps_awh)) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -156,7 +156,7 @@ static void 
dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, case DCCP_RESPOND: /* Cannot happen. It can, it SYNs are crossed. --ANK */ if (!sock_owned_by_user(sk)) { - DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); sk->sk_err = err; /* * Wake people up to see the error @@ -277,8 +277,8 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) if (!IS_ERR(dst)) { skb_dst_set(skb, dst); ip6_xmit(ctl_sk, skb, &fl6, NULL, 0); - DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); - DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); + DCCP_INC_STATS(DCCP_MIB_OUTSEGS); + DCCP_INC_STATS(DCCP_MIB_OUTRSTS); return; } @@ -378,7 +378,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) drop_and_free: reqsk_free(req); drop: - DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); return -1; } @@ -527,11 +527,11 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, return newsk; out_overflow: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); out_nonewsk: dst_release(dst); out: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; } diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 1994f8af646b..53eddf99e4f6 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -127,7 +127,7 @@ struct sock *dccp_create_openreq_child(const struct sock *sk, } dccp_init_xmit_timers(newsk); - DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); + __DCCP_INC_STATS(DCCP_MIB_PASSIVEOPENS); } return newsk; } diff --git a/net/dccp/options.c b/net/dccp/options.c index 9bce31886bda..74d29c56c367 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -253,7 +253,7 @@ out_nonsensical_length: return 0; out_invalid_option: - DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); + DCCP_INC_STATS(DCCP_MIB_INVALIDOPT); rc = DCCP_RESET_CODE_OPTION_ERROR; out_featneg_failed: DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc); diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 3ef7acef3ce8..3a2c34027758 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -28,7 +28,7 @@ static void dccp_write_err(struct sock *sk) dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); dccp_done(sk); - DCCP_INC_STATS_BH(DCCP_MIB_ABORTONTIMEOUT); + __DCCP_INC_STATS(DCCP_MIB_ABORTONTIMEOUT); } /* A write timeout has occurred. Process the after effects. */ @@ -100,7 +100,7 @@ static void dccp_retransmit_timer(struct sock *sk) * total number of retransmissions of clones of original packets. */ if (icsk->icsk_retransmits == 0) - DCCP_INC_STATS_BH(DCCP_MIB_TIMEOUTS); + __DCCP_INC_STATS(DCCP_MIB_TIMEOUTS); if (dccp_retransmit_skb(sk) != 0) { /* @@ -179,7 +179,7 @@ static void dccp_delack_timer(unsigned long data) if (sock_owned_by_user(sk)) { /* Try again later. 
*/ icsk->icsk_ack.blocked = 1; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); goto out; @@ -209,7 +209,7 @@ static void dccp_delack_timer(unsigned long data) icsk->icsk_ack.ato = TCP_ATO_MIN; } dccp_send_ack(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } out: bh_unlock_sock(sk); diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index d61ceed912be..eff5dfc2e33f 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -182,7 +182,7 @@ __ATTRIBUTE_GROUPS(dsa_hwmon); /* basic switch operations **************************************************/ static int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct net_device *master) { - struct dsa_chip_data *cd = ds->pd; + struct dsa_chip_data *cd = ds->cd; struct device_node *port_dn; struct phy_device *phydev; int ret, port, mode; @@ -219,7 +219,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) { struct dsa_switch_driver *drv = ds->drv; struct dsa_switch_tree *dst = ds->dst; - struct dsa_chip_data *pd = ds->pd; + struct dsa_chip_data *cd = ds->cd; bool valid_name_found = false; int index = ds->index; int i, ret; @@ -230,7 +230,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) for (i = 0; i < DSA_MAX_PORTS; i++) { char *name; - name = pd->port_names[i]; + name = cd->port_names[i]; if (name == NULL) continue; @@ -328,10 +328,10 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) if (!(ds->enabled_port_mask & (1 << i))) continue; - ret = dsa_slave_create(ds, parent, i, pd->port_names[i]); + ret = dsa_slave_create(ds, parent, i, cd->port_names[i]); if (ret < 0) { netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s): %d\n", - index, i, pd->port_names[i], ret); + index, i, cd->port_names[i], ret); ret = 0; } } @@ -379,7 +379,7 @@ static struct dsa_switch * dsa_switch_setup(struct dsa_switch_tree *dst, int index, struct device *parent, struct device *host_dev) { - struct dsa_chip_data *pd = dst->pd->chip + index; + struct dsa_chip_data *cd = dst->pd->chip + index; struct dsa_switch_driver *drv; struct dsa_switch *ds; int ret; @@ -389,7 +389,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, /* * Probe for switch model. 
*/ - drv = dsa_switch_probe(parent, host_dev, pd->sw_addr, &name, &priv); + drv = dsa_switch_probe(parent, host_dev, cd->sw_addr, &name, &priv); if (drv == NULL) { netdev_err(dst->master_netdev, "[%d]: could not detect attached switch\n", index); @@ -408,10 +408,10 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, ds->dst = dst; ds->index = index; - ds->pd = pd; + ds->cd = cd; ds->drv = drv; ds->priv = priv; - ds->master_dev = host_dev; + ds->dev = parent; ret = dsa_switch_setup_one(ds, parent); if (ret) @@ -424,7 +424,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds) { struct device_node *port_dn; struct phy_device *phydev; - struct dsa_chip_data *cd = ds->pd; + struct dsa_chip_data *cd = ds->cd; int port; #ifdef CONFIG_NET_DSA_HWMON @@ -659,9 +659,6 @@ static int dsa_of_probe(struct device *dev) const char *port_name; int chip_index, port_index; const unsigned int *sw_addr, *port_reg; - int gpio; - enum of_gpio_flags of_flags; - unsigned long flags; u32 eeprom_len; int ret; @@ -740,19 +737,6 @@ static int dsa_of_probe(struct device *dev) put_device(cd->host_dev); cd->host_dev = &mdio_bus_switch->dev; } - gpio = of_get_named_gpio_flags(child, "reset-gpios", 0, - &of_flags); - if (gpio_is_valid(gpio)) { - flags = (of_flags == OF_GPIO_ACTIVE_LOW ? - GPIOF_ACTIVE_LOW : 0); - ret = devm_gpio_request_one(dev, gpio, flags, - "switch_reset"); - if (ret) - goto out_free_chip; - - cd->reset = gpio_to_desc(gpio); - gpiod_direction_output(cd->reset, 0); - } for_each_available_child_of_node(child, port) { port_reg = of_get_property(port, "reg", NULL); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 3b6750f5e68b..152436cdab30 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -50,8 +50,8 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds) ds->slave_mii_bus->read = dsa_slave_phy_read; ds->slave_mii_bus->write = dsa_slave_phy_write; snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d:%.2x", - ds->index, ds->pd->sw_addr); - ds->slave_mii_bus->parent = ds->master_dev; + ds->index, ds->cd->sw_addr); + ds->slave_mii_bus->parent = ds->dev; ds->slave_mii_bus->phy_mask = ~ds->phys_mii_mask; } @@ -615,8 +615,8 @@ static int dsa_slave_get_eeprom_len(struct net_device *dev) struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->pd->eeprom_len) - return ds->pd->eeprom_len; + if (ds->cd->eeprom_len) + return ds->cd->eeprom_len; if (ds->drv->get_eeprom_len) return ds->drv->get_eeprom_len(ds); @@ -666,6 +666,78 @@ static void dsa_slave_get_strings(struct net_device *dev, } } +static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, + uint64_t *data) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds = dst->ds[0]; + s8 cpu_port = dst->cpu_port; + int count = 0; + + if (dst->master_ethtool_ops.get_sset_count) { + count = dst->master_ethtool_ops.get_sset_count(dev, + ETH_SS_STATS); + dst->master_ethtool_ops.get_ethtool_stats(dev, stats, data); + } + + if (ds->drv->get_ethtool_stats) + ds->drv->get_ethtool_stats(ds, cpu_port, data + count); +} + +static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds = dst->ds[0]; + int count = 0; + + if (dst->master_ethtool_ops.get_sset_count) + count += dst->master_ethtool_ops.get_sset_count(dev, sset); + + if (sset == ETH_SS_STATS && ds->drv->get_sset_count) + count += ds->drv->get_sset_count(ds); + + return count; +} + +static void dsa_cpu_port_get_strings(struct 
net_device *dev, + uint32_t stringset, uint8_t *data) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds = dst->ds[0]; + s8 cpu_port = dst->cpu_port; + int len = ETH_GSTRING_LEN; + int mcount = 0, count; + unsigned int i; + uint8_t pfx[4]; + uint8_t *ndata; + + snprintf(pfx, sizeof(pfx), "p%.2d", cpu_port); + /* We do not want to be NULL-terminated, since this is a prefix */ + pfx[sizeof(pfx) - 1] = '_'; + + if (dst->master_ethtool_ops.get_sset_count) { + mcount = dst->master_ethtool_ops.get_sset_count(dev, + ETH_SS_STATS); + dst->master_ethtool_ops.get_strings(dev, stringset, data); + } + + if (stringset == ETH_SS_STATS && ds->drv->get_strings) { + ndata = data + mcount * len; + /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle + * the output after to prepend our CPU port prefix we + * constructed earlier + */ + ds->drv->get_strings(ds, cpu_port, ndata); + count = ds->drv->get_sset_count(ds); + for (i = 0; i < count; i++) { + memmove(ndata + (i * len + sizeof(pfx)), + ndata + i * len, len - sizeof(pfx)); + memcpy(ndata + i * len, pfx, sizeof(pfx)); + } + } +} + static void dsa_slave_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, uint64_t *data) @@ -821,6 +893,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_eee = dsa_slave_get_eee, }; +static struct ethtool_ops dsa_cpu_port_ethtool_ops; + static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, @@ -925,7 +999,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, struct net_device *slave_dev) { struct dsa_switch *ds = p->parent; - struct dsa_chip_data *cd = ds->pd; + struct dsa_chip_data *cd = ds->cd; struct device_node *phy_dn, *port_dn; bool phy_is_fixed = false; u32 phy_flags = 0; @@ -1038,6 +1112,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, int port, char *name) { struct net_device *master = ds->dst->master_netdev; + struct dsa_switch_tree *dst = ds->dst; struct net_device *slave_dev; struct dsa_slave_priv *p; int ret; @@ -1049,6 +1124,19 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev->features = master->vlan_features; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; + if (master->ethtool_ops != &dsa_cpu_port_ethtool_ops) { + memcpy(&dst->master_ethtool_ops, master->ethtool_ops, + sizeof(struct ethtool_ops)); + memcpy(&dsa_cpu_port_ethtool_ops, &dst->master_ethtool_ops, + sizeof(struct ethtool_ops)); + dsa_cpu_port_ethtool_ops.get_sset_count = + dsa_cpu_port_get_sset_count; + dsa_cpu_port_ethtool_ops.get_ethtool_stats = + dsa_cpu_port_get_ethtool_stats; + dsa_cpu_port_ethtool_ops.get_strings = + dsa_cpu_port_get_strings; + master->ethtool_ops = &dsa_cpu_port_ethtool_ops; + } eth_hw_addr_inherit(slave_dev, master); slave_dev->priv_flags |= IFF_NO_QUEUE; slave_dev->netdev_ops = &dsa_slave_netdev_ops; @@ -1059,7 +1147,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, NULL); SET_NETDEV_DEV(slave_dev, parent); - slave_dev->dev.of_node = ds->pd->port_dn[port]; + slave_dev->dev.of_node = ds->cd->port_dn[port]; slave_dev->vlan_features = master->vlan_features; p = netdev_priv(slave_dev); diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h index b4e17a7c0df0..5ac778962e4e 100644 --- a/net/ieee802154/6lowpan/6lowpan_i.h +++ b/net/ieee802154/6lowpan/6lowpan_i.h @@ -41,24 +41,12 @@ static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a) return (((__force u64)a->extended_addr) 
>> 32) ^ (((__force u64)a->extended_addr) & 0xffffffff); case IEEE802154_ADDR_SHORT: - return (__force u32)(a->short_addr); + return (__force u32)(a->short_addr + (a->pan_id << 16)); default: return 0; } } -/* private device info */ -struct lowpan_dev_info { - struct net_device *wdev; /* wpan device ptr */ - u16 fragment_tag; -}; - -static inline struct -lowpan_dev_info *lowpan_dev_info(const struct net_device *dev) -{ - return (struct lowpan_dev_info *)lowpan_priv(dev)->priv; -} - int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type); void lowpan_net_frag_exit(void); int lowpan_net_frag_init(void); diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 0023c9048812..dd085db8580e 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -148,7 +148,7 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev, return -EBUSY; } - lowpan_dev_info(ldev)->wdev = wdev; + lowpan_802154_dev(ldev)->wdev = wdev; /* Set the lowpan hardware address to the wpan hardware address. */ memcpy(ldev->dev_addr, wdev->dev_addr, IEEE802154_ADDR_LEN); /* We need headroom for possible wpan_dev_hard_header call and tailroom @@ -173,7 +173,7 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev, static void lowpan_dellink(struct net_device *ldev, struct list_head *head) { - struct net_device *wdev = lowpan_dev_info(ldev)->wdev; + struct net_device *wdev = lowpan_802154_dev(ldev)->wdev; ASSERT_RTNL(); @@ -184,7 +184,7 @@ static void lowpan_dellink(struct net_device *ldev, struct list_head *head) static struct rtnl_link_ops lowpan_link_ops __read_mostly = { .kind = "lowpan", - .priv_size = LOWPAN_PRIV_SIZE(sizeof(struct lowpan_dev_info)), + .priv_size = LOWPAN_PRIV_SIZE(sizeof(struct lowpan_802154_dev)), .setup = lowpan_setup, .newlink = lowpan_newlink, .dellink = lowpan_dellink, diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index d4353faced35..e459afd16bb3 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -84,7 +84,7 @@ static struct sk_buff* lowpan_alloc_frag(struct sk_buff *skb, int size, const struct ieee802154_hdr *master_hdr, bool frag1) { - struct net_device *wdev = lowpan_dev_info(skb->dev)->wdev; + struct net_device *wdev = lowpan_802154_dev(skb->dev)->wdev; struct sk_buff *frag; int rc; @@ -148,8 +148,8 @@ lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *ldev, int frag_cap, frag_len, payload_cap, rc; int skb_unprocessed, skb_offset; - frag_tag = htons(lowpan_dev_info(ldev)->fragment_tag); - lowpan_dev_info(ldev)->fragment_tag++; + frag_tag = htons(lowpan_802154_dev(ldev)->fragment_tag); + lowpan_802154_dev(ldev)->fragment_tag++; frag_hdr[0] = LOWPAN_DISPATCH_FRAG1 | ((dgram_size >> 8) & 0x07); frag_hdr[1] = dgram_size & 0xff; @@ -208,7 +208,7 @@ err: static int lowpan_header(struct sk_buff *skb, struct net_device *ldev, u16 *dgram_size, u16 *dgram_offset) { - struct wpan_dev *wpan_dev = lowpan_dev_info(ldev)->wdev->ieee802154_ptr; + struct wpan_dev *wpan_dev = lowpan_802154_dev(ldev)->wdev->ieee802154_ptr; struct ieee802154_addr sa, da; struct ieee802154_mac_cb *cb = mac_cb_init(skb); struct lowpan_addr_info info; @@ -248,8 +248,8 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *ldev, cb->ackreq = wpan_dev->ackreq; } - return wpan_dev_hard_header(skb, lowpan_dev_info(ldev)->wdev, &da, &sa, - 0); + return wpan_dev_hard_header(skb, lowpan_802154_dev(ldev)->wdev, &da, + &sa, 0); } netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct 
net_device *ldev) @@ -283,7 +283,7 @@ netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev) max_single = ieee802154_max_payload(&wpan_hdr); if (skb_tail_pointer(skb) - skb_network_header(skb) <= max_single) { - skb->dev = lowpan_dev_info(ldev)->wdev; + skb->dev = lowpan_802154_dev(ldev)->wdev; ldev->stats.tx_packets++; ldev->stats.tx_bytes += dgram_size; return dev_queue_xmit(skb); diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 8035c93dd527..ca207dbf673b 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -1078,6 +1078,11 @@ static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info) if (netif_running(dev)) return -EBUSY; + if (wpan_dev->lowpan_dev) { + if (netif_running(wpan_dev->lowpan_dev)) + return -EBUSY; + } + /* don't change address fields on monitor */ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR || !info->attrs[NL802154_ATTR_PAN_ID]) @@ -1109,6 +1114,11 @@ static int nl802154_set_short_addr(struct sk_buff *skb, struct genl_info *info) if (netif_running(dev)) return -EBUSY; + if (wpan_dev->lowpan_dev) { + if (netif_running(wpan_dev->lowpan_dev)) + return -EBUSY; + } + /* don't change address fields on monitor */ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR || !info->attrs[NL802154_ATTR_SHORT_ADDR]) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index c34c7544d1db..89a8cac4726a 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -436,7 +436,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) if (IS_ERR(rt)) return 1; if (rt->dst.dev != dev) { - NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER); + __NET_INC_STATS(net, LINUX_MIB_ARPFILTER); flag = 1; } ip_rt_put(rt); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 8a9246deccfe..ef2ebeb89d0f 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -110,6 +110,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id) hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); return tb; } +EXPORT_SYMBOL_GPL(fib_new_table); /* caller must hold either rtnl or rcu read lock */ struct fib_table *fib_get_table(struct net *net, u32 id) @@ -904,7 +905,11 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (!prim) { - pr_warn("%s: bug: prim == NULL\n", __func__); + /* if the device has been deleted, we don't perform + * address promotion + */ + if (!in_dev->dead) + pr_warn("%s: bug: prim == NULL\n", __func__); return; } if (iprim && iprim != prim) { diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 7ac5ec87b004..eeec7d60e5fd 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -227,8 +227,6 @@ static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, int err = -ENOSYS; const struct net_offload **offloads; - udp_tunnel_gro_complete(skb, nhoff); - rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? 
inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); @@ -237,6 +235,8 @@ static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, err = ops->callbacks.gro_complete(skb, nhoff); + skb_set_inner_mac_header(skb, nhoff); + out_unlock: rcu_read_unlock(); @@ -412,6 +412,8 @@ static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) err = ops->callbacks.gro_complete(skb, nhoff + guehlen); + skb_set_inner_mac_header(skb, nhoff + guehlen); + out_unlock: rcu_read_unlock(); return err; diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index d9c552a721fc..d78e2eefc0f7 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -60,6 +60,67 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version) } EXPORT_SYMBOL_GPL(gre_del_protocol); +/* Fills in tpi and returns header length to be pulled. */ +int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, + bool *csum_err) +{ + const struct gre_base_hdr *greh; + __be32 *options; + int hdr_len; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) + return -EINVAL; + + greh = (struct gre_base_hdr *)skb_transport_header(skb); + if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) + return -EINVAL; + + tpi->flags = gre_flags_to_tnl_flags(greh->flags); + hdr_len = gre_calc_hlen(tpi->flags); + + if (!pskb_may_pull(skb, hdr_len)) + return -EINVAL; + + greh = (struct gre_base_hdr *)skb_transport_header(skb); + tpi->proto = greh->protocol; + + options = (__be32 *)(greh + 1); + if (greh->flags & GRE_CSUM) { + if (skb_checksum_simple_validate(skb)) { + *csum_err = true; + return -EINVAL; + } + + skb_checksum_try_convert(skb, IPPROTO_GRE, 0, + null_compute_pseudo); + options++; + } + + if (greh->flags & GRE_KEY) { + tpi->key = *options; + options++; + } else { + tpi->key = 0; + } + if (unlikely(greh->flags & GRE_SEQ)) { + tpi->seq = *options; + options++; + } else { + tpi->seq = 0; + } + /* WCCP version 1 and 2 protocol decoding. + * - Change protocol to IP + * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header + */ + if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { + tpi->proto = htons(ETH_P_IP); + if ((*(u8 *)options & 0xF0) != 0x40) + hdr_len += 4; + } + return hdr_len; +} +EXPORT_SYMBOL(gre_parse_header); + static int gre_rcv(struct sk_buff *skb) { const struct gre_protocol *proto; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 6333489771ed..38abe70e595f 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -363,7 +363,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, icmp_param->data_len+icmp_param->head_len, icmp_param->head_len, ipc, rt, MSG_DONTWAIT) < 0) { - ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_OUTERRORS); + __ICMP_INC_STATS(sock_net(sk), ICMP_MIB_OUTERRORS); ip_flush_pending_frames(sk); } else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { struct icmphdr *icmph = icmp_hdr(skb); @@ -744,7 +744,7 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info) * avoid additional coding at protocol handlers. 
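gre_parse_header(), introduced in gre_demux.c above, now only validates and decodes the header: it fills in the tnl_ptk_info and returns the GRE header length (or a negative errno), leaving it to the caller to decide when to pull that header. The resulting calling pattern, as also used by the reworked gre_rcv()/__ipgre_rcv() further down:

struct tnl_ptk_info tpi;
bool csum_err = false;
int hdr_len;

hdr_len = gre_parse_header(skb, &tpi, &csum_err);
if (hdr_len < 0)
        goto drop;      /* truncated header or bad GRE checksum */

if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false))
        goto drop;      /* header is pulled separately, e.g. after tunnel lookup */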
*/ if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) { - ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS); return; } @@ -865,7 +865,7 @@ static bool icmp_unreach(struct sk_buff *skb) out: return true; out_err: - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return false; } @@ -877,7 +877,7 @@ out_err: static bool icmp_redirect(struct sk_buff *skb) { if (skb->len < sizeof(struct iphdr)) { - ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS); return false; } @@ -956,7 +956,7 @@ static bool icmp_timestamp(struct sk_buff *skb) return true; out_err: - ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); return false; } @@ -996,7 +996,7 @@ int icmp_rcv(struct sk_buff *skb) skb_set_network_header(skb, nh); } - ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS); + __ICMP_INC_STATS(net, ICMP_MIB_INMSGS); if (skb_checksum_simple_validate(skb)) goto csum_error; @@ -1006,7 +1006,7 @@ int icmp_rcv(struct sk_buff *skb) icmph = icmp_hdr(skb); - ICMPMSGIN_INC_STATS_BH(net, icmph->type); + ICMPMSGIN_INC_STATS(net, icmph->type); /* * 18 is the highest 'known' ICMP type. Anything else is a mystery * @@ -1052,9 +1052,9 @@ drop: kfree_skb(skb); return 0; csum_error: - ICMP_INC_STATS_BH(net, ICMP_MIB_CSUMERRORS); + __ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS); error: - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); goto drop; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index ab69da2d2a77..fa8c39804bdb 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -427,7 +427,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, route_err: ip_rt_put(rt); no_route: - IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } EXPORT_SYMBOL_GPL(inet_csk_route_req); @@ -466,7 +466,7 @@ route_err: ip_rt_put(rt); no_route: rcu_read_unlock(); - IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); @@ -706,7 +706,9 @@ void inet_csk_destroy_sock(struct sock *sk) sk_refcnt_debug_release(sk); + local_bh_disable(); percpu_counter_dec(sk->sk_prot->orphan_count); + local_bh_enable(); sock_put(sk); } EXPORT_SYMBOL(inet_csk_destroy_sock); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index ad7956fa659a..25af1243649b 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -220,8 +220,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, } if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) { - attr = nla_reserve(skb, INET_DIAG_INFO, - handler->idiag_info_size); + attr = nla_reserve_64bit(skb, INET_DIAG_INFO, + handler->idiag_info_size, + INET_DIAG_PAD); if (!attr) goto errout; @@ -1078,7 +1079,9 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk) } attr = handler->idiag_info_size - ? nla_reserve(skb, INET_DIAG_INFO, handler->idiag_info_size) + ? 
nla_reserve_64bit(skb, INET_DIAG_INFO, + handler->idiag_info_size, + INET_DIAG_PAD) : NULL; if (attr) info = nla_data(attr); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index b76b0d7e59c1..77c20a489218 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -360,7 +360,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { sk_nulls_del_node_init_rcu((struct sock *)tw); - NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); + __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -438,6 +438,7 @@ static int inet_reuseport_add_sock(struct sock *sk, const struct sock *sk2, bool match_wildcard)) { + struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; struct sock *sk2; kuid_t uid = sock_i_uid(sk); @@ -446,6 +447,7 @@ static int inet_reuseport_add_sock(struct sock *sk, sk2->sk_family == sk->sk_family && ipv6_only_sock(sk2) == ipv6_only_sock(sk) && sk2->sk_bound_dev_if == sk->sk_bound_dev_if && + inet_csk(sk2)->icsk_bind_hash == tb && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && saddr_same(sk, sk2, false)) return reuseport_add_sock(sk, sk2); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index c67f9bd7699c..206581674806 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -94,7 +94,7 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, } /* - * Enter the time wait state. This is called with locally disabled BH. + * Enter the time wait state. * Essentially we whip up a timewait bucket, copy the relevant info into it * from the SK, and mess with hash chains and list linkage. */ @@ -112,7 +112,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, hashinfo->bhash_size)]; - spin_lock(&bhead->lock); + spin_lock_bh(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); @@ -138,7 +138,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); - spin_unlock(lock); + spin_unlock_bh(lock); } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); @@ -147,9 +147,9 @@ static void tw_timer_handler(unsigned long data) struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data; if (tw->tw_kill) - NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); + __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); else - NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED); + __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITED); inet_twsk_kill(tw); } diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index af18f1e4889e..cbfb1808fcc4 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -65,8 +65,8 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s { struct ip_options *opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(net, IPSTATS_MIB_OUTFORWDATAGRAMS); - IP_ADD_STATS_BH(net, IPSTATS_MIB_OUTOCTETS, skb->len); + __IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS); + __IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len); if (unlikely(opt->optlen)) ip_forward_options(skb); @@ -157,7 +157,7 @@ sr_failed: too_many_hops: /* Tell the sender its packet died... 
*/ - IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS); + __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: kfree_skb(skb); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index efbd47d1a531..bbe7f72db9c1 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -204,14 +204,14 @@ static void ip_expire(unsigned long arg) goto out; ipq_kill(qp); - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); + __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); if (!inet_frag_evicting(&qp->q)) { struct sk_buff *head = qp->q.fragments; const struct iphdr *iph; int err; - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); + __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments) goto out; @@ -291,7 +291,7 @@ static int ip_frag_too_far(struct ipq *qp) struct net *net; net = container_of(qp->q.net, struct net, ipv4.frags); - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); + __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); } return rc; @@ -635,7 +635,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, ip_send_check(iph); - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); + __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; qp->q.fragments_tail = NULL; return 0; @@ -647,7 +647,7 @@ out_nomem: out_oversize: net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); out_fail: - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); + __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); return err; } @@ -658,7 +658,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) int vif = l3mdev_master_ifindex_rcu(dev); struct ipq *qp; - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); + __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS); skb_orphan(skb); /* Lookup (or create) queue header */ @@ -675,7 +675,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) return ret; } - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); + __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -ENOMEM; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index eedd829a2f87..2b267e71ebf5 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -122,125 +122,6 @@ static int ipgre_tunnel_init(struct net_device *dev); static int ipgre_net_id __read_mostly; static int gre_tap_net_id __read_mostly; -static int ip_gre_calc_hlen(__be16 o_flags) -{ - int addend = 4; - - if (o_flags & TUNNEL_CSUM) - addend += 4; - if (o_flags & TUNNEL_KEY) - addend += 4; - if (o_flags & TUNNEL_SEQ) - addend += 4; - return addend; -} - -static __be16 gre_flags_to_tnl_flags(__be16 flags) -{ - __be16 tflags = 0; - - if (flags & GRE_CSUM) - tflags |= TUNNEL_CSUM; - if (flags & GRE_ROUTING) - tflags |= TUNNEL_ROUTING; - if (flags & GRE_KEY) - tflags |= TUNNEL_KEY; - if (flags & GRE_SEQ) - tflags |= TUNNEL_SEQ; - if (flags & GRE_STRICT) - tflags |= TUNNEL_STRICT; - if (flags & GRE_REC) - tflags |= TUNNEL_REC; - if (flags & GRE_VERSION) - tflags |= TUNNEL_VERSION; - - return tflags; -} - -static __be16 tnl_flags_to_gre_flags(__be16 tflags) -{ - __be16 flags = 0; - - if (tflags & TUNNEL_CSUM) - flags |= GRE_CSUM; - if (tflags & TUNNEL_ROUTING) - flags |= GRE_ROUTING; - if (tflags & TUNNEL_KEY) - flags |= GRE_KEY; - if (tflags & TUNNEL_SEQ) - flags |= GRE_SEQ; - if (tflags & TUNNEL_STRICT) - flags |= GRE_STRICT; - if (tflags & TUNNEL_REC) - flags |= GRE_REC; - if (tflags & TUNNEL_VERSION) - flags |= GRE_VERSION; - - return flags; -} - -static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, - bool 
*csum_err) -{ - const struct gre_base_hdr *greh; - __be32 *options; - int hdr_len; - - if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) - return -EINVAL; - - greh = (struct gre_base_hdr *)skb_transport_header(skb); - if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) - return -EINVAL; - - tpi->flags = gre_flags_to_tnl_flags(greh->flags); - hdr_len = ip_gre_calc_hlen(tpi->flags); - - if (!pskb_may_pull(skb, hdr_len)) - return -EINVAL; - - greh = (struct gre_base_hdr *)skb_transport_header(skb); - tpi->proto = greh->protocol; - - options = (__be32 *)(greh + 1); - if (greh->flags & GRE_CSUM) { - if (skb_checksum_simple_validate(skb)) { - *csum_err = true; - return -EINVAL; - } - - skb_checksum_try_convert(skb, IPPROTO_GRE, 0, - null_compute_pseudo); - options++; - } - - if (greh->flags & GRE_KEY) { - tpi->key = *options; - options++; - } else { - tpi->key = 0; - } - if (unlikely(greh->flags & GRE_SEQ)) { - tpi->seq = *options; - options++; - } else { - tpi->seq = 0; - } - /* WCCP version 1 and 2 protocol decoding. - * - Change protocol to IP - * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header - */ - if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { - tpi->proto = htons(ETH_P_IP); - if ((*(u8 *)options & 0xF0) != 0x40) { - hdr_len += 4; - if (!pskb_may_pull(skb, hdr_len)) - return -EINVAL; - } - } - return iptunnel_pull_header(skb, hdr_len, tpi->proto, false); -} - static void ipgre_err(struct sk_buff *skb, u32 info, const struct tnl_ptk_info *tpi) { @@ -341,7 +222,7 @@ static void gre_err(struct sk_buff *skb, u32 info) struct tnl_ptk_info tpi; bool csum_err = false; - if (parse_gre_header(skb, &tpi, &csum_err)) { + if (gre_parse_header(skb, &tpi, &csum_err) < 0) { if (!csum_err) /* ignore csum errors. */ return; } @@ -379,24 +260,22 @@ static __be32 tunnel_id_to_key(__be64 x) #endif } -static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) +static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, + struct ip_tunnel_net *itn, int hdr_len, bool raw_proto) { - struct net *net = dev_net(skb->dev); struct metadata_dst *tun_dst = NULL; - struct ip_tunnel_net *itn; const struct iphdr *iph; struct ip_tunnel *tunnel; - if (tpi->proto == htons(ETH_P_TEB)) - itn = net_generic(net, gre_tap_net_id); - else - itn = net_generic(net, ipgre_net_id); - iph = ip_hdr(skb); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, iph->saddr, iph->daddr, tpi->key); if (tunnel) { + if (__iptunnel_pull_header(skb, hdr_len, tpi->proto, + raw_proto, false) < 0) + goto drop; + skb_pop_mac_header(skb); if (tunnel->collect_md) { __be16 flags; @@ -412,13 +291,41 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); return PACKET_RCVD; } - return PACKET_REJECT; + return PACKET_NEXT; + +drop: + kfree_skb(skb); + return PACKET_RCVD; +} + +static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, + int hdr_len) +{ + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn; + int res; + + if (tpi->proto == htons(ETH_P_TEB)) + itn = net_generic(net, gre_tap_net_id); + else + itn = net_generic(net, ipgre_net_id); + + res = __ipgre_rcv(skb, tpi, itn, hdr_len, false); + if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) { + /* ipgre tunnels in collect metadata mode should receive + * also ETH_P_TEB traffic. 
+ */ + itn = net_generic(net, ipgre_net_id); + res = __ipgre_rcv(skb, tpi, itn, hdr_len, true); + } + return res; } static int gre_rcv(struct sk_buff *skb) { struct tnl_ptk_info tpi; bool csum_err = false; + int hdr_len; #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { @@ -428,10 +335,11 @@ static int gre_rcv(struct sk_buff *skb) } #endif - if (parse_gre_header(skb, &tpi, &csum_err) < 0) + hdr_len = gre_parse_header(skb, &tpi, &csum_err); + if (hdr_len < 0) goto drop; - if (ipgre_rcv(skb, &tpi) == PACKET_RCVD) + if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); @@ -440,49 +348,6 @@ drop: return 0; } -static __sum16 gre_checksum(struct sk_buff *skb) -{ - __wsum csum; - - if (skb->ip_summed == CHECKSUM_PARTIAL) - csum = lco_csum(skb); - else - csum = skb_checksum(skb, 0, skb->len, 0); - return csum_fold(csum); -} - -static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, - __be16 proto, __be32 key, __be32 seq) -{ - struct gre_base_hdr *greh; - - skb_push(skb, hdr_len); - - skb_reset_transport_header(skb); - greh = (struct gre_base_hdr *)skb->data; - greh->flags = tnl_flags_to_gre_flags(flags); - greh->protocol = proto; - - if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) { - __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); - - if (flags & TUNNEL_SEQ) { - *ptr = seq; - ptr--; - } - if (flags & TUNNEL_KEY) { - *ptr = key; - ptr--; - } - if (flags & TUNNEL_CSUM && - !(skb_shinfo(skb)->gso_type & - (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { - *ptr = 0; - *(__sum16 *)ptr = gre_checksum(skb); - } - } -} - static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, __be16 proto) @@ -493,8 +358,9 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, tunnel->o_seqno++; /* Push GRE header. */ - build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, - proto, tunnel->parms.o_key, htonl(tunnel->o_seqno)); + gre_build_header(skb, tunnel->tun_hlen, + tunnel->parms.o_flags, proto, tunnel->parms.o_key, + htonl(tunnel->o_seqno)); skb_set_inner_protocol(skb, proto); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); @@ -522,7 +388,8 @@ static struct rtable *gre_get_rt(struct sk_buff *skb, return ip_route_output_key(net, fl); } -static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) +static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, + __be16 proto) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; @@ -552,7 +419,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) fl.saddr); } - tunnel_hlen = ip_gre_calc_hlen(key->tun_flags); + tunnel_hlen = gre_calc_hlen(key->tun_flags); min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + tunnel_hlen + sizeof(struct iphdr); @@ -571,8 +438,8 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) goto err_free_rt; flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); - build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB), - tunnel_id_to_key(tun_info->key.tun_id), 0); + gre_build_header(skb, tunnel_hlen, flags, proto, + tunnel_id_to_key(tun_info->key.tun_id), 0); df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? 
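For reference, the gre_calc_hlen() helper these call sites switch to computes the same per-flag header length as the ip_gre_calc_hlen() function removed above; a restatement of that removed logic (illustrative only, not a quote of the shared helper):

#include <net/ip_tunnels.h>

/* GRE header length: 4-byte base plus 4 bytes per optional field. */
static int example_gre_hlen(__be16 o_flags)
{
	int hlen = 4;			/* flags + protocol */

	if (o_flags & TUNNEL_CSUM)	/* checksum + reserved */
		hlen += 4;
	if (o_flags & TUNNEL_KEY)	/* key */
		hlen += 4;
	if (o_flags & TUNNEL_SEQ)	/* sequence number */
		hlen += 4;
	return hlen;
}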
htons(IP_DF) : 0; @@ -612,7 +479,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, const struct iphdr *tnl_params; if (tunnel->collect_md) { - gre_fb_xmit(skb, dev); + gre_fb_xmit(skb, dev, skb->protocol); return NETDEV_TX_OK; } @@ -654,7 +521,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, struct ip_tunnel *tunnel = netdev_priv(dev); if (tunnel->collect_md) { - gre_fb_xmit(skb, dev); + gre_fb_xmit(skb, dev, htons(ETH_P_TEB)); return NETDEV_TX_OK; } @@ -694,8 +561,8 @@ static int ipgre_tunnel_ioctl(struct net_device *dev, if (err) return err; - p.i_flags = tnl_flags_to_gre_flags(p.i_flags); - p.o_flags = tnl_flags_to_gre_flags(p.o_flags); + p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags); + p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) return -EFAULT; @@ -739,7 +606,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph)); greh = (struct gre_base_hdr *)(iph+1); - greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags); + greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags); greh->protocol = htons(type); memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); @@ -840,7 +707,7 @@ static void __gre_tunnel_init(struct net_device *dev) int t_hlen; tunnel = netdev_priv(dev); - tunnel->tun_hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); + tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; @@ -885,7 +752,7 @@ static int ipgre_tunnel_init(struct net_device *dev) netif_keep_dst(dev); dev->addr_len = 4; - if (iph->daddr) { + if (iph->daddr && !tunnel->collect_md) { #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { if (!iph->saddr) @@ -894,8 +761,9 @@ static int ipgre_tunnel_init(struct net_device *dev) dev->header_ops = &ipgre_header_ops; } #endif - } else + } else if (!tunnel->collect_md) { dev->header_ops = &ipgre_header_ops; + } return ip_tunnel_init(dev); } @@ -938,6 +806,11 @@ static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) if (flags & (GRE_VERSION|GRE_ROUTING)) return -EINVAL; + if (data[IFLA_GRE_COLLECT_METADATA] && + data[IFLA_GRE_ENCAP_TYPE] && + nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE) + return -EINVAL; + return 0; } @@ -1155,8 +1028,10 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) struct ip_tunnel_parm *p = &t->parms; if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || - nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) || - nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) || + nla_put_be16(skb, IFLA_GRE_IFLAGS, + gre_tnl_flags_to_gre_flags(p->i_flags)) || + nla_put_be16(skb, IFLA_GRE_OFLAGS, + gre_tnl_flags_to_gre_flags(p->o_flags)) || nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) || diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index e3d782746d9d..4b351af3e67b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -218,17 +218,17 @@ static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_b protocol = -ret; goto resubmit; } - IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); + __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); } else { if (!raw) { if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS); + 
__IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } kfree_skb(skb); } else { - IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); + __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); consume_skb(skb); } } @@ -273,7 +273,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb) --ANK (980813) */ if (skb_cow(skb, skb_headroom(skb))) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto drop; } @@ -282,7 +282,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb) opt->optlen = iph->ihl*4 - sizeof(struct iphdr); if (ip_options_compile(dev_net(dev), opt, skb)) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); + __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS); goto drop; } @@ -313,6 +313,13 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip_rcv(skb); + if (!skb) + return NET_RX_SUCCESS; + if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk && @@ -337,7 +344,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) iph->tos, skb->dev); if (unlikely(err)) { if (err == -EXDEV) - NET_INC_STATS_BH(net, LINUX_MIB_IPRPFILTER); + __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); goto drop; } } @@ -358,9 +365,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { - IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len); + __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) { - IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len); + __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len); } else if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { struct in_device *in_dev = __in_dev_get_rcu(skb->dev); @@ -409,11 +416,11 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, net = dev_net(dev); - IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_IN, skb->len); + __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { - IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS); + __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto out; } @@ -439,9 +446,9 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); - IP_ADD_STATS_BH(net, - IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), - max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); + __IP_ADD_STATS(net, + IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), + max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; @@ -453,7 +460,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, len = ntohs(iph->tot_len); if (skb->len < len) { - IP_INC_STATS_BH(net, IPSTATS_MIB_INTRUNCATEDPKTS); + __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; @@ -463,7 +470,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, * Note this now means skb->len holds 
ntohs(iph->tot_len). */ if (pskb_trim_rcsum(skb, len)) { - IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS); + __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto drop; } @@ -471,6 +478,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, /* Remove any debris in the socket control block */ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + IPCB(skb)->iif = skb->skb_iif; /* Must drop socket now because of tproxy. */ skb_orphan(skb); @@ -480,9 +488,9 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, ip_rcv_finish); csum_error: - IP_INC_STATS_BH(net, IPSTATS_MIB_CSUMERRORS); + __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: - IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS); + __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: kfree_skb(skb); out: diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index bdb222c0c6a2..5805762d7fc7 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1193,7 +1193,12 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) ipv6_sk_rxinfo(sk); if (prepare && skb_rtable(skb)) { - pktinfo->ipi_ifindex = inet_iif(skb); + /* skb->cb is overloaded: prior to this point it is IP{6}CB + * which has interface index (iif) as the first member of the + * underlying inet{6}_skb_parm struct. This code then overlays + * PKTINFO_SKB_CB and in_pktinfo also has iif as the first + * element so the iif is picked up from the prior IPCB + */ pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { pktinfo->ipi_ifindex = 0; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 6aad0192443d..a69ed94bda1b 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -326,12 +326,12 @@ static int ip_tunnel_bind_dev(struct net_device *dev) if (!IS_ERR(rt)) { tdev = rt->dst.dev; - dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, - fl4.saddr); ip_rt_put(rt); } if (dev->type != ARPHRD_ETHER) dev->flags |= IFF_POINTOPOINT; + + dst_cache_reset(&tunnel->dst_cache); } if (!tdev && tunnel->parms.link) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 786fa7ca28e0..9118b0e640ba 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -157,7 +157,7 @@ int iptunnel_handle_offloads(struct sk_buff *skb, } if (skb_is_gso(skb)) { - err = skb_unclone(skb, GFP_ATOMIC); + err = skb_header_unclone(skb, GFP_ATOMIC); if (unlikely(err)) return err; skb_shinfo(skb)->gso_type |= gso_type_mask; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 5cf10b777b7e..a917903d5e97 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -156,6 +156,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ int err; + int mtu; if (!dst) { dev->stats.tx_carrier_errors++; @@ -192,6 +193,23 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, tunnel->err_count = 0; } + mtu = dst_mtu(dst); + if (skb->len > mtu) { + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + if (skb->protocol == htons(ETH_P_IP)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + } else { + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + } + + dst_release(dst); + goto tx_error; + } + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; diff --git a/net/ipv4/netfilter/arp_tables.c 
b/net/ipv4/netfilter/arp_tables.c index 60f5161abcb4..2033f929aa66 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -34,27 +34,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("David S. Miller <davem@redhat.com>"); MODULE_DESCRIPTION("arptables core"); -/*#define DEBUG_ARP_TABLES*/ -/*#define DEBUG_ARP_TABLES_USER*/ - -#ifdef DEBUG_ARP_TABLES -#define dprintf(format, args...) pr_debug(format, ## args) -#else -#define dprintf(format, args...) -#endif - -#ifdef DEBUG_ARP_TABLES_USER -#define duprintf(format, args...) pr_debug(format, ## args) -#else -#define duprintf(format, args...) -#endif - -#ifdef CONFIG_NETFILTER_DEBUG -#define ARP_NF_ASSERT(x) WARN_ON(!(x)) -#else -#define ARP_NF_ASSERT(x) -#endif - void *arpt_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(arpt, ARPT); @@ -113,36 +92,20 @@ static inline int arp_packet_match(const struct arphdr *arphdr, #define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg))) if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop, - ARPT_INV_ARPOP)) { - dprintf("ARP operation field mismatch.\n"); - dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n", - arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask); + ARPT_INV_ARPOP)) return 0; - } if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd, - ARPT_INV_ARPHRD)) { - dprintf("ARP hardware address format mismatch.\n"); - dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n", - arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask); + ARPT_INV_ARPHRD)) return 0; - } if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro, - ARPT_INV_ARPPRO)) { - dprintf("ARP protocol address format mismatch.\n"); - dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n", - arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask); + ARPT_INV_ARPPRO)) return 0; - } if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln, - ARPT_INV_ARPHLN)) { - dprintf("ARP hardware address length mismatch.\n"); - dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n", - arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask); + ARPT_INV_ARPHLN)) return 0; - } src_devaddr = arpptr; arpptr += dev->addr_len; @@ -155,49 +118,25 @@ static inline int arp_packet_match(const struct arphdr *arphdr, if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len), ARPT_INV_SRCDEVADDR) || FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len), - ARPT_INV_TGTDEVADDR)) { - dprintf("Source or target device address mismatch.\n"); - + ARPT_INV_TGTDEVADDR)) return 0; - } if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr, ARPT_INV_SRCIP) || FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr), - ARPT_INV_TGTIP)) { - dprintf("Source or target IP address mismatch.\n"); - - dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n", - &src_ipaddr, - &arpinfo->smsk.s_addr, - &arpinfo->src.s_addr, - arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : ""); - dprintf("TGT: %pI4 Mask: %pI4 Target: %pI4.%s\n", - &tgt_ipaddr, - &arpinfo->tmsk.s_addr, - &arpinfo->tgt.s_addr, - arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : ""); + ARPT_INV_TGTIP)) return 0; - } /* Look for ifname matches. */ ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask); - if (FWINV(ret != 0, ARPT_INV_VIA_IN)) { - dprintf("VIA in mismatch (%s vs %s).%s\n", - indev, arpinfo->iniface, - arpinfo->invflags & ARPT_INV_VIA_IN ? 
" (INV)" : ""); + if (FWINV(ret != 0, ARPT_INV_VIA_IN)) return 0; - } ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask); - if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) { - dprintf("VIA out mismatch (%s vs %s).%s\n", - outdev, arpinfo->outiface, - arpinfo->invflags & ARPT_INV_VIA_OUT ? " (INV)" : ""); + if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) return 0; - } return 1; #undef FWINV @@ -205,16 +144,10 @@ static inline int arp_packet_match(const struct arphdr *arphdr, static inline int arp_checkentry(const struct arpt_arp *arp) { - if (arp->flags & ~ARPT_F_MASK) { - duprintf("Unknown flag bits set: %08X\n", - arp->flags & ~ARPT_F_MASK); + if (arp->flags & ~ARPT_F_MASK) return 0; - } - if (arp->invflags & ~ARPT_INV_MASK) { - duprintf("Unknown invflag bits set: %08X\n", - arp->invflags & ~ARPT_INV_MASK); + if (arp->invflags & ~ARPT_INV_MASK) return 0; - } return 1; } @@ -406,11 +339,9 @@ static int mark_source_chains(const struct xt_table_info *newinfo, = (void *)arpt_get_target_c(e); int visited = e->comefrom & (1 << hook); - if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) { - pr_notice("arptables: loop hook %u pos %u %08X.\n", - hook, pos, e->comefrom); + if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) return 0; - } + e->comefrom |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS)); @@ -423,12 +354,8 @@ static int mark_source_chains(const struct xt_table_info *newinfo, if ((strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) { - duprintf("mark_source_chains: bad " - "negative verdict (%i)\n", - t->verdict); + t->verdict < -NF_MAX_VERDICT - 1) return 0; - } /* Return: backtrack through the last * big jump. @@ -462,8 +389,6 @@ static int mark_source_chains(const struct xt_table_info *newinfo, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. 
*/ - duprintf("Jump rule %u -> %u\n", - pos, newpos); e = (struct arpt_entry *) (entry0 + newpos); if (!find_jump_target(newinfo, e)) @@ -480,8 +405,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } -next: - duprintf("Finished chain %u\n", hook); +next: ; } return 1; } @@ -489,7 +413,6 @@ next: static inline int check_target(struct arpt_entry *e, const char *name) { struct xt_entry_target *t = arpt_get_target(e); - int ret; struct xt_tgchk_param par = { .table = name, .entryinfo = e, @@ -499,13 +422,7 @@ static inline int check_target(struct arpt_entry *e, const char *name) .family = NFPROTO_ARP, }; - ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); - if (ret < 0) { - duprintf("arp_tables: check failed for `%s'.\n", - t->u.kernel.target->name); - return ret; - } - return 0; + return xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); } static inline int @@ -513,17 +430,18 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) { struct xt_entry_target *t; struct xt_target *target; + unsigned long pcnt; int ret; - e->counters.pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(e->counters.pcnt)) + pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(pcnt)) return -ENOMEM; + e->counters.pcnt = pcnt; t = arpt_get_target(e); target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("find_check_entry: `%s' not found\n", t->u.user.name); ret = PTR_ERR(target); goto out; } @@ -569,17 +487,12 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || (unsigned char *)e + sizeof(struct arpt_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p\n", e); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset - < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) return -EINVAL; - } if (!arp_checkentry(&e->arp)) return -EINVAL; @@ -596,12 +509,9 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { - if (!check_underflow(e)) { - pr_debug("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + if (!check_underflow(e)) return -EINVAL; - } + newinfo->underflow[h] = underflows[h]; } } @@ -646,7 +556,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, newinfo->underflow[i] = 0xFFFFFFFF; } - duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. 
*/ @@ -663,31 +572,21 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } - duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); if (ret != 0) return ret; - if (i != repl->num_entries) { - duprintf("translate_table: %u not %u entries\n", - i, repl->num_entries); + if (i != repl->num_entries) return -EINVAL; - } /* Check hooks all assigned */ for (i = 0; i < NF_ARP_NUMHOOKS; i++) { /* Only hooks which are valid */ if (!(repl->valid_hooks & (1 << i))) continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) { - duprintf("Invalid hook entry %u %u\n", - i, repl->hook_entry[i]); + if (newinfo->hook_entry[i] == 0xFFFFFFFF) return -EINVAL; - } - if (newinfo->underflow[i] == 0xFFFFFFFF) { - duprintf("Invalid underflow %u %u\n", - i, repl->underflow[i]); + if (newinfo->underflow[i] == 0xFFFFFFFF) return -EINVAL; - } } if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) @@ -895,11 +794,8 @@ static int get_info(struct net *net, void __user *user, struct xt_table *t; int ret; - if (*len != sizeof(struct arpt_getinfo)) { - duprintf("length %u != %Zu\n", *len, - sizeof(struct arpt_getinfo)); + if (*len != sizeof(struct arpt_getinfo)) return -EINVAL; - } if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; @@ -955,33 +851,25 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, struct arpt_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("get_entries: %u < %Zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct arpt_get_entries) + get.size) { - duprintf("get_entries: %u != %Zu\n", *len, - sizeof(struct arpt_get_entries) + get.size); + if (*len != sizeof(struct arpt_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; - duprintf("t->private->number = %u\n", - private->number); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); - else { - duprintf("get_entries: I've got %u not %u!\n", - private->size, get.size); + else ret = -EAGAIN; - } + module_put(t->me); xt_table_unlock(t); } else @@ -1019,8 +907,6 @@ static int __do_replace(struct net *net, const char *name, /* You lied! 
*/ if (valid_hooks != t->valid_hooks) { - duprintf("Valid hook crap: %08X vs %08X\n", - valid_hooks, t->valid_hooks); ret = -EINVAL; goto put_module; } @@ -1030,8 +916,6 @@ static int __do_replace(struct net *net, const char *name, goto put_module; /* Update module usage count based on number of rules */ - duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", - oldinfo->number, oldinfo->initial_entries, newinfo->number); if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); @@ -1101,8 +985,6 @@ static int do_replace(struct net *net, const void __user *user, if (ret != 0) goto free_newinfo; - duprintf("arp_tables: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) @@ -1200,20 +1082,14 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, unsigned int entry_offset; int ret, off; - duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 || (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p, limit = %p\n", e, limit); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset < sizeof(struct compat_arpt_entry) + - sizeof(struct compat_xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + sizeof(struct compat_xt_entry_target)) return -EINVAL; - } if (!arp_checkentry(&e->arp)) return -EINVAL; @@ -1230,8 +1106,6 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", - t->u.user.name); ret = PTR_ERR(target); goto out; } @@ -1301,7 +1175,6 @@ static int translate_compat_table(struct xt_table_info **pinfo, size = compatr->size; info->number = compatr->num_entries; - duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(NFPROTO_ARP); xt_compat_init_offsets(NFPROTO_ARP, compatr->num_entries); @@ -1316,11 +1189,8 @@ static int translate_compat_table(struct xt_table_info **pinfo, } ret = -EINVAL; - if (j != compatr->num_entries) { - duprintf("translate_compat_table: %u not %u entries\n", - j, compatr->num_entries); + if (j != compatr->num_entries) goto out_unlock; - } ret = -ENOMEM; newinfo = xt_alloc_table_info(size); @@ -1411,8 +1281,6 @@ static int compat_do_replace(struct net *net, void __user *user, if (ret != 0) goto free_newinfo; - duprintf("compat_do_replace: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) @@ -1445,7 +1313,6 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, break; default: - duprintf("do_arpt_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1528,17 +1395,13 @@ static int compat_get_entries(struct net *net, struct compat_arpt_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct compat_arpt_get_entries) + get.size) { - duprintf("compat_get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct compat_arpt_get_entries) + get.size) return 
-EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(NFPROTO_ARP); @@ -1547,16 +1410,13 @@ static int compat_get_entries(struct net *net, const struct xt_table_info *private = t->private; struct xt_table_info info; - duprintf("t->private->number = %u\n", private->number); ret = compat_table_info(private, &info); if (!ret && get.size == info.size) { ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); - } else if (!ret) { - duprintf("compat_get_entries: I've got %u not %u!\n", - private->size, get.size); + } else if (!ret) ret = -EAGAIN; - } + xt_compat_flush_offsets(NFPROTO_ARP); module_put(t->me); xt_table_unlock(t); @@ -1608,7 +1468,6 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned break; default: - duprintf("do_arpt_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1651,7 +1510,6 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len } default: - duprintf("do_arpt_get_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1696,7 +1554,6 @@ int arpt_register_table(struct net *net, memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(newinfo, loc_cpu_entry, repl); - duprintf("arpt_register_table: translate table gives %d\n", ret); if (ret != 0) goto out_free; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 735d1ee8c1ab..54906e0e8e0c 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -35,34 +35,12 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("IPv4 packet filter"); -/*#define DEBUG_IP_FIREWALL*/ -/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ -/*#define DEBUG_IP_FIREWALL_USER*/ - -#ifdef DEBUG_IP_FIREWALL -#define dprintf(format, args...) pr_info(format , ## args) -#else -#define dprintf(format, args...) -#endif - -#ifdef DEBUG_IP_FIREWALL_USER -#define duprintf(format, args...) pr_info(format , ## args) -#else -#define duprintf(format, args...) -#endif - #ifdef CONFIG_NETFILTER_DEBUG #define IP_NF_ASSERT(x) WARN_ON(!(x)) #else #define IP_NF_ASSERT(x) #endif -#if 0 -/* All the better to debug you with... */ -#define static -#define inline -#endif - void *ipt_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(ipt, IPT); @@ -85,52 +63,28 @@ ip_packet_match(const struct iphdr *ip, if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr, IPT_INV_SRCIP) || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr, - IPT_INV_DSTIP)) { - dprintf("Source or dest mismatch.\n"); - - dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n", - &ip->saddr, &ipinfo->smsk.s_addr, &ipinfo->src.s_addr, - ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : ""); - dprintf("DST: %pI4 Mask: %pI4 Target: %pI4.%s\n", - &ip->daddr, &ipinfo->dmsk.s_addr, &ipinfo->dst.s_addr, - ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : ""); + IPT_INV_DSTIP)) return false; - } ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask); - if (FWINV(ret != 0, IPT_INV_VIA_IN)) { - dprintf("VIA in mismatch (%s vs %s).%s\n", - indev, ipinfo->iniface, - ipinfo->invflags & IPT_INV_VIA_IN ? " (INV)" : ""); + if (FWINV(ret != 0, IPT_INV_VIA_IN)) return false; - } ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask); - if (FWINV(ret != 0, IPT_INV_VIA_OUT)) { - dprintf("VIA out mismatch (%s vs %s).%s\n", - outdev, ipinfo->outiface, - ipinfo->invflags & IPT_INV_VIA_OUT ? 
" (INV)" : ""); + if (FWINV(ret != 0, IPT_INV_VIA_OUT)) return false; - } /* Check specific protocol */ if (ipinfo->proto && - FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) { - dprintf("Packet protocol %hi does not match %hi.%s\n", - ip->protocol, ipinfo->proto, - ipinfo->invflags & IPT_INV_PROTO ? " (INV)" : ""); + FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) return false; - } /* If we have a fragment rule but the packet is not a fragment * then we return zero */ - if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) { - dprintf("Fragment rule but not fragment.%s\n", - ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : ""); + if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) return false; - } return true; } @@ -138,16 +92,10 @@ ip_packet_match(const struct iphdr *ip, static bool ip_checkentry(const struct ipt_ip *ip) { - if (ip->flags & ~IPT_F_MASK) { - duprintf("Unknown flag bits set: %08X\n", - ip->flags & ~IPT_F_MASK); + if (ip->flags & ~IPT_F_MASK) return false; - } - if (ip->invflags & ~IPT_INV_MASK) { - duprintf("Unknown invflag bits set: %08X\n", - ip->invflags & ~IPT_INV_MASK); + if (ip->invflags & ~IPT_INV_MASK) return false; - } return true; } @@ -346,10 +294,6 @@ ipt_do_table(struct sk_buff *skb, e = get_entry(table_base, private->hook_entry[hook]); - pr_debug("Entering %s(hook %u), UF %p\n", - table->name, hook, - get_entry(table_base, private->underflow[hook])); - do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; @@ -396,22 +340,15 @@ ipt_do_table(struct sk_buff *skb, if (stackidx == 0) { e = get_entry(table_base, private->underflow[hook]); - pr_debug("Underflow (this is normal) " - "to %p\n", e); } else { e = jumpstack[--stackidx]; - pr_debug("Pulled %p out from pos %u\n", - e, stackidx); e = ipt_next_entry(e); } continue; } if (table_base + v != ipt_next_entry(e) && - !(e->ip.flags & IPT_F_GOTO)) { + !(e->ip.flags & IPT_F_GOTO)) jumpstack[stackidx++] = e; - pr_debug("Pushed %p into pos %u\n", - e, stackidx - 1); - } e = get_entry(table_base, v); continue; @@ -429,18 +366,13 @@ ipt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!acpar.hotdrop); - pr_debug("Exiting %s; sp at %u\n", __func__, stackidx); xt_write_recseq_end(addend); local_bh_enable(); -#ifdef DEBUG_ALLOW_ALL - return NF_ACCEPT; -#else if (acpar.hotdrop) return NF_DROP; else return verdict; -#endif } static bool find_jump_target(const struct xt_table_info *t, @@ -480,11 +412,9 @@ mark_source_chains(const struct xt_table_info *newinfo, = (void *)ipt_get_target_c(e); int visited = e->comefrom & (1 << hook); - if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { - pr_err("iptables: loop hook %u pos %u %08X.\n", - hook, pos, e->comefrom); + if (e->comefrom & (1 << NF_INET_NUMHOOKS)) return 0; - } + e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ @@ -496,26 +426,13 @@ mark_source_chains(const struct xt_table_info *newinfo, if ((strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) { - duprintf("mark_source_chains: bad " - "negative verdict (%i)\n", - t->verdict); + t->verdict < -NF_MAX_VERDICT - 1) return 0; - } /* Return: backtrack through the last big jump. 
*/ do { e->comefrom ^= (1<<NF_INET_NUMHOOKS); -#ifdef DEBUG_IP_FIREWALL_USER - if (e->comefrom - & (1 << NF_INET_NUMHOOKS)) { - duprintf("Back unset " - "on hook %u " - "rule %u\n", - hook, pos); - } -#endif oldpos = pos; pos = e->counters.pcnt; e->counters.pcnt = 0; @@ -543,8 +460,6 @@ mark_source_chains(const struct xt_table_info *newinfo, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ - duprintf("Jump rule %u -> %u\n", - pos, newpos); e = (struct ipt_entry *) (entry0 + newpos); if (!find_jump_target(newinfo, e)) @@ -561,8 +476,7 @@ mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } -next: - duprintf("Finished chain %u\n", hook); +next: ; } return 1; } @@ -584,18 +498,12 @@ static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { const struct ipt_ip *ip = par->entryinfo; - int ret; par->match = m->u.kernel.match; par->matchinfo = m->data; - ret = xt_check_match(par, m->u.match_size - sizeof(*m), - ip->proto, ip->invflags & IPT_INV_PROTO); - if (ret < 0) { - duprintf("check failed for `%s'.\n", par->match->name); - return ret; - } - return 0; + return xt_check_match(par, m->u.match_size - sizeof(*m), + ip->proto, ip->invflags & IPT_INV_PROTO); } static int @@ -606,10 +514,8 @@ find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name, m->u.user.revision); - if (IS_ERR(match)) { - duprintf("find_check_match: `%s' not found\n", m->u.user.name); + if (IS_ERR(match)) return PTR_ERR(match); - } m->u.kernel.match = match; ret = check_match(m, par); @@ -634,16 +540,9 @@ static int check_target(struct ipt_entry *e, struct net *net, const char *name) .hook_mask = e->comefrom, .family = NFPROTO_IPV4, }; - int ret; - ret = xt_check_target(&par, t->u.target_size - sizeof(*t), - e->ip.proto, e->ip.invflags & IPT_INV_PROTO); - if (ret < 0) { - duprintf("check failed for `%s'.\n", - t->u.kernel.target->name); - return ret; - } - return 0; + return xt_check_target(&par, t->u.target_size - sizeof(*t), + e->ip.proto, e->ip.invflags & IPT_INV_PROTO); } static int @@ -656,10 +555,12 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; + unsigned long pcnt; - e->counters.pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(e->counters.pcnt)) + pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(pcnt)) return -ENOMEM; + e->counters.pcnt = pcnt; j = 0; mtpar.net = net; @@ -678,7 +579,6 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("find_check_entry: `%s' not found\n", t->u.user.name); ret = PTR_ERR(target); goto cleanup_matches; } @@ -732,17 +632,12 @@ check_entry_size_and_hooks(struct ipt_entry *e, if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || (unsigned char *)e + sizeof(struct ipt_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p\n", e); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset - < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) return -EINVAL; - } if (!ip_checkentry(&e->ip)) return -EINVAL; @@ -759,12 +654,9 @@ check_entry_size_and_hooks(struct ipt_entry *e, if ((unsigned char *)e - 
base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { - if (!check_underflow(e)) { - pr_debug("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + if (!check_underflow(e)) return -EINVAL; - } + newinfo->underflow[h] = underflows[h]; } } @@ -816,7 +708,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, newinfo->underflow[i] = 0xFFFFFFFF; } - duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { @@ -833,27 +724,18 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, ++newinfo->stacksize; } - if (i != repl->num_entries) { - duprintf("translate_table: %u not %u entries\n", - i, repl->num_entries); + if (i != repl->num_entries) return -EINVAL; - } /* Check hooks all assigned */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { /* Only hooks which are valid */ if (!(repl->valid_hooks & (1 << i))) continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) { - duprintf("Invalid hook entry %u %u\n", - i, repl->hook_entry[i]); + if (newinfo->hook_entry[i] == 0xFFFFFFFF) return -EINVAL; - } - if (newinfo->underflow[i] == 0xFFFFFFFF) { - duprintf("Invalid underflow %u %u\n", - i, repl->underflow[i]); + if (newinfo->underflow[i] == 0xFFFFFFFF) return -EINVAL; - } } if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) @@ -1081,11 +963,8 @@ static int get_info(struct net *net, void __user *user, struct xt_table *t; int ret; - if (*len != sizeof(struct ipt_getinfo)) { - duprintf("length %u != %zu\n", *len, - sizeof(struct ipt_getinfo)); + if (*len != sizeof(struct ipt_getinfo)) return -EINVAL; - } if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; @@ -1143,31 +1022,23 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, struct ipt_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct ipt_get_entries) + get.size) { - duprintf("get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct ipt_get_entries) + get.size) return -EINVAL; - } get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; - duprintf("t->private->number = %u\n", private->number); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); - else { - duprintf("get_entries: I've got %u not %u!\n", - private->size, get.size); + else ret = -EAGAIN; - } + module_put(t->me); xt_table_unlock(t); } else @@ -1203,8 +1074,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, /* You lied! 
*/ if (valid_hooks != t->valid_hooks) { - duprintf("Valid hook crap: %08X vs %08X\n", - valid_hooks, t->valid_hooks); ret = -EINVAL; goto put_module; } @@ -1214,8 +1083,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, goto put_module; /* Update module usage count based on number of rules */ - duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", - oldinfo->number, oldinfo->initial_entries, newinfo->number); if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); @@ -1284,8 +1151,6 @@ do_replace(struct net *net, const void __user *user, unsigned int len) if (ret != 0) goto free_newinfo; - duprintf("Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) @@ -1411,11 +1276,9 @@ compat_find_calc_match(struct xt_entry_match *m, match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name, m->u.user.revision); - if (IS_ERR(match)) { - duprintf("compat_check_calc_match: `%s' not found\n", - m->u.user.name); + if (IS_ERR(match)) return PTR_ERR(match); - } + m->u.kernel.match = match; *size += xt_compat_match_offset(match); return 0; @@ -1447,20 +1310,14 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, unsigned int j; int ret, off; - duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 || (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p, limit = %p\n", e, limit); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset < sizeof(struct compat_ipt_entry) + - sizeof(struct compat_xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + sizeof(struct compat_xt_entry_target)) return -EINVAL; - } if (!ip_checkentry(&e->ip)) return -EINVAL; @@ -1484,8 +1341,6 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", - t->u.user.name); ret = PTR_ERR(target); goto release_matches; } @@ -1567,7 +1422,6 @@ translate_compat_table(struct net *net, size = compatr->size; info->number = compatr->num_entries; - duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(AF_INET); xt_compat_init_offsets(AF_INET, compatr->num_entries); @@ -1582,11 +1436,8 @@ translate_compat_table(struct net *net, } ret = -EINVAL; - if (j != compatr->num_entries) { - duprintf("translate_compat_table: %u not %u entries\n", - j, compatr->num_entries); + if (j != compatr->num_entries) goto out_unlock; - } ret = -ENOMEM; newinfo = xt_alloc_table_info(size); @@ -1683,8 +1534,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) if (ret != 0) goto free_newinfo; - duprintf("compat_do_replace: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) @@ -1718,7 +1567,6 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, break; default: - duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1768,19 +1616,15 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, struct compat_ipt_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("compat_get_entries: %u < %zu\n", 
*len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct compat_ipt_get_entries) + get.size) { - duprintf("compat_get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct compat_ipt_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET); @@ -1788,16 +1632,13 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; - duprintf("t->private->number = %u\n", private->number); ret = compat_table_info(private, &info); - if (!ret && get.size == info.size) { + if (!ret && get.size == info.size) ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); - } else if (!ret) { - duprintf("compat_get_entries: I've got %u not %u!\n", - private->size, get.size); + else if (!ret) ret = -EAGAIN; - } + xt_compat_flush_offsets(AF_INET); module_put(t->me); xt_table_unlock(t); @@ -1850,7 +1691,6 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) break; default: - duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1902,7 +1742,6 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } default: - duprintf("do_ipt_get_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -2004,7 +1843,6 @@ icmp_match(const struct sk_buff *skb, struct xt_action_param *par) /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ - duprintf("Dropping evil ICMP tinygram.\n"); par->hotdrop = true; return false; } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index e3c46e8e2762..ae1a71a97132 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -360,7 +360,7 @@ static int ipv4_init_net(struct net *net) in->ctl_table[0].data = &nf_conntrack_max; in->ctl_table[1].data = &net->ct.count; - in->ctl_table[2].data = &net->ct.htable_size; + in->ctl_table[2].data = &nf_conntrack_htable_size; in->ctl_table[3].data = &net->ct.sysctl_checksum; in->ctl_table[4].data = &net->ct.sysctl_log_invalid; #endif diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index f0dfe92a00d6..c6f3c406f707 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -31,15 +31,14 @@ struct ct_iter_state { static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) { - struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < net->ct.htable_size; + st->bucket < nf_conntrack_htable_size; st->bucket++) { n = rcu_dereference( - hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); + hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -49,17 +48,16 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct hlist_nulls_node *head) { - struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= 
net->ct.htable_size) + if (++st->bucket >= nf_conntrack_htable_size) return NULL; } head = rcu_dereference( - hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); + hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); } return head; } @@ -114,6 +112,23 @@ static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) } #endif +static bool ct_seq_should_skip(const struct nf_conn *ct, + const struct net *net, + const struct nf_conntrack_tuple_hash *hash) +{ + /* we only want to print DIR_ORIGINAL */ + if (NF_CT_DIRECTION(hash)) + return true; + + if (nf_ct_l3num(ct) != AF_INET) + return true; + + if (!net_eq(nf_ct_net(ct), net)) + return true; + + return false; +} + static int ct_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_tuple_hash *hash = v; @@ -123,14 +138,15 @@ static int ct_seq_show(struct seq_file *s, void *v) int ret = 0; NF_CT_ASSERT(ct); - if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) + if (ct_seq_should_skip(ct, seq_file_net(s), hash)) return 0; + if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) + return 0; - /* we only want to print DIR_ORIGINAL */ - if (NF_CT_DIRECTION(hash)) - goto release; - if (nf_ct_l3num(ct) != AF_INET) + /* check if we raced w. object reuse */ + if (!nf_ct_is_confirmed(ct) || + ct_seq_should_skip(ct, seq_file_net(s), hash)) goto release; l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); @@ -220,13 +236,12 @@ struct ct_expect_iter_state { static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { - struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { n = rcu_dereference( - hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); if (n) return n; } @@ -236,7 +251,6 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { - struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(hlist_next_rcu(head)); @@ -244,7 +258,7 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq, if (++st->bucket >= nf_ct_expect_hsize) return NULL; head = rcu_dereference( - hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); } return head; } @@ -285,6 +299,9 @@ static int exp_seq_show(struct seq_file *s, void *v) exp = hlist_entry(n, struct nf_conntrack_expect, hnode); + if (!net_eq(nf_ct_net(exp->master), seq_file_net(s))) + return 0; + if (exp->tuple.src.l3num != AF_INET) return 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 60398a9370e7..a1f2830d8110 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -915,11 +915,11 @@ static int ip_error(struct sk_buff *skb) if (!IN_DEV_FORWARD(in_dev)) { switch (rt->dst.error) { case EHOSTUNREACH: - IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS); + __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS); break; case ENETUNREACH: - IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); + __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES); break; } goto out; @@ -934,7 +934,7 @@ static int ip_error(struct sk_buff *skb) break; case ENETUNREACH: code = ICMP_NET_UNREACH; - IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); + __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES); break; case EACCES: code = ICMP_PKT_FILTERED; @@ -2146,6 +2146,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, 
unsigned int flags = 0; struct fib_result res; struct rtable *rth; + int master_idx; int orig_oif; int err = -ENETUNREACH; @@ -2155,6 +2156,9 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, orig_oif = fl4->flowi4_oif; + master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif); + if (master_idx) + fl4->flowi4_oif = master_idx; fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_tos = tos & IPTOS_RT_MASK; fl4->flowi4_scope = ((tos & RTO_ONLINK) ? diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 4c04f09338e3..e3c4043c27de 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -312,11 +312,11 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) mss = __cookie_v4_check(ip_hdr(skb), th, cookie); if (mss == 0) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4d73858991af..5c7ed147449c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -430,14 +430,15 @@ EXPORT_SYMBOL(tcp_init_sock); static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) { - if (sk->sk_tsflags || tsflags) { + if (tsflags) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags); - if (shinfo->tx_flags & SKBTX_ANY_TSTAMP) + if (tsflags & SOF_TIMESTAMPING_TX_ACK) + tcb->txstamp_ack = 1; + if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; - tcb->txstamp_ack = !!(shinfo->tx_flags & SKBTX_ACK_TSTAMP); } } @@ -908,7 +909,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, int copy, i; bool can_coalesce; - if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { + if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 || + !tcp_skb_can_collapse_to(skb)) { new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -1082,6 +1084,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) struct sockcm_cookie sockc; int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; + bool process_backlog = false; bool sg; long timeo; @@ -1134,11 +1137,12 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) /* This should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - mss_now = tcp_send_mss(sk, &size_goal, flags); - /* Ok commence sending. */ copied = 0; +restart: + mss_now = tcp_send_mss(sk, &size_goal, flags); + err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto out_err; @@ -1156,7 +1160,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) copy = max - skb->len; } - if (copy <= 0) { + if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { new_segment: /* Allocate new segment. If the interface is SG, * allocate skb fitting to single page. @@ -1164,6 +1168,10 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; + if (process_backlog && sk_flush_backlog(sk)) { + process_backlog = false; + goto restart; + } skb = sk_stream_alloc_skb(sk, select_size(sk, sg), sk->sk_allocation, @@ -1171,6 +1179,7 @@ new_segment: if (!skb) goto wait_for_memory; + process_backlog = true; /* * Check whether we can use HW checksum. 
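The tcp_tx_timestamp() rework above keys the per-skb timestamp state purely off the tsflags argument, which here can come from a per-call control message (sockc.tsflags) instead of only the socket-wide sk->sk_tsflags. A hedged userspace-side sketch of requesting an ACK timestamp for a single write via such a control message; the helper name is made up and the cmsg-based SO_TIMESTAMPING interface is assumed to be available on this kernel generation:

#include <string.h>
#include <sys/socket.h>
#include <linux/net_tstamp.h>

/* Hypothetical helper: ask for a software ACK timestamp for this write only. */
static ssize_t send_with_ack_tstamp(int fd, const void *buf, size_t len)
{
	char control[CMSG_SPACE(sizeof(__u32))] = { 0 };
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = control,
		.msg_controllen = sizeof(control),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
	__u32 tsflags = SOF_TIMESTAMPING_TX_ACK | SOF_TIMESTAMPING_SOFTWARE;

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SO_TIMESTAMPING;
	cmsg->cmsg_len = CMSG_LEN(sizeof(tsflags));
	memcpy(CMSG_DATA(cmsg), &tsflags, sizeof(tsflags));

	return sendmsg(fd, &msg, 0);
}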
*/ @@ -1250,6 +1259,8 @@ new_segment: copied += copy; if (!msg_data_left(msg)) { tcp_tx_timestamp(sk, sockc.tsflags, skb); + if (unlikely(flags & MSG_EOR)) + TCP_SKB_CB(skb)->eor = 1; goto out; } @@ -1443,14 +1454,10 @@ static void tcp_prequeue_process(struct sock *sk) struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); - NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED); - /* RX process wants to run with disabled BHs, though it is not - * necessary */ - local_bh_disable(); while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) sk_backlog_rcv(sk, skb); - local_bh_enable(); /* Clear memory counter. */ tp->ucopy.memory = 0; @@ -1777,7 +1784,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, chunk = len - tp->ucopy.len; if (chunk != 0) { - NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); len -= chunk; copied += chunk; } @@ -1789,7 +1796,7 @@ do_prequeue: chunk = len - tp->ucopy.len; if (chunk != 0) { - NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); len -= chunk; copied += chunk; } @@ -1875,7 +1882,7 @@ skip_copy: tcp_prequeue_process(sk); if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); len -= chunk; copied += chunk; } @@ -2065,13 +2072,13 @@ void tcp_close(struct sock *sk, long timeout) sk->sk_prot->disconnect(sk, 0); } else if (data_was_unread) { /* Unread data was tossed, zap the connection. */ - NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, sk->sk_allocation); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); - NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); } else if (tcp_close_state(sk)) { /* We FIN if the application ate all the data before * zapping the connection. 
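The MSG_EOR handling added above (setting TCP_SKB_CB(skb)->eor on the last skb of a write) pairs with the new collapse guards elsewhere in this diff: data queued after an EOR-marked write is never merged into it, so the record boundary (and, for example, a per-write ACK timestamp key) stays on the intended byte. Minimal user-space illustration, error handling omitted:

#include <stddef.h>
#include <sys/socket.h>

static void send_two_records(int fd, const char *a, size_t alen,
			     const char *b, size_t blen)
{
	/* Record 1: its final skb is flagged eor and will not be collapsed
	 * with later data, even if both records end up being retransmitted.
	 */
	send(fd, a, alen, MSG_EOR);

	/* Record 2: starts in a fresh skb rather than being appended. */
	send(fd, b, blen, MSG_EOR);
}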
@@ -2148,7 +2155,7 @@ adjudge_to_death: if (tp->linger2 < 0) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); - NET_INC_STATS_BH(sock_net(sk), + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONLINGER); } else { const int tmo = tcp_fin_time(sk); @@ -2167,7 +2174,7 @@ adjudge_to_death: if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); - NET_INC_STATS_BH(sock_net(sk), + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); } } @@ -3091,7 +3098,7 @@ void tcp_done(struct sock *sk) struct request_sock *req = tcp_sk(sk)->fastopen_rsk; if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index fd1405d37c14..36087bca9f48 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -197,15 +197,15 @@ static void bictcp_state(struct sock *sk, u8 new_state) /* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ -static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt) +static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_state == TCP_CA_Open) { struct bictcp *ca = inet_csk_ca(sk); - cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; - ca->delayed_ack += cnt; + ca->delayed_ack += sample->pkts_acked - + (ca->delayed_ack >> ACK_RATIO_SHIFT); } } diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 167b6a3e1b98..03725b294286 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -155,11 +155,11 @@ static void tcp_cdg_hystart_update(struct sock *sk) ca->last_ack = now_us; if (after(now_us, ca->round_start + base_owd)) { - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPHYSTARTTRAINDETECT); - NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_TCPHYSTARTTRAINCWND, - tp->snd_cwnd); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINDETECT); + NET_ADD_STATS(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINCWND, + tp->snd_cwnd); tp->snd_ssthresh = tp->snd_cwnd; return; } @@ -174,11 +174,11 @@ static void tcp_cdg_hystart_update(struct sock *sk) 125U); if (ca->rtt.min > thresh) { - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPHYSTARTDELAYDETECT); - NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_TCPHYSTARTDELAYCWND, - tp->snd_cwnd); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYDETECT); + NET_ADD_STATS(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYCWND, + tp->snd_cwnd); tp->snd_ssthresh = tp->snd_cwnd; } } @@ -294,12 +294,12 @@ static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked) ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr); } -static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us) +static void tcp_cdg_acked(struct sock *sk, const struct ack_sample *sample) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - if (rtt_us <= 0) + if (sample->rtt_us <= 0) return; /* A heuristic for filtering delayed ACKs, adapted from: @@ -307,20 +307,20 @@ static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us) * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010. */ if (tp->sacked_out == 0) { - if (num_acked == 1 && ca->delack) { + if (sample->pkts_acked == 1 && ca->delack) { /* A delayed ACK is only used for the minimum if it is * provenly lower than an existing non-zero minimum. 
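The congestion-control conversions above and below all follow the same pattern: the ->pkts_acked() hook now receives a single const struct ack_sample * instead of separate count and RTT arguments. The authoritative definition lives in the tcp headers, not in this hunk; from the two fields the converted modules read, it is approximately the following, shown with a skeleton callback:

#include <stdint.h>

/* Approximation of struct ack_sample (see include/net/tcp.h for the real one). */
struct ack_sample {
	uint32_t pkts_acked;	/* packets newly acknowledged by this ACK */
	int32_t  rtt_us;	/* RTT sample in microseconds, negative if none */
};

struct sock;			/* opaque for this sketch */

/* New-style hook: one struct, so fields can be added later without touching
 * every congestion-control module again.
 */
static void example_pkts_acked(struct sock *sk, const struct ack_sample *sample)
{
	(void)sk;

	if (sample->rtt_us < 0)
		return;		/* e.g. a duplicate ACK carried no RTT sample */

	/* ... update the module's RTT / delayed-ACK state from the sample ... */
}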
*/ - ca->rtt.min = min(ca->rtt.min, rtt_us); + ca->rtt.min = min(ca->rtt.min, sample->rtt_us); ca->delack--; return; - } else if (num_acked > 1 && ca->delack < 5) { + } else if (sample->pkts_acked > 1 && ca->delack < 5) { ca->delack++; } } - ca->rtt.min = min_not_zero(ca->rtt.min, rtt_us); - ca->rtt.max = max(ca->rtt.max, rtt_us); + ca->rtt.min = min_not_zero(ca->rtt.min, sample->rtt_us); + ca->rtt.max = max(ca->rtt.max, sample->rtt_us); } static u32 tcp_cdg_ssthresh(struct sock *sk) diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 448c2615fece..c99230efcd52 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -402,11 +402,11 @@ static void hystart_update(struct sock *sk, u32 delay) ca->last_ack = now; if ((s32)(now - ca->round_start) > ca->delay_min >> 4) { ca->found |= HYSTART_ACK_TRAIN; - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPHYSTARTTRAINDETECT); - NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_TCPHYSTARTTRAINCWND, - tp->snd_cwnd); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINDETECT); + NET_ADD_STATS(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINCWND, + tp->snd_cwnd); tp->snd_ssthresh = tp->snd_cwnd; } } @@ -423,11 +423,11 @@ static void hystart_update(struct sock *sk, u32 delay) if (ca->curr_rtt > ca->delay_min + HYSTART_DELAY_THRESH(ca->delay_min >> 3)) { ca->found |= HYSTART_DELAY; - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPHYSTARTDELAYDETECT); - NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_TCPHYSTARTDELAYCWND, - tp->snd_cwnd); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYDETECT); + NET_ADD_STATS(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYCWND, + tp->snd_cwnd); tp->snd_ssthresh = tp->snd_cwnd; } } @@ -437,21 +437,21 @@ static void hystart_update(struct sock *sk, u32 delay) /* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ -static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) +static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); u32 delay; /* Some calls are for duplicates without timetamps */ - if (rtt_us < 0) + if (sample->rtt_us < 0) return; /* Discard delay samples right after fast recovery */ if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ) return; - delay = (rtt_us << 3) / USEC_PER_MSEC; + delay = (sample->rtt_us << 3) / USEC_PER_MSEC; if (delay == 0) delay = 1; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index cffd8f9ed1a9..54d9f9b0120f 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -255,9 +255,9 @@ static bool tcp_fastopen_queue_check(struct sock *sk) spin_lock(&fastopenq->lock); req1 = fastopenq->rskq_rst_head; if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) { + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); spin_unlock(&fastopenq->lock); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); return false; } fastopenq->rskq_rst_head = req1->dl_next; @@ -282,7 +282,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct sock *child; if (foc->len == 0) /* Client requests a cookie */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && (syn_data || foc->len >= 0) && @@ -311,13 +311,13 @@ fastopen: child = tcp_fastopen_create_child(sk, skb, dst, req); if (child) { foc->len = -1; - NET_INC_STATS_BH(sock_net(sk), - 
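In the hystart hunks above only the statistics macros change; the unit conversion is untouched. bictcp_acked() keeps its RTT samples in 1/8 millisecond units: shift the microsecond value left by 3, divide by USEC_PER_MSEC, and clamp to at least 1 so a sub-125 us sample is not mistaken for "no sample". The same arithmetic as a tiny helper with a worked value:

#include <stdint.h>

#define USEC_PER_MSEC 1000U

/* Convert an RTT in microseconds to the 1/8-millisecond units used by the
 * CUBIC/hystart code above, clamped to a minimum of 1.
 */
static uint32_t rtt_us_to_eighth_ms(int32_t rtt_us)
{
	uint32_t delay = ((uint32_t)rtt_us << 3) / USEC_PER_MSEC;

	return delay ? delay : 1;	/* e.g. 5000 us -> 40, i.e. 5 ms */
}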
LINUX_MIB_TCPFASTOPENPASSIVE); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVE); return child; } - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } else if (foc->len > 0) /* Client presents an invalid cookie */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); valid_foc.exp = foc->exp; *foc = valid_foc; diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 82f0d9ed60f5..4a4d8e76738f 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -99,7 +99,7 @@ static inline void measure_rtt(struct sock *sk, u32 srtt) } static void measure_achieved_throughput(struct sock *sk, - u32 pkts_acked, s32 rtt) + const struct ack_sample *sample) { const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); @@ -107,10 +107,10 @@ static void measure_achieved_throughput(struct sock *sk, u32 now = tcp_time_stamp; if (icsk->icsk_ca_state == TCP_CA_Open) - ca->pkts_acked = pkts_acked; + ca->pkts_acked = sample->pkts_acked; - if (rtt > 0) - measure_rtt(sk, usecs_to_jiffies(rtt)); + if (sample->rtt_us > 0) + measure_rtt(sk, usecs_to_jiffies(sample->rtt_us)); if (!use_bandwidth_switch) return; @@ -122,7 +122,7 @@ static void measure_achieved_throughput(struct sock *sk, return; } - ca->packetcount += pkts_acked; + ca->packetcount += sample->pkts_acked; if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) && now - ca->lasttime >= ca->minRTT && diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 2ab9bbb6faff..c8e6d86be114 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -82,30 +82,31 @@ static void tcp_illinois_init(struct sock *sk) } /* Measure RTT for each ack. 
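The statistics-macro churn throughout this diff follows one convention, inferred from the usage here rather than quoted from the headers: the _BH and _USER suffixes are dropped, the plain forms such as NET_INC_STATS() are safe from any context, and the double-underscore forms such as __NET_INC_STATS() are the cheaper variants that require preemption or BHs to already be disabled. That is also the likely reason the TCPFASTOPENLISTENOVERFLOW bump above moves before spin_unlock(&fastopenq->lock): inside the lock the __ variant's requirement is met. A hypothetical expansion just to illustrate the difference (not the real header):

/* Plain form: this_cpu_inc() is interrupt safe and may be used anywhere. */
#define NET_INC_STATS(net, field) \
	this_cpu_inc((net)->mib.net_statistics->mibs[field])

/* Double-underscore form: __this_cpu_inc() assumes the caller already runs
 * with preemption (and, where relevant, BHs) disabled.
 */
#define __NET_INC_STATS(net, field) \
	__this_cpu_inc((net)->mib.net_statistics->mibs[field])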
*/ -static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, s32 rtt) +static void tcp_illinois_acked(struct sock *sk, const struct ack_sample *sample) { struct illinois *ca = inet_csk_ca(sk); + s32 rtt_us = sample->rtt_us; - ca->acked = pkts_acked; + ca->acked = sample->pkts_acked; /* dup ack, no rtt sample */ - if (rtt < 0) + if (rtt_us < 0) return; /* ignore bogus values, this prevents wraparound in alpha math */ - if (rtt > RTT_MAX) - rtt = RTT_MAX; + if (rtt_us > RTT_MAX) + rtt_us = RTT_MAX; /* keep track of minimum RTT seen so far */ - if (ca->base_rtt > rtt) - ca->base_rtt = rtt; + if (ca->base_rtt > rtt_us) + ca->base_rtt = rtt_us; /* and max */ - if (ca->max_rtt < rtt) - ca->max_rtt = rtt; + if (ca->max_rtt < rtt_us) + ca->max_rtt = rtt_us; ++ca->cnt_rtt; - ca->sum_rtt += rtt; + ca->sum_rtt += rtt_us; } /* Maximum queuing delay */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 967520dbe0bf..d6c8f4cd0800 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -869,7 +869,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, else mib_idx = LINUX_MIB_TCPSACKREORDER; - NET_INC_STATS_BH(sock_net(sk), mib_idx); + NET_INC_STATS(sock_net(sk), mib_idx); #if FASTRETRANS_DEBUG > 1 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, @@ -1062,7 +1062,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { dup_sack = true; tcp_dsack_seen(tp); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV); } else if (num_sacks > 1) { u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); @@ -1071,7 +1071,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, !before(start_seq_0, start_seq_1)) { dup_sack = true; tcp_dsack_seen(tp); - NET_INC_STATS_BH(sock_net(sk), + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV); } } @@ -1289,7 +1289,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, if (skb->len > 0) { BUG_ON(!tcp_skb_pcount(skb)); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED); return false; } @@ -1303,6 +1303,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, } TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; + TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) TCP_SKB_CB(prev)->end_seq++; @@ -1313,7 +1314,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED); return true; } @@ -1368,6 +1369,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) goto fallback; + if (!tcp_skb_can_collapse_to(prev)) + goto fallback; + in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && !before(end_seq, TCP_SKB_CB(skb)->end_seq); @@ -1469,7 +1473,7 @@ noop: return skb; fallback: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK); + NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK); return NULL; } @@ -1657,7 +1661,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, mib_idx = LINUX_MIB_TCPSACKDISCARD; } - NET_INC_STATS_BH(sock_net(sk), 
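Several hunks above and below guard skb merging with tcp_skb_can_collapse_to(), whose body is not part of this diff. From the way it is used together with the new eor bit, it is presumably little more than the following kernel-context sketch (an assumption, not a quote from the tree):

static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb)
{
	/* Never append data behind an skb that ends a MSG_EOR record. */
	return !TCP_SKB_CB(skb)->eor;
}

Correspondingly, tcp_shifted_skb() above now copies the eor bit onto the skb that absorbs the shifted data, so the mark survives SACK-driven shifting.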
mib_idx); + NET_INC_STATS(sock_net(sk), mib_idx); if (i == 0) first_sack_index = -1; continue; @@ -1909,7 +1913,7 @@ void tcp_enter_loss(struct sock *sk) skb = tcp_write_queue_head(sk); is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); if (is_reneg) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; tp->fackets_out = 0; } @@ -2395,7 +2399,7 @@ static bool tcp_try_undo_recovery(struct sock *sk) else mib_idx = LINUX_MIB_TCPFULLUNDO; - NET_INC_STATS_BH(sock_net(sk), mib_idx); + NET_INC_STATS(sock_net(sk), mib_idx); } if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { /* Hold old state until something *above* high_seq @@ -2417,7 +2421,7 @@ static bool tcp_try_undo_dsack(struct sock *sk) if (tp->undo_marker && !tp->undo_retrans) { DBGUNDO(sk, "D-SACK"); tcp_undo_cwnd_reduction(sk, false); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); return true; } return false; @@ -2432,10 +2436,10 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) tcp_undo_cwnd_reduction(sk, true); DBGUNDO(sk, "partial loss"); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); if (frto_undo) - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPSPURIOUSRTOS); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; if (frto_undo || tcp_is_sack(tp)) tcp_set_ca_state(sk, TCP_CA_Open); @@ -2559,7 +2563,7 @@ static void tcp_mtup_probe_failed(struct sock *sk) icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1; icsk->icsk_mtup.probe_size = 0; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPFAIL); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL); } static void tcp_mtup_probe_success(struct sock *sk) @@ -2579,7 +2583,7 @@ static void tcp_mtup_probe_success(struct sock *sk) icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; icsk->icsk_mtup.probe_size = 0; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS); } /* Do a simple retransmit without using the backoff mechanisms in @@ -2643,7 +2647,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) else mib_idx = LINUX_MIB_TCPSACKRECOVERY; - NET_INC_STATS_BH(sock_net(sk), mib_idx); + NET_INC_STATS(sock_net(sk), mib_idx); tp->prior_ssthresh = 0; tcp_init_undo(tp); @@ -2736,7 +2740,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked) DBGUNDO(sk, "partial recovery"); tcp_undo_cwnd_reduction(sk, true); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); tcp_try_keep_open(sk); return true; } @@ -3087,8 +3091,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, return; shinfo = skb_shinfo(skb); - if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) && - !before(shinfo->tskey, prior_snd_una) && + if (!before(shinfo->tskey, prior_snd_una) && before(shinfo->tskey, tcp_sk(sk)->snd_una)) __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); } @@ -3245,8 +3248,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tcp_rearm_rto(sk); } - if (icsk->icsk_ca_ops->pkts_acked) - icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us); + if (icsk->icsk_ca_ops->pkts_acked) { + struct ack_sample sample = { .pkts_acked = pkts_acked, + .rtt_us = ca_rtt_us }; + + 
icsk->icsk_ca_ops->pkts_acked(sk, &sample); + } #if FASTRETRANS_DEBUG > 0 WARN_ON((int)tp->sacked_out < 0); @@ -3352,9 +3359,10 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) { u32 delta = ack - tp->snd_una; - u64_stats_update_begin(&tp->syncp); + sock_owned_by_me((struct sock *)tp); + u64_stats_update_begin_raw(&tp->syncp); tp->bytes_acked += delta; - u64_stats_update_end(&tp->syncp); + u64_stats_update_end_raw(&tp->syncp); tp->snd_una = ack; } @@ -3363,9 +3371,10 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) { u32 delta = seq - tp->rcv_nxt; - u64_stats_update_begin(&tp->syncp); + sock_owned_by_me((struct sock *)tp); + u64_stats_update_begin_raw(&tp->syncp); tp->bytes_received += delta; - u64_stats_update_end(&tp->syncp); + u64_stats_update_end_raw(&tp->syncp); tp->rcv_nxt = seq; } @@ -3431,7 +3440,7 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { - NET_INC_STATS_BH(net, mib_idx); + NET_INC_STATS(net, mib_idx); return true; /* rate-limited: don't send yet! */ } } @@ -3464,7 +3473,7 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) challenge_count = 0; } if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } } @@ -3513,8 +3522,8 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) tcp_set_ca_state(sk, TCP_CA_CWR); tcp_end_cwnd_reduction(sk); tcp_try_keep_open(sk); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPLOSSPROBERECOVERY); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPLOSSPROBERECOVERY); } else if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP | FLAG_DATA_SACKED))) { /* Pure dupack: original and TLP probe arrived; no loss */ @@ -3618,14 +3627,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { u32 ack_ev_flags = CA_ACK_SLOWPATH; if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA; else - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS); flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); @@ -4128,7 +4137,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) else mib_idx = LINUX_MIB_TCPDSACKOFOSENT; - NET_INC_STATS_BH(sock_net(sk), mib_idx); + NET_INC_STATS(sock_net(sk), mib_idx); tp->rx_opt.dsack = 1; tp->duplicate_sack[0].start_seq = seq; @@ -4152,7 +4161,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); + NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk); if (tcp_is_sack(tp) && sysctl_tcp_dsack) { @@ -4302,7 +4311,7 @@ static bool tcp_try_coalesce(struct sock *sk, atomic_add(delta, &sk->sk_rmem_alloc); sk_mem_charge(sk, delta); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; @@ -4390,7 +4399,7 @@ static 
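In tcp_snd_una_update()/tcp_rcv_nxt_update() above the serialization story becomes explicit: the byte counters are only ever written by whoever owns the socket lock, sock_owned_by_me() asserts exactly that, and the *_raw seqcount helpers appear to be chosen because, with this series, the writer may now run in preemptible process context instead of under BH protection. The user-space analogue is the usual "document the locking contract with a checked assertion" pattern; a small pthread sketch, with the owner field assumed to be maintained by lock/unlock wrappers that are not shown:

#include <assert.h>
#include <pthread.h>

struct byte_counter {
	pthread_mutex_t lock;
	pthread_t owner;		/* set by the lock wrapper, cleared on unlock */
	unsigned long long bytes;	/* written only while lock is held */
};

/* Analogue of sock_owned_by_me(): cheap, checked documentation that the
 * caller really is the current lock owner.
 */
static void counter_assert_owned(const struct byte_counter *c)
{
	assert(pthread_equal(c->owner, pthread_self()));
}

static void counter_add(struct byte_counter *c, unsigned long long delta)
{
	counter_assert_owned(c);
	c->bytes += delta;		/* safe: serialized by c->lock */
}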
void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) tcp_ecn_check_ce(tp, skb); if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); tcp_drop(sk, skb); return; } @@ -4399,7 +4408,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) tp->pred_flags = 0; inet_csk_schedule_ack(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); @@ -4454,7 +4463,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); tcp_drop(sk, skb); skb = NULL; tcp_dsack_set(sk, seq, end_seq); @@ -4493,7 +4502,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) __skb_unlink(skb1, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); tcp_drop(sk, skb1); } @@ -4608,14 +4617,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) __set_current_state(TASK_RUNNING); - local_bh_enable(); if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) { tp->ucopy.len -= chunk; tp->copied_seq += chunk; eaten = (chunk == skb->len); tcp_rcv_space_adjust(sk); } - local_bh_disable(); } if (eaten <= 0) { @@ -4658,7 +4665,7 @@ queue_and_out: if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { /* A retransmit, 2nd most common case. Force an immediate ack. */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); + NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); out_of_window: @@ -4704,7 +4711,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, __skb_unlink(skb, list); __kfree_skb(skb); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); return next; } @@ -4863,7 +4870,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk) bool res = false; if (!skb_queue_empty(&tp->out_of_order_queue)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); __skb_queue_purge(&tp->out_of_order_queue); /* Reset SACK state. A conforming SACK implementation will @@ -4892,7 +4899,7 @@ static int tcp_prune_queue(struct sock *sk) SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); @@ -4922,7 +4929,7 @@ static int tcp_prune_queue(struct sock *sk) * drop receive data on the floor. It will get retransmitted * and hopefully then we'll have sufficient space. */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED); /* Massive buffer overcommit. 
*/ tp->pred_flags = 0; @@ -5131,7 +5138,6 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) int chunk = skb->len - hlen; int err; - local_bh_enable(); if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk); else @@ -5143,32 +5149,9 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) tcp_rcv_space_adjust(sk); } - local_bh_disable(); return err; } -static __sum16 __tcp_checksum_complete_user(struct sock *sk, - struct sk_buff *skb) -{ - __sum16 result; - - if (sock_owned_by_user(sk)) { - local_bh_enable(); - result = __tcp_checksum_complete(skb); - local_bh_disable(); - } else { - result = __tcp_checksum_complete(skb); - } - return result; -} - -static inline bool tcp_checksum_complete_user(struct sock *sk, - struct sk_buff *skb) -{ - return !skb_csum_unnecessary(skb) && - __tcp_checksum_complete_user(sk, skb); -} - /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ @@ -5181,7 +5164,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && tcp_paws_discard(sk, skb)) { if (!th->rst) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDPAWS, &tp->last_oow_ack_time)) @@ -5233,8 +5216,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (th->syn) { syn_challenge: if (syn_inerr) - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); + TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); tcp_send_challenge_ack(sk, skb); goto discard; } @@ -5349,7 +5332,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_data_snd_check(sk); return; } else { /* Header too small */ - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; } } else { @@ -5377,12 +5360,13 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, __skb_pull(skb, tcp_header_len); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPHPHITSTOUSER); eaten = 1; } } if (!eaten) { - if (tcp_checksum_complete_user(sk, skb)) + if (tcp_checksum_complete(skb)) goto csum_error; if ((int)skb->truesize > sk->sk_forward_alloc) @@ -5399,7 +5383,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_rcv_rtt_measure_ts(sk, skb); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ eaten = tcp_queue_rcv(sk, skb, tcp_header_len, @@ -5426,7 +5410,7 @@ no_ack: } slow_path: - if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) + if (len < (th->doff << 2) || tcp_checksum_complete(skb)) goto csum_error; if (!th->ack && !th->rst && !th->syn) @@ -5456,8 +5440,8 @@ step5: return; csum_error: - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); discard: tcp_drop(sk, skb); @@ -5549,12 +5533,14 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, break; } tcp_rearm_rto(sk); - 
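The deleted __tcp_checksum_complete_user()/tcp_checksum_complete_user() helpers above existed only to re-enable BHs around checksum verification when a segment was processed on behalf of a user context; with this series that path no longer runs with BHs disabled, so the plain helper suffices. For reference, its assumed shape (kernel-context sketch, not quoted from this diff):

static inline bool tcp_checksum_complete(struct sk_buff *skb)
{
	return !skb_csum_unnecessary(skb) &&	/* not already verified earlier */
	       __tcp_checksum_complete(skb);	/* fold and verify it now */
}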
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVEFAIL); return true; } tp->syn_data_acked = tp->syn_data; if (tp->syn_data_acked) - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVE); tcp_fastopen_add_skb(sk, synack); @@ -5589,7 +5575,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, tcp_time_stamp)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_PAWSACTIVEREJECTED); goto reset_and_undo; } @@ -5958,7 +5945,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { tcp_done(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return 1; } @@ -6015,7 +6002,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (sk->sk_shutdown & RCV_SHUTDOWN) { if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); tcp_reset(sk); return 1; } @@ -6153,10 +6140,10 @@ static bool tcp_syn_flood_action(const struct sock *sk, if (net->ipv4.sysctl_tcp_syncookies) { msg = "Sending cookies"; want_cookie = true; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); } else #endif - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); if (!queue->synflood_warned && net->ipv4.sysctl_tcp_syncookies != 2 && @@ -6217,7 +6204,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, * timeout. */ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); goto drop; } @@ -6264,7 +6251,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (dst && strict && !tcp_peer_is_proven(req, dst, true, tmp_opt.saw_tstamp)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); goto drop_and_release; } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d2a5763e5abc..8219d0d8dc83 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -320,7 +320,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) * an established socket here. */ if (seq != tcp_rsk(req)->snt_isn) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); } else if (abort) { /* * Still in SYN_RECV, just remove it silently. 
@@ -372,7 +372,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) th->dest, iph->saddr, ntohs(th->source), inet_iif(icmp_skb)); if (!sk) { - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; } if (sk->sk_state == TCP_TIME_WAIT) { @@ -396,13 +396,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) */ if (sock_owned_by_user(sk)) { if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) - NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); + __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); } if (sk->sk_state == TCP_CLOSE) goto out; if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { - NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto out; } @@ -413,7 +413,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -692,13 +692,15 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) offsetof(struct inet_timewait_sock, tw_bound_dev_if)); arg.tos = ip_hdr(skb)->tos; + local_bh_disable(); ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); - TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); - TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); + __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); + __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); + local_bh_enable(); #ifdef CONFIG_TCP_MD5SIG out: @@ -774,12 +776,14 @@ static void tcp_v4_send_ack(struct net *net, if (oif) arg.bound_dev_if = oif; arg.tos = tos; + local_bh_disable(); ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); - TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); + __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); + local_bh_enable(); } static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) @@ -1151,12 +1155,12 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, return false; if (hash_expected && !hash_location) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); return true; } if (!hash_expected && hash_location) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); return true; } @@ -1342,7 +1346,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, return newsk; exit_overflow: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); exit_nonewsk: dst_release(dst); exit: @@ -1432,8 +1436,8 @@ discard: return 0; csum_err: - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; } EXPORT_SYMBOL(tcp_v4_do_rcv); @@ -1506,16 +1510,16 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) __skb_queue_tail(&tp->ucopy.prequeue, skb); tp->ucopy.memory += skb->truesize; - if (tp->ucopy.memory > sk->sk_rcvbuf) { + if (skb_queue_len(&tp->ucopy.prequeue) >= 32 || + tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { struct sk_buff *skb1; BUG_ON(sock_owned_by_user(sk)); + 
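The local_bh_disable()/local_bh_enable() pairs added around ip_send_unicast_reply() above follow the same theme: tcp_v4_send_reset() and tcp_v4_send_ack() can now be reached from process context, yet they use strictly per-cpu resources (the per-cpu tcp_sk control socket and the __TCP_INC_STATS counters), so the per-cpu section is bracketed explicitly. A schematic kernel-context sketch of the shape, body elided:

static void reply_from_any_context(struct net *net)
{
	local_bh_disable();	/* pin the CPU; the per-cpu socket and the
				 * __ stats variants both rely on this
				 */
	/* ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), ...);
	 * __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	 */
	local_bh_enable();
}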
__NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED, + skb_queue_len(&tp->ucopy.prequeue)); - while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) { + while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) sk_backlog_rcv(sk, skb1); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPPREQUEUEDROPPED); - } tp->ucopy.memory = 0; } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { @@ -1547,7 +1551,7 @@ int tcp_v4_rcv(struct sk_buff *skb) goto discard_it; /* Count it even if it's bad */ - TCP_INC_STATS_BH(net, TCP_MIB_INSEGS); + __TCP_INC_STATS(net, TCP_MIB_INSEGS); if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; @@ -1629,7 +1633,7 @@ process: } } if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { - NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; } @@ -1662,7 +1666,7 @@ process: } else if (unlikely(sk_add_backlog(sk, skb, sk->sk_rcvbuf + sk->sk_sndbuf))) { bh_unlock_sock(sk); - NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); + __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP); goto discard_and_relse; } bh_unlock_sock(sk); @@ -1679,9 +1683,9 @@ no_tcp_socket: if (tcp_checksum_complete(skb)) { csum_error: - TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); + __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: - TCP_INC_STATS_BH(net, TCP_MIB_INERRS); + __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { tcp_v4_send_reset(NULL, skb); } @@ -1835,7 +1839,9 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_free_fastopen_req(tp); tcp_saved_syn_free(tp); + local_bh_disable(); sk_sockets_allocated_dec(sk); + local_bh_enable(); if (mem_cgroup_sockets_enabled && sk->sk_memcg) sock_release_memcg(sk); diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 1e70fa8fa793..c67ece1390c2 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -260,13 +260,13 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt) * newReno in increase case. * We work it out by following the idea from TCP-LP's paper directly */ -static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us) +static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); - if (rtt_us > 0) - tcp_lp_rtt_sample(sk, rtt_us); + if (sample->rtt_us > 0) + tcp_lp_rtt_sample(sk, sample->rtt_us); /* calc inference */ if (tcp_time_stamp > tp->rx_opt.rcv_tsecr) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4c53e7c86586..4b95ec4ed2c8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -235,7 +235,7 @@ kill: } if (paws_reject) - NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); + __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. @@ -337,7 +337,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) * socket up. We've got bigger problems than * non-graceful socket closings. 
*/ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW); } tcp_update_metrics(sk); @@ -545,7 +545,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rack.mstamp.v64 = 0; newtp->rack.advanced = 0; - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); } return newsk; } @@ -710,7 +710,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); if (paws_reject) - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); return NULL; } @@ -729,7 +729,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * "fourth, check the SYN bit" */ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); goto embryonic_reset; } @@ -752,7 +752,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); return NULL; } @@ -791,7 +791,7 @@ embryonic_reset: } if (!fastopen) { inet_csk_reqsk_queue_drop(sk, req); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); } return NULL; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9d3b4b364652..8daefd8b1b49 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -949,7 +949,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_orphan(skb); skb->sk = sk; - skb->destructor = skb_is_tcp_pure_ack(skb) ? sock_wfree : tcp_wfree; + skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree; skb_set_hash_from_sk(skb, sk); atomic_add(skb->truesize, &sk->sk_wmem_alloc); @@ -1111,11 +1111,17 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de tcp_verify_left_out(tp); } +static bool tcp_has_tx_tstamp(const struct sk_buff *skb) +{ + return TCP_SKB_CB(skb)->txstamp_ack || + (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP); +} + static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) { struct skb_shared_info *shinfo = skb_shinfo(skb); - if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) && + if (unlikely(tcp_has_tx_tstamp(skb)) && !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) { struct skb_shared_info *shinfo2 = skb_shinfo(skb2); u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP; @@ -1128,6 +1134,12 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) } } +static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) +{ + TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor; + TCP_SKB_CB(skb)->eor = 0; +} + /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. 
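One detail worth spelling out for the fragmentation hunks below: when an EOR-marked skb is split, tcp_skb_fragment_eor() above moves the mark to the second half, so the record boundary keeps pointing at the last byte of the original write:

    before split:  [ seq ................... end_seq )   eor = 1
    after split:   [ seq .. seq+len ) eor = 0   +   [ seq+len .. end_seq ) eor = 1

The same invariant is preserved in the opposite direction by tcp_collapse_retrans() further below, which copies the eor bit of the absorbed skb onto the surviving one.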
@@ -1173,6 +1185,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); TCP_SKB_CB(buff)->tcp_flags = flags; TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; + tcp_skb_fragment_eor(skb, buff); if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { /* Copy and checksum data tail into the new buffer. */ @@ -1733,6 +1746,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* This packet was never sent out yet, so no SACK bits. */ TCP_SKB_CB(buff)->sacked = 0; + tcp_skb_fragment_eor(skb, buff); + buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL; skb_split(skb, buff, len); tcp_fragment_tstamp(skb, buff); @@ -2206,14 +2221,13 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Thanks to skb fast clones, we can detect if a prior transmit of * a packet is still in a qdisc or driver queue. * In this case, there is very little point doing a retransmit ! - * Note: This is called from BH context only. */ static bool skb_still_in_host_queue(const struct sock *sk, const struct sk_buff *skb) { if (unlikely(skb_fclone_busy(sk, skb))) { - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); return true; } return false; @@ -2275,7 +2289,7 @@ void tcp_send_loss_probe(struct sock *sk) tp->tlp_high_seq = tp->snd_nxt; probe_sent: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); /* Reset s.t. tcp_rearm_rto will restart timer from now */ inet_csk(sk)->icsk_pending = 0; rearm_timer: @@ -2446,13 +2460,12 @@ u32 __tcp_select_window(struct sock *sk) void tcp_skb_collapse_tstamp(struct sk_buff *skb, const struct sk_buff *next_skb) { - const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb); - u8 tsflags = next_shinfo->tx_flags & SKBTX_ANY_TSTAMP; - - if (unlikely(tsflags)) { + if (unlikely(tcp_has_tx_tstamp(next_skb))) { + const struct skb_shared_info *next_shinfo = + skb_shinfo(next_skb); struct skb_shared_info *shinfo = skb_shinfo(skb); - shinfo->tx_flags |= tsflags; + shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP; shinfo->tskey = next_shinfo->tskey; TCP_SKB_CB(skb)->txstamp_ack |= TCP_SKB_CB(next_skb)->txstamp_ack; @@ -2494,6 +2507,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) * packet counting does not break. */ TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS; + TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor; /* changed transmit queue under us so clear hints */ tcp_clear_retrans_hints_partial(tp); @@ -2545,6 +2559,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (!tcp_can_collapse(sk, skb)) break; + if (!tcp_skb_can_collapse_to(to)) + break; + space -= skb->len; if (first) { @@ -2656,7 +2673,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) /* Update global TCP statistics. 
*/ TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); tp->total_retrans += segs; } return err; @@ -2681,7 +2698,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) tp->retrans_stamp = tcp_skb_timestamp(skb); } else if (err != -EBUSY) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } if (tp->undo_retrans < 0) @@ -2805,7 +2822,7 @@ begin_fwd: if (tcp_retransmit_skb(sk, skb, segs)) return; - NET_INC_STATS_BH(sock_net(sk), mib_idx); + NET_INC_STATS(sock_net(sk), mib_idx); if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); @@ -3042,7 +3059,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, th->window = htons(min(req->rsk_rcv_wnd, 65535U)); tcp_options_write((__be32 *)(th + 1), NULL, &opts); th->doff = (tcp_header_size >> 2); - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ @@ -3540,8 +3557,8 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) tcp_rsk(req)->txhash = net_tx_rndhash(); res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL); if (!res) { - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); } return res; } diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 5353085fd0b2..e36df4fcfeba 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -65,8 +65,8 @@ int tcp_rack_mark_lost(struct sock *sk) if (scb->sacked & TCPCB_SACKED_RETRANS) { scb->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPLOSTRETRANSMIT); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPLOSTRETRANSMIT); } } else if (!(scb->sacked & TCPCB_RETRANS)) { /* Original data are sent sequentially so stop early diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 373b03e78aaa..debdd8b33e69 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -30,7 +30,7 @@ static void tcp_write_err(struct sock *sk) sk->sk_error_report(sk); tcp_done(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); } /* Do not allow orphaned sockets to eat all our resources. @@ -68,7 +68,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) if (do_reset) tcp_send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; } return 0; @@ -162,8 +162,8 @@ static int tcp_write_timeout(struct sock *sk) if (tp->syn_fastopen || tp->syn_data) tcp_fastopen_cache_set(sk, 0, NULL, true, 0); if (tp->syn_data && icsk->icsk_retransmits == 1) - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVEFAIL); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVEFAIL); } retry_until = icsk->icsk_syn_retries ? 
: net->ipv4.sysctl_tcp_syn_retries; syn_set = true; @@ -178,8 +178,8 @@ static int tcp_write_timeout(struct sock *sk) tp->bytes_acked <= tp->rx_opt.mss_clamp) { tcp_fastopen_cache_set(sk, 0, NULL, true, 0); if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1) - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVEFAIL); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVEFAIL); } /* Black hole detection */ tcp_mtu_probing(icsk, sk); @@ -209,6 +209,7 @@ static int tcp_write_timeout(struct sock *sk) return 0; } +/* Called with BH disabled */ void tcp_delack_timer_handler(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -228,7 +229,7 @@ void tcp_delack_timer_handler(struct sock *sk) if (!skb_queue_empty(&tp->ucopy.prequeue)) { struct sk_buff *skb; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) sk_backlog_rcv(sk, skb); @@ -248,7 +249,7 @@ void tcp_delack_timer_handler(struct sock *sk) icsk->icsk_ack.ato = TCP_ATO_MIN; } tcp_send_ack(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } out: @@ -265,7 +266,7 @@ static void tcp_delack_timer(unsigned long data) tcp_delack_timer_handler(sk); } else { inet_csk(sk)->icsk_ack.blocked = 1; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) sock_hold(sk); @@ -431,7 +432,7 @@ void tcp_retransmit_timer(struct sock *sk) } else { mib_idx = LINUX_MIB_TCPTIMEOUTS; } - NET_INC_STATS_BH(sock_net(sk), mib_idx); + __NET_INC_STATS(sock_net(sk), mib_idx); } tcp_enter_loss(sk); @@ -493,6 +494,7 @@ out_reset_timer: out:; } +/* Called with BH disabled */ void tcp_write_timer_handler(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -549,7 +551,7 @@ void tcp_syn_ack_timeout(const struct request_sock *req) { struct net *net = read_pnet(&inet_rsk(req)->ireq_net); - NET_INC_STATS_BH(net, LINUX_MIB_TCPTIMEOUTS); + __NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS); } EXPORT_SYMBOL(tcp_syn_ack_timeout); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 13951c4087d4..4c4bac1b5eab 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -107,16 +107,16 @@ EXPORT_SYMBOL_GPL(tcp_vegas_init); * o min-filter RTT samples from a much longer window (forever for now) * to find the propagation delay (baseRTT) */ -void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us) +void tcp_vegas_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct vegas *vegas = inet_csk_ca(sk); u32 vrtt; - if (rtt_us < 0) + if (sample->rtt_us < 0) return; /* Never allow zero rtt or baseRTT */ - vrtt = rtt_us + 1; + vrtt = sample->rtt_us + 1; /* Filter to find propagation delay: */ if (vrtt < vegas->baseRTT) diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h index ef9da5306c68..248cfc0ff9ae 100644 --- a/net/ipv4/tcp_vegas.h +++ b/net/ipv4/tcp_vegas.h @@ -17,7 +17,7 @@ struct vegas { void tcp_vegas_init(struct sock *sk); void tcp_vegas_state(struct sock *sk, u8 ca_state); -void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); +void tcp_vegas_pkts_acked(struct sock *sk, const struct ack_sample *sample); void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); size_t tcp_vegas_get_info(struct sock *sk, u32 ext, 
int *attr, union tcp_cc_info *info); diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 0d094b995cd9..40171e163cff 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -69,16 +69,17 @@ static void tcp_veno_init(struct sock *sk) } /* Do rtt sampling needed for Veno. */ -static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us) +static void tcp_veno_pkts_acked(struct sock *sk, + const struct ack_sample *sample) { struct veno *veno = inet_csk_ca(sk); u32 vrtt; - if (rtt_us < 0) + if (sample->rtt_us < 0) return; /* Never allow zero rtt or baseRTT */ - vrtt = rtt_us + 1; + vrtt = sample->rtt_us + 1; /* Filter to find propagation delay: */ if (vrtt < veno->basertt) diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index c10732e39837..4b03a2e2a050 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -99,12 +99,13 @@ static void westwood_filter(struct westwood *w, u32 delta) * Called after processing group of packets. * but all westwood needs is the last sample of srtt. */ -static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, s32 rtt) +static void tcp_westwood_pkts_acked(struct sock *sk, + const struct ack_sample *sample) { struct westwood *w = inet_csk_ca(sk); - if (rtt > 0) - w->rtt = usecs_to_jiffies(rtt); + if (sample->rtt_us > 0) + w->rtt = usecs_to_jiffies(sample->rtt_us); } /* diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 3e6a472e6b88..028eb046ea40 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -56,15 +56,16 @@ static void tcp_yeah_init(struct sock *sk) tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); } -static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) +static void tcp_yeah_pkts_acked(struct sock *sk, + const struct ack_sample *sample) { const struct inet_connection_sock *icsk = inet_csk(sk); struct yeah *yeah = inet_csk_ca(sk); if (icsk->icsk_ca_state == TCP_CA_Open) - yeah->pkts_acked = pkts_acked; + yeah->pkts_acked = sample->pkts_acked; - tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us); + tcp_vegas_pkts_acked(sk, sample); } static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 76ea0a8be090..f67f52ba4809 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -688,7 +688,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) iph->saddr, uh->source, skb->dev->ifindex, udptable, NULL); if (!sk) { - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; /* No socket for error */ } @@ -882,13 +882,13 @@ send: err = ip_send_skb(sock_net(sk), skb); if (err) { if (err == -ENOBUFS && !inet->recverr) { - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_SNDBUFERRORS, is_udplite); + UDP_INC_STATS(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); err = 0; } } else - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_OUTDATAGRAMS, is_udplite); + UDP_INC_STATS(sock_net(sk), + UDP_MIB_OUTDATAGRAMS, is_udplite); return err; } @@ -1157,8 +1157,8 @@ out: * seems like overkill. 
*/ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_SNDBUFERRORS, is_udplite); + UDP_INC_STATS(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); } return err; @@ -1242,10 +1242,10 @@ static unsigned int first_packet_length(struct sock *sk) spin_lock_bh(&rcvq->lock); while ((skb = skb_peek(rcvq)) != NULL && udp_lib_checksum_complete(skb)) { - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, - IS_UDPLITE(sk)); - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, - IS_UDPLITE(sk)); + __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, + IS_UDPLITE(sk)); + __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, + IS_UDPLITE(sk)); atomic_inc(&sk->sk_drops); __skb_unlink(skb, rcvq); __skb_queue_tail(&list_kill, skb); @@ -1352,16 +1352,16 @@ try_again: trace_kfree_skb(skb, udp_recvmsg); if (!peeked) { atomic_inc(&sk->sk_drops); - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS(sock_net(sk), + UDP_MIB_INERRORS, is_udplite); } skb_free_datagram_locked(sk, skb); return err; } if (!peeked) - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_INDATAGRAMS, is_udplite); + UDP_INC_STATS(sock_net(sk), + UDP_MIB_INDATAGRAMS, is_udplite); sock_recv_ts_and_drops(msg, sk, skb); @@ -1386,8 +1386,8 @@ try_again: csum_copy_err: slow = lock_sock_fast(sk); if (!skb_kill_datagram(sk, skb, flags)) { - UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); - UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); + UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } unlock_sock_fast(sk, slow); @@ -1514,9 +1514,9 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, - is_udplite); - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS, + is_udplite); + UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); kfree_skb(skb); trace_udp_fail_queue_rcv_skb(rc, sk); return -1; @@ -1580,9 +1580,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) ret = encap_rcv(sk, skb); if (ret <= 0) { - UDP_INC_STATS_BH(sock_net(sk), - UDP_MIB_INDATAGRAMS, - is_udplite); + __UDP_INC_STATS(sock_net(sk), + UDP_MIB_INDATAGRAMS, + is_udplite); return -ret; } } @@ -1633,8 +1633,8 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) udp_csum_pull_header(skb); if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, - is_udplite); + __UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS, + is_udplite); goto drop; } @@ -1653,9 +1653,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return rc; csum_error: - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); + __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); kfree_skb(skb); return -1; @@ -1715,10 +1715,10 @@ start_lookup: if (unlikely(!nskb)) { atomic_inc(&sk->sk_drops); - UDP_INC_STATS_BH(net, UDP_MIB_RCVBUFERRORS, - IS_UDPLITE(sk)); - UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, - IS_UDPLITE(sk)); + __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS, + IS_UDPLITE(sk)); + __UDP_INC_STATS(net, UDP_MIB_INERRORS, + IS_UDPLITE(sk)); continue; } if (udp_queue_rcv_skb(sk, nskb) > 0) @@ 
-1736,8 +1736,8 @@ start_lookup: consume_skb(skb); } else { kfree_skb(skb); - UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, - proto == IPPROTO_UDPLITE); + __UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); } return 0; } @@ -1851,7 +1851,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp_lib_checksum_complete(skb)) goto csum_error; - UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); + __UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); /* @@ -1878,9 +1878,9 @@ csum_error: proto == IPPROTO_UDPLITE ? "Lite" : "", &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), ulen); - UDP_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); + __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); drop: - UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); + __UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); kfree_skb(skb); return 0; } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 097060def7f0..6b7459c92bb2 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -350,6 +350,11 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, uh->len = newlen; + /* Set encapsulation before calling into inner gro_complete() functions + * to make them set up the inner offsets. + */ + skb->encapsulation = 1; + rcu_read_lock(); sk = (*lookup)(skb, uh->source, uh->dest); if (sk && udp_sk(sk)->gro_complete) @@ -360,9 +365,6 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, if (skb->remcsum_offload) skb_shinfo(skb)->gso_type |= SKB_GSO_TUNNEL_REMCSUM; - skb->encapsulation = 1; - skb_set_inner_mac_header(skb, nhoff + sizeof(struct udphdr)); - return err; } EXPORT_SYMBOL(udp_gro_complete); diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 11e875ffd7ac..3f8411328de5 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -218,6 +218,7 @@ config IPV6_GRE tristate "IPv6: GRE tunnel" select IPV6_TUNNEL select NET_IP_TUNNEL + depends on NET_IPGRE_DEMUX ---help--- Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f5a77a9dd34e..47f837a58e0a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3175,35 +3175,9 @@ static void addrconf_gre_config(struct net_device *dev) } #endif -#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) -/* If the host route is cached on the addr struct make sure it is associated - * with the proper table. e.g., enslavement can change and if so the cached - * host route needs to move to the new table. - */ -static void l3mdev_check_host_rt(struct inet6_dev *idev, - struct inet6_ifaddr *ifp) -{ - if (ifp->rt) { - u32 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; - - if (tb_id != ifp->rt->rt6i_table->tb6_id) { - ip6_del_rt(ifp->rt); - ifp->rt = NULL; - } - } -} -#else -static void l3mdev_check_host_rt(struct inet6_dev *idev, - struct inet6_ifaddr *ifp) -{ -} -#endif - static int fixup_permanent_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) { - l3mdev_check_host_rt(idev, ifp); - if (!ifp->rt) { struct rt6_info *rt; @@ -3303,6 +3277,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, break; if (event == NETDEV_UP) { + /* restore routes for permanent addresses */ + addrconf_permanent_addr(dev); + if (!addrconf_qdisc_ok(dev)) { /* device is not ready yet. 
*/ pr_info("ADDRCONF(NETDEV_UP): %s: link is not ready\n", @@ -3336,9 +3313,6 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, run_pending = 1; } - /* restore routes for permanent addresses */ - addrconf_permanent_addr(dev); - switch (dev->type) { #if IS_ENABLED(CONFIG_IPV6_SIT) case ARPHRD_SIT: @@ -3555,6 +3529,8 @@ restart: INIT_LIST_HEAD(&del_list); list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { + struct rt6_info *rt = NULL; + addrconf_del_dad_work(ifa); write_unlock_bh(&idev->lock); @@ -3567,6 +3543,9 @@ restart: ifa->state = 0; if (!(ifa->flags & IFA_F_NODAD)) ifa->flags |= IFA_F_TENTATIVE; + + rt = ifa->rt; + ifa->rt = NULL; } else { state = ifa->state; ifa->state = INET6_IFADDR_STATE_DEAD; @@ -3577,6 +3556,9 @@ restart: spin_unlock_bh(&ifa->lock); + if (rt) + ip6_del_rt(rt); + if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); inet6addr_notifier_call_chain(NETDEV_DOWN, ifa); @@ -5344,10 +5326,10 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) if (rt) ip6_del_rt(rt); } - dst_hold(&ifp->rt->dst); - - ip6_del_rt(ifp->rt); - + if (ifp->rt) { + dst_hold(&ifp->rt->dst); + ip6_del_rt(ifp->rt); + } rt_genid_bump_ipv6(net); break; } diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index ea9ee5cce5cf..00d0c2903173 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -727,14 +727,13 @@ EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl); int ip6_datagram_send_ctl(struct net *net, struct sock *sk, struct msghdr *msg, struct flowi6 *fl6, - struct ipv6_txoptions *opt, - int *hlimit, int *tclass, int *dontfrag, - struct sockcm_cookie *sockc) + struct ipcm6_cookie *ipc6, struct sockcm_cookie *sockc) { struct in6_pktinfo *src_info; struct cmsghdr *cmsg; struct ipv6_rt_hdr *rthdr; struct ipv6_opt_hdr *hdr; + struct ipv6_txoptions *opt = ipc6->opt; int len; int err = 0; @@ -953,8 +952,8 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, goto exit_f; } - *hlimit = *(int *)CMSG_DATA(cmsg); - if (*hlimit < -1 || *hlimit > 0xff) { + ipc6->hlimit = *(int *)CMSG_DATA(cmsg); + if (ipc6->hlimit < -1 || ipc6->hlimit > 0xff) { err = -EINVAL; goto exit_f; } @@ -974,7 +973,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, goto exit_f; err = 0; - *tclass = tc; + ipc6->tclass = tc; break; } @@ -992,7 +991,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, goto exit_f; err = 0; - *dontfrag = df; + ipc6->dontfrag = df; break; } diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index ea7c4d64a00a..8de5dd7aaa05 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -258,8 +258,8 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - IP6_INC_STATS_BH(dev_net(dst->dev), ip6_dst_idev(dst), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), + IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } @@ -280,8 +280,8 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) return 1; } - IP6_INC_STATS_BH(dev_net(dst->dev), - ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); return -1; } @@ -309,8 +309,8 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - IP6_INC_STATS_BH(net, 
ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } @@ -319,8 +319,8 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) || skb->pkt_type != PACKET_HOST) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -334,8 +334,8 @@ looped_back: * processed by own */ if (!addr) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -360,8 +360,8 @@ looped_back: goto unknown_rh; /* Silently discard invalid RTH type 2 */ if (hdr->hdrlen != 2 || hdr->segments_left != 1) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } @@ -379,8 +379,8 @@ looped_back: n = hdr->hdrlen >> 1; if (hdr->segments_left > n) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); @@ -393,8 +393,8 @@ looped_back: if (skb_cloned(skb)) { /* the copy is a forwarded packet */ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_OUTDISCARDS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -1; } @@ -416,14 +416,14 @@ looped_back: if (xfrm6_input_addr(skb, (xfrm_address_t *)addr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, IPPROTO_ROUTING) < 0) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -434,8 +434,8 @@ looped_back: } if (ipv6_addr_is_multicast(addr)) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -454,8 +454,8 @@ looped_back: if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); @@ -470,7 +470,7 @@ looped_back: return -1; unknown_rh: - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->type) - skb_network_header(skb)); return -1; @@ -568,28 +568,28 @@ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", nh[optoff+1]); - IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), - 
IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INHDRERRORS); goto drop; } pkt_len = ntohl(*(__be32 *)(nh + optoff + 2)); if (pkt_len <= IPV6_MAXPLEN) { - IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2); return false; } if (ipv6_hdr(skb)->payload_len) { - IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff); return false; } if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { - IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INTRUNCATEDPKTS); + __IP6_INC_STATS(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 6b573ebe49de..4527285fcaa2 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -401,10 +401,10 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) struct flowi6 fl6; struct icmpv6_msg msg; struct sockcm_cookie sockc_unused = {0}; + struct ipcm6_cookie ipc6; int iif = 0; int addr_type = 0; int len; - int hlimit; int err = 0; u32 mark = IP6_REPLY_MARK(net, skb->mark); @@ -446,6 +446,8 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) if (__ipv6_addr_needs_scope_id(addr_type)) iif = skb->dev->ifindex; + else + iif = l3mdev_master_ifindex(skb->dev); /* * Must not send error if the source does not uniquely @@ -500,14 +502,14 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - if (!fl6.flowi6_oif) - fl6.flowi6_oif = l3mdev_master_ifindex(skb->dev); - dst = icmpv6_route_lookup(net, skb, sk, &fl6); if (IS_ERR(dst)) goto out; - hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + ipc6.tclass = np->tclass; + ipc6.dontfrag = np->dontfrag; + ipc6.opt = NULL; msg.skb = skb; msg.offset = skb_network_offset(skb); @@ -526,9 +528,9 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) err = ip6_append_data(sk, icmpv6_getfrag, &msg, len + sizeof(struct icmp6hdr), - sizeof(struct icmp6hdr), hlimit, - np->tclass, NULL, &fl6, (struct rt6_info *)dst, - MSG_DONTWAIT, np->dontfrag, &sockc_unused); + sizeof(struct icmp6hdr), + &ipc6, &fl6, (struct rt6_info *)dst, + MSG_DONTWAIT, &sockc_unused); if (err) { ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); @@ -563,9 +565,8 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct flowi6 fl6; struct icmpv6_msg msg; struct dst_entry *dst; + struct ipcm6_cookie ipc6; int err = 0; - int hlimit; - u8 tclass; u32 mark = IP6_REPLY_MARK(net, skb->mark); struct sockcm_cookie sockc_unused = {0}; @@ -607,22 +608,24 @@ static void icmpv6_echo_reply(struct sk_buff *skb) if (IS_ERR(dst)) goto out; - hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - idev = __in6_dev_get(skb->dev); msg.skb = skb; msg.offset = 0; msg.type = ICMPV6_ECHO_REPLY; - tclass = ipv6_get_dsfield(ipv6_hdr(skb)); + ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb)); + ipc6.dontfrag = np->dontfrag; + ipc6.opt = NULL; + err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), - sizeof(struct icmp6hdr), hlimit, tclass, NULL, &fl6, + sizeof(struct icmp6hdr), &ipc6, &fl6, (struct rt6_info *)dst, MSG_DONTWAIT, - 
np->dontfrag, &sockc_unused); + &sockc_unused); if (err) { - ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTERRORS); + __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, @@ -674,7 +677,7 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) return; out: - ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); } /* @@ -710,7 +713,7 @@ static int icmpv6_rcv(struct sk_buff *skb) skb_set_network_header(skb, nh); } - ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INMSGS); + __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INMSGS); saddr = &ipv6_hdr(skb)->saddr; daddr = &ipv6_hdr(skb)->daddr; @@ -728,7 +731,7 @@ static int icmpv6_rcv(struct sk_buff *skb) type = hdr->icmp6_type; - ICMP6MSGIN_INC_STATS_BH(dev_net(dev), idev, type); + ICMP6MSGIN_INC_STATS(dev_net(dev), idev, type); switch (type) { case ICMPV6_ECHO_REQUEST: @@ -812,9 +815,9 @@ static int icmpv6_rcv(struct sk_buff *skb) return 0; csum_error: - ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS); + __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS); discard_it: - ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INERRORS); drop_no_count: kfree_skb(skb); return 0; diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index 28542cb2b387..d08fd2d48a78 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -23,10 +23,76 @@ #include <net/protocol.h> #include <uapi/linux/ila.h> +struct ila_locator { + union { + __u8 v8[8]; + __be16 v16[4]; + __be32 v32[2]; + __be64 v64; + }; +}; + +struct ila_identifier { + union { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 __space:4; + u8 csum_neutral:1; + u8 type:3; +#elif defined(__BIG_ENDIAN_BITFIELD) + u8 type:3; + u8 csum_neutral:1; + u8 __space:4; +#else +#error "Adjust your <asm/byteorder.h> defines" +#endif + u8 __space2[7]; + }; + __u8 v8[8]; + __be16 v16[4]; + __be32 v32[2]; + __be64 v64; + }; +}; + +enum { + ILA_ATYPE_IID = 0, + ILA_ATYPE_LUID, + ILA_ATYPE_VIRT_V4, + ILA_ATYPE_VIRT_UNI_V6, + ILA_ATYPE_VIRT_MULTI_V6, + ILA_ATYPE_RSVD_1, + ILA_ATYPE_RSVD_2, + ILA_ATYPE_RSVD_3, +}; + +#define CSUM_NEUTRAL_FLAG htonl(0x10000000) + +struct ila_addr { + union { + struct in6_addr addr; + struct { + struct ila_locator loc; + struct ila_identifier ident; + }; + }; +}; + +static inline struct ila_addr *ila_a2i(struct in6_addr *addr) +{ + return (struct ila_addr *)addr; +} + +static inline bool ila_addr_is_ila(struct ila_addr *iaddr) +{ + return (iaddr->ident.type != ILA_ATYPE_IID); +} + struct ila_params { - __be64 locator; - __be64 locator_match; + struct ila_locator locator; + struct ila_locator locator_match; __wsum csum_diff; + u8 csum_mode; }; static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to) @@ -38,7 +104,14 @@ static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to) return csum_partial(diff, sizeof(diff), 0); } -void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p); +static inline bool ila_csum_neutral_set(struct ila_identifier ident) +{ + return !!(ident.csum_neutral); +} + +void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p); + +void ila_init_saved_csum(struct ila_params *p); int ila_lwt_init(void); void ila_lwt_fini(void); diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index 30613050e4ca..0e94042d1289 100644 
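The ila.h hunk above replaces the bare __be64 identifier/locator pair with explicit ila_locator, ila_identifier and ila_addr overlays, so the code in this file (and ila_xlat_addr() further down) can test the address type with ila_addr_is_ila() before rewriting the locator. The standalone sketch below only illustrates that overlay idea; the structures are simplified local re-declarations, the checksum-neutral handling is omitted, and none of it is kernel code from the patch.

/* Minimal userspace sketch of the ila_addr overlay: the first 64 bits of
 * an IPv6 address are treated as the locator, the last 64 bits as the
 * identifier whose top three bits carry the address type.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct ila_addr_sketch {
	uint8_t loc[8];		/* locator: upper 64 bits of the address */
	uint8_t ident[8];	/* identifier: type lives in bits 7..5 of ident[0] */
};

#define ILA_ATYPE_IID 0		/* plain interface identifier, i.e. not an ILA address */

static int addr_is_ila(const struct ila_addr_sketch *ia)
{
	return (ia->ident[0] >> 5) != ILA_ATYPE_IID;	/* mirrors ila_addr_is_ila() */
}

int main(void)
{
	/* A SIR-addressed destination: 2001:db8::/64 locator, identifier
	 * with type = 1 (LUID), so it qualifies for translation. */
	uint8_t daddr[16] = { 0x20, 0x01, 0x0d, 0xb8, 0, 0, 0, 0,
			      0x20, 0x00, 0x00, 0x00, 0, 0, 0, 0x01 };
	const uint8_t new_loc[8] = { 0xfd, 0x00, 0, 0, 0, 0, 0, 0x01 };
	struct ila_addr_sketch ia;

	memcpy(&ia, daddr, sizeof(ia));
	if (addr_is_ila(&ia))				/* same check ila_xlat_addr() does */
		memcpy(ia.loc, new_loc, sizeof(new_loc));	/* like iaddr->loc = p->locator */
	memcpy(daddr, &ia, sizeof(ia));

	for (int i = 0; i < 16; i++)
		printf("%02x%s", daddr[i], (i & 1) ? " " : "");
	printf("\n");
	return 0;
}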
--- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -15,20 +15,52 @@ static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) { - if (*(__be64 *)&ip6h->daddr == p->locator_match) + struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); + + if (p->locator_match.v64) return p->csum_diff; else - return compute_csum_diff8((__be32 *)&ip6h->daddr, + return compute_csum_diff8((__be32 *)&iaddr->loc, + (__be32 *)&p->locator); +} + +static void ila_csum_do_neutral(struct ila_addr *iaddr, + struct ila_params *p) +{ + __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3]; + __wsum diff, fval; + + /* Check if checksum adjust value has been cached */ + if (p->locator_match.v64) { + diff = p->csum_diff; + } else { + diff = compute_csum_diff8((__be32 *)iaddr, (__be32 *)&p->locator); + } + + fval = (__force __wsum)(ila_csum_neutral_set(iaddr->ident) ? + ~CSUM_NEUTRAL_FLAG : CSUM_NEUTRAL_FLAG); + + diff = csum_add(diff, fval); + + *adjust = ~csum_fold(csum_add(diff, csum_unfold(*adjust))); + + /* Flip the csum-neutral bit. Either we are doing a SIR->ILA + * translation with ILA_CSUM_NEUTRAL_MAP as the csum_method + * and the C-bit is not set, or we are doing an ILA-SIR + * tranlsation and the C-bit is set. + */ + iaddr->ident.csum_neutral ^= 1; } -void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) +static void ila_csum_adjust_transport(struct sk_buff *skb, + struct ila_params *p) { __wsum diff; struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); size_t nhoff = sizeof(struct ipv6hdr); - /* First update checksum */ switch (ip6h->nexthdr) { case NEXTHDR_TCP: if (likely(pskb_may_pull(skb, nhoff + sizeof(struct tcphdr)))) { @@ -68,7 +100,46 @@ void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) } /* Now change destination address */ - *(__be64 *)&ip6h->daddr = p->locator; + iaddr->loc = p->locator; +} + +void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) +{ + struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); + + /* First deal with the transport checksum */ + if (ila_csum_neutral_set(iaddr->ident)) { + /* C-bit is set in the locator indicating that this + * is a locator being translated to a SIR address. + * Perform (receiver) checksum-neutral translation. 
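 *
 * (A rough gloss of what ila_csum_do_neutral() above computes, paraphrased
 *  from the code rather than quoted from the patch: the value folded into
 *  the last 16 bits of the identifier is approximately
 *      adjust' = ~csum_fold(locator_diff + flag + csum_unfold(adjust))
 *  where locator_diff is the checksum difference between the old and new
 *  locator and flag adds or removes CSUM_NEUTRAL_FLAG depending on whether
 *  the C-bit is being set or cleared, so transport checksums stay valid
 *  without touching the payload.)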
+ */ + ila_csum_do_neutral(iaddr, p); + } else { + switch (p->csum_mode) { + case ILA_CSUM_ADJUST_TRANSPORT: + ila_csum_adjust_transport(skb, p); + break; + case ILA_CSUM_NEUTRAL_MAP: + ila_csum_do_neutral(iaddr, p); + break; + case ILA_CSUM_NO_ACTION: + break; + } + } + + /* Now change destination address */ + iaddr->loc = p->locator; +} + +void ila_init_saved_csum(struct ila_params *p) +{ + if (!p->locator_match.v64) + return; + + p->csum_diff = compute_csum_diff8( + (__be32 *)&p->locator_match, + (__be32 *)&p->locator); } static int __init ila_init(void) diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 9db3621b2126..1dfb64166d7d 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -26,7 +26,7 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); + ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); return dst->lwtstate->orig_output(net, sk, skb); @@ -42,7 +42,7 @@ static int ila_input(struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); + ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); return dst->lwtstate->orig_input(skb); @@ -53,6 +53,7 @@ drop: static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, + [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, }; static int ila_build_state(struct net_device *dev, struct nlattr *nla, @@ -64,11 +65,28 @@ static int ila_build_state(struct net_device *dev, struct nlattr *nla, size_t encap_len = sizeof(*p); struct lwtunnel_state *newts; const struct fib6_config *cfg6 = cfg; + struct ila_addr *iaddr; int ret; if (family != AF_INET6) return -EINVAL; + if (cfg6->fc_dst_len < sizeof(struct ila_locator) + 1) { + /* Need to have full locator and at least type field + * included in destination + */ + return -EINVAL; + } + + iaddr = (struct ila_addr *)&cfg6->fc_dst; + + if (!ila_addr_is_ila(iaddr) || ila_csum_neutral_set(iaddr->ident)) { + /* Don't allow translation for a non-ILA address or checksum + * neutral flag to be set. + */ + return -EINVAL; + } + ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy); if (ret < 0) @@ -84,16 +102,19 @@ static int ila_build_state(struct net_device *dev, struct nlattr *nla, newts->len = encap_len; p = ila_params_lwtunnel(newts); - p->locator = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); + p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); - if (cfg6->fc_dst_len > sizeof(__be64)) { - /* Precompute checksum difference for translation since we - * know both the old locator and the new one. - */ - p->locator_match = *(__be64 *)&cfg6->fc_dst; - p->csum_diff = compute_csum_diff8( - (__be32 *)&p->locator_match, (__be32 *)&p->locator); - } + /* Precompute checksum difference for translation since we + * know both the old locator and the new one. 
+ */ + p->locator_match = iaddr->loc; + p->csum_diff = compute_csum_diff8( + (__be32 *)&p->locator_match, (__be32 *)&p->locator); + + if (tb[ILA_ATTR_CSUM_MODE]) + p->csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); + + ila_init_saved_csum(p); newts->type = LWTUNNEL_ENCAP_ILA; newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT | @@ -109,9 +130,11 @@ static int ila_fill_encap_info(struct sk_buff *skb, { struct ila_params *p = ila_params_lwtunnel(lwtstate); - if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator, + if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator.v64, ILA_ATTR_PAD)) goto nla_put_failure; + if (nla_put_u8(skb, ILA_ATTR_CSUM_MODE, (__force u8)p->csum_mode)) + goto nla_put_failure; return 0; @@ -121,8 +144,9 @@ nla_put_failure: static int ila_encap_nlsize(struct lwtunnel_state *lwtstate) { - /* No encapsulation overhead */ - return 0; + return nla_total_size_64bit(sizeof(u64)) + /* ILA_ATTR_LOCATOR */ + nla_total_size(sizeof(u8)) + /* ILA_ATTR_CSUM_MODE */ + 0; } static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) @@ -130,7 +154,7 @@ static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) struct ila_params *a_p = ila_params_lwtunnel(a); struct ila_params *b_p = ila_params_lwtunnel(b); - return (a_p->locator != b_p->locator); + return (a_p->locator.v64 != b_p->locator.v64); } static const struct lwtunnel_encap_ops ila_encap_ops = { diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 0e9e579410da..a90e57229c6c 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -11,13 +11,11 @@ struct ila_xlat_params { struct ila_params ip; - __be64 identifier; int ifindex; - unsigned int dir; }; struct ila_map { - struct ila_xlat_params p; + struct ila_xlat_params xp; struct rhash_head node; struct ila_map __rcu *next; struct rcu_head rcu; @@ -66,31 +64,29 @@ static __always_inline void __ila_hash_secret_init(void) net_get_random_once(&hashrnd, sizeof(hashrnd)); } -static inline u32 ila_identifier_hash(__be64 identifier) +static inline u32 ila_locator_hash(struct ila_locator loc) { - u32 *v = (u32 *)&identifier; + u32 *v = (u32 *)loc.v32; return jhash_2words(v[0], v[1], hashrnd); } -static inline spinlock_t *ila_get_lock(struct ila_net *ilan, __be64 identifier) +static inline spinlock_t *ila_get_lock(struct ila_net *ilan, + struct ila_locator loc) { - return &ilan->locks[ila_identifier_hash(identifier) & ilan->locks_mask]; + return &ilan->locks[ila_locator_hash(loc) & ilan->locks_mask]; } -static inline int ila_cmp_wildcards(struct ila_map *ila, __be64 loc, - int ifindex, unsigned int dir) +static inline int ila_cmp_wildcards(struct ila_map *ila, + struct ila_addr *iaddr, int ifindex) { - return (ila->p.ip.locator_match && ila->p.ip.locator_match != loc) || - (ila->p.ifindex && ila->p.ifindex != ifindex) || - !(ila->p.dir & dir); + return (ila->xp.ifindex && ila->xp.ifindex != ifindex); } -static inline int ila_cmp_params(struct ila_map *ila, struct ila_xlat_params *p) +static inline int ila_cmp_params(struct ila_map *ila, + struct ila_xlat_params *xp) { - return (ila->p.ip.locator_match != p->ip.locator_match) || - (ila->p.ifindex != p->ifindex) || - (ila->p.dir != p->dir); + return (ila->xp.ifindex != xp->ifindex); } static int ila_cmpfn(struct rhashtable_compare_arg *arg, @@ -98,17 +94,14 @@ static int ila_cmpfn(struct rhashtable_compare_arg *arg, { const struct ila_map *ila = obj; - return (ila->p.identifier != *(__be64 *)arg->key); + return (ila->xp.ip.locator_match.v64 != *(__be64 
*)arg->key); } static inline int ila_order(struct ila_map *ila) { int score = 0; - if (ila->p.ip.locator_match) - score += 1 << 0; - - if (ila->p.ifindex) + if (ila->xp.ifindex) score += 1 << 1; return score; @@ -117,7 +110,7 @@ static inline int ila_order(struct ila_map *ila) static const struct rhashtable_params rht_params = { .nelem_hint = 1024, .head_offset = offsetof(struct ila_map, node), - .key_offset = offsetof(struct ila_map, p.identifier), + .key_offset = offsetof(struct ila_map, xp.ip.locator_match), .key_len = sizeof(u64), /* identifier */ .max_size = 1048576, .min_size = 256, @@ -136,50 +129,45 @@ static struct genl_family ila_nl_family = { }; static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { - [ILA_ATTR_IDENTIFIER] = { .type = NLA_U64, }, [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, - [ILA_ATTR_DIR] = { .type = NLA_U32, }, + [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, }; static int parse_nl_config(struct genl_info *info, - struct ila_xlat_params *p) + struct ila_xlat_params *xp) { - memset(p, 0, sizeof(*p)); - - if (info->attrs[ILA_ATTR_IDENTIFIER]) - p->identifier = (__force __be64)nla_get_u64( - info->attrs[ILA_ATTR_IDENTIFIER]); + memset(xp, 0, sizeof(*xp)); if (info->attrs[ILA_ATTR_LOCATOR]) - p->ip.locator = (__force __be64)nla_get_u64( + xp->ip.locator.v64 = (__force __be64)nla_get_u64( info->attrs[ILA_ATTR_LOCATOR]); if (info->attrs[ILA_ATTR_LOCATOR_MATCH]) - p->ip.locator_match = (__force __be64)nla_get_u64( + xp->ip.locator_match.v64 = (__force __be64)nla_get_u64( info->attrs[ILA_ATTR_LOCATOR_MATCH]); - if (info->attrs[ILA_ATTR_IFINDEX]) - p->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); + if (info->attrs[ILA_ATTR_CSUM_MODE]) + xp->ip.csum_mode = nla_get_u8(info->attrs[ILA_ATTR_CSUM_MODE]); - if (info->attrs[ILA_ATTR_DIR]) - p->dir = nla_get_u32(info->attrs[ILA_ATTR_DIR]); + if (info->attrs[ILA_ATTR_IFINDEX]) + xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); return 0; } /* Must be called with rcu readlock */ -static inline struct ila_map *ila_lookup_wildcards(__be64 id, __be64 loc, +static inline struct ila_map *ila_lookup_wildcards(struct ila_addr *iaddr, int ifindex, - unsigned int dir, struct ila_net *ilan) { struct ila_map *ila; - ila = rhashtable_lookup_fast(&ilan->rhash_table, &id, rht_params); + ila = rhashtable_lookup_fast(&ilan->rhash_table, &iaddr->loc, + rht_params); while (ila) { - if (!ila_cmp_wildcards(ila, loc, ifindex, dir)) + if (!ila_cmp_wildcards(ila, iaddr, ifindex)) return ila; ila = rcu_access_pointer(ila->next); } @@ -188,15 +176,16 @@ static inline struct ila_map *ila_lookup_wildcards(__be64 id, __be64 loc, } /* Must be called with rcu readlock */ -static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *p, +static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *xp, struct ila_net *ilan) { struct ila_map *ila; - ila = rhashtable_lookup_fast(&ilan->rhash_table, &p->identifier, + ila = rhashtable_lookup_fast(&ilan->rhash_table, + &xp->ip.locator_match, rht_params); while (ila) { - if (!ila_cmp_params(ila, p)) + if (!ila_cmp_params(ila, xp)) return ila; ila = rcu_access_pointer(ila->next); } @@ -221,14 +210,14 @@ static void ila_free_cb(void *ptr, void *arg) } } -static int ila_xlat_addr(struct sk_buff *skb, int dir); +static int ila_xlat_addr(struct sk_buff *skb); static unsigned int ila_nf_input(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - ila_xlat_addr(skb, 
ILA_DIR_IN); + ila_xlat_addr(skb); return NF_ACCEPT; } @@ -241,11 +230,11 @@ static struct nf_hook_ops ila_nf_hook_ops[] __read_mostly = { }, }; -static int ila_add_mapping(struct net *net, struct ila_xlat_params *p) +static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_map *ila, *head; - spinlock_t *lock = ila_get_lock(ilan, p->identifier); + spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = 0, order; if (!ilan->hooks_registered) { @@ -264,22 +253,16 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *p) if (!ila) return -ENOMEM; - ila->p = *p; + ila_init_saved_csum(&xp->ip); - if (p->ip.locator_match) { - /* Precompute checksum difference for translation since we - * know both the old identifier and the new one. - */ - ila->p.ip.csum_diff = compute_csum_diff8( - (__be32 *)&p->ip.locator_match, - (__be32 *)&p->ip.locator); - } + ila->xp = *xp; order = ila_order(ila); spin_lock(lock); - head = rhashtable_lookup_fast(&ilan->rhash_table, &p->identifier, + head = rhashtable_lookup_fast(&ilan->rhash_table, + &xp->ip.locator_match, rht_params); if (!head) { /* New entry for the rhash_table */ @@ -289,7 +272,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *p) struct ila_map *tila = head, *prev = NULL; do { - if (!ila_cmp_params(tila, p)) { + if (!ila_cmp_params(tila, xp)) { err = -EEXIST; goto out; } @@ -326,23 +309,23 @@ out: return err; } -static int ila_del_mapping(struct net *net, struct ila_xlat_params *p) +static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_map *ila, *head, *prev; - spinlock_t *lock = ila_get_lock(ilan, p->identifier); + spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = -ENOENT; spin_lock(lock); head = rhashtable_lookup_fast(&ilan->rhash_table, - &p->identifier, rht_params); + &xp->ip.locator_match, rht_params); ila = head; prev = NULL; while (ila) { - if (ila_cmp_params(ila, p)) { + if (ila_cmp_params(ila, xp)) { prev = ila; ila = rcu_dereference_protected(ila->next, lockdep_is_held(lock)); @@ -404,31 +387,28 @@ static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); - struct ila_xlat_params p; + struct ila_xlat_params xp; int err; - err = parse_nl_config(info, &p); + err = parse_nl_config(info, &xp); if (err) return err; - ila_del_mapping(net, &p); + ila_del_mapping(net, &xp); return 0; } static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) { - if (nla_put_u64_64bit(msg, ILA_ATTR_IDENTIFIER, - (__force u64)ila->p.identifier, - ILA_ATTR_PAD) || - nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR, - (__force u64)ila->p.ip.locator, + if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR, + (__force u64)ila->xp.ip.locator.v64, ILA_ATTR_PAD) || nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR_MATCH, - (__force u64)ila->p.ip.locator_match, + (__force u64)ila->xp.ip.locator_match.v64, ILA_ATTR_PAD) || - nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->p.ifindex) || - nla_put_u32(msg, ILA_ATTR_DIR, ila->p.dir)) + nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->xp.ifindex) || + nla_put_u32(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode)) return -1; return 0; @@ -460,11 +440,11 @@ static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) struct net *net = genl_info_net(info); struct ila_net *ilan = 
net_generic(net, ila_net_id); struct sk_buff *msg; - struct ila_xlat_params p; + struct ila_xlat_params xp; struct ila_map *ila; int ret; - ret = parse_nl_config(info, &p); + ret = parse_nl_config(info, &xp); if (ret) return ret; @@ -474,7 +454,7 @@ static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) rcu_read_lock(); - ila = ila_lookup_by_params(&p, ilan); + ila = ila_lookup_by_params(&xp, ilan); if (ila) { ret = ila_dump_info(ila, info->snd_portid, @@ -617,45 +597,32 @@ static struct pernet_operations ila_net_ops = { .size = sizeof(struct ila_net), }; -static int ila_xlat_addr(struct sk_buff *skb, int dir) +static int ila_xlat_addr(struct sk_buff *skb) { struct ila_map *ila; struct ipv6hdr *ip6h = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); struct ila_net *ilan = net_generic(net, ila_net_id); - __be64 identifier, locator_match; - size_t nhoff; + struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); /* Assumes skb contains a valid IPv6 header that is pulled */ - identifier = *(__be64 *)&ip6h->daddr.in6_u.u6_addr8[8]; - locator_match = *(__be64 *)&ip6h->daddr.in6_u.u6_addr8[0]; - nhoff = sizeof(struct ipv6hdr); + if (!ila_addr_is_ila(iaddr)) { + /* Type indicates this is not an ILA address */ + return 0; + } rcu_read_lock(); - ila = ila_lookup_wildcards(identifier, locator_match, - skb->dev->ifindex, dir, ilan); + ila = ila_lookup_wildcards(iaddr, skb->dev->ifindex, ilan); if (ila) - update_ipv6_locator(skb, &ila->p.ip); + ila_update_ipv6_locator(skb, &ila->xp.ip); rcu_read_unlock(); return 0; } -int ila_xlat_incoming(struct sk_buff *skb) -{ - return ila_xlat_addr(skb, ILA_DIR_IN); -} -EXPORT_SYMBOL(ila_xlat_incoming); - -int ila_xlat_outgoing(struct sk_buff *skb) -{ - return ila_xlat_addr(skb, ILA_DIR_OUT); -} -EXPORT_SYMBOL(ila_xlat_outgoing); - int ila_xlat_init(void) { int ret; diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index f1678388fb0d..00cf28ad4565 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -222,7 +222,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { sk_nulls_del_node_init_rcu((struct sock *)tw); - NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); + __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ea071fad67a0..1bcef2369d64 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -240,6 +240,7 @@ struct fib6_table *fib6_new_table(struct net *net, u32 id) return tb; } +EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 35d3ddc328f8..b912f0dbaf72 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -373,7 +373,7 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, struct msghdr msg; struct flowi6 flowi6; struct sockcm_cookie sockc_junk; - int junk; + struct ipcm6_cookie ipc6; err = -ENOMEM; fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL); @@ -390,8 +390,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, msg.msg_control = (void *)(fl->opt+1); memset(&flowi6, 0, sizeof(flowi6)); - err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, fl->opt, - &junk, &junk, &junk, &sockc_junk); + ipc6.opt = fl->opt; + err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6, &sockc_junk); if (err) 
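/* (Illustrative aside, not part of the patch: fl_create() only needs to
 * fill ipc6.opt before calling ip6_datagram_send_ctl(); the remaining
 * ipcm6_cookie fields are written by the cmsg parser when the matching
 * ancillary data is present and are otherwise unused here.  Callers that
 * go on to transmit fill the whole cookie, as icmp6_send() does earlier
 * in this patch:
 *
 *	struct ipcm6_cookie ipc6;
 *
 *	ipc6.hlimit   = ip6_sk_dst_hoplimit(np, &fl6, dst);
 *	ipc6.tclass   = np->tclass;
 *	ipc6.dontfrag = np->dontfrag;
 *	ipc6.opt      = NULL;
 *
 * before passing &ipc6 to ip6_append_data().) */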
goto done; err = -EINVAL; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index ca5a2c5675c5..ee62ec469ab3 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -54,6 +54,7 @@ #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/ip6_tunnel.h> +#include <net/gre.h> static bool log_ecn_error = true; @@ -342,7 +343,7 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, goto failed_free; /* Can use a lockless transmit, unless we generate output sequences */ - if (!(nt->parms.o_flags & GRE_SEQ)) + if (!(nt->parms.o_flags & TUNNEL_SEQ)) dev->features |= NETIF_F_LLTX; dev_hold(dev); @@ -443,137 +444,41 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, t->err_time = jiffies; } -static int ip6gre_rcv(struct sk_buff *skb) +static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { const struct ipv6hdr *ipv6h; - u8 *h; - __be16 flags; - __sum16 csum = 0; - __be32 key = 0; - u32 seqno = 0; struct ip6_tnl *tunnel; - int offset = 4; - __be16 gre_proto; - int err; - - if (!pskb_may_pull(skb, sizeof(struct in6_addr))) - goto drop; ipv6h = ipv6_hdr(skb); - h = skb->data; - flags = *(__be16 *)h; - - if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { - /* - Version must be 0. - - We do not support routing headers. - */ - if (flags&(GRE_VERSION|GRE_ROUTING)) - goto drop; - - if (flags&GRE_CSUM) { - csum = skb_checksum_simple_validate(skb); - offset += 4; - } - if (flags&GRE_KEY) { - key = *(__be32 *)(h + offset); - offset += 4; - } - if (flags&GRE_SEQ) { - seqno = ntohl(*(__be32 *)(h + offset)); - offset += 4; - } - } - - gre_proto = *(__be16 *)(h + 2); - tunnel = ip6gre_tunnel_lookup(skb->dev, - &ipv6h->saddr, &ipv6h->daddr, key, - gre_proto); + &ipv6h->saddr, &ipv6h->daddr, tpi->key, + tpi->proto); if (tunnel) { - struct pcpu_sw_netstats *tstats; + ip6_tnl_rcv(tunnel, skb, tpi, NULL, false); - if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto drop; - - if (!ip6_tnl_rcv_ctl(tunnel, &ipv6h->daddr, &ipv6h->saddr)) { - tunnel->dev->stats.rx_dropped++; - goto drop; - } - - skb->protocol = gre_proto; - /* WCCP version 1 and 2 protocol decoding. - * - Change protocol to IPv6 - * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header - */ - if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) { - skb->protocol = htons(ETH_P_IPV6); - if ((*(h + offset) & 0xF0) != 0x40) - offset += 4; - } - - skb->mac_header = skb->network_header; - __pskb_pull(skb, offset); - skb_postpull_rcsum(skb, skb_transport_header(skb), offset); - - if (((flags&GRE_CSUM) && csum) || - (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { - tunnel->dev->stats.rx_crc_errors++; - tunnel->dev->stats.rx_errors++; - goto drop; - } - if (tunnel->parms.i_flags&GRE_SEQ) { - if (!(flags&GRE_SEQ) || - (tunnel->i_seqno && - (s32)(seqno - tunnel->i_seqno) < 0)) { - tunnel->dev->stats.rx_fifo_errors++; - tunnel->dev->stats.rx_errors++; - goto drop; - } - tunnel->i_seqno = seqno + 1; - } - - /* Warning: All skb pointers will be invalidated! 
*/ - if (tunnel->dev->type == ARPHRD_ETHER) { - if (!pskb_may_pull(skb, ETH_HLEN)) { - tunnel->dev->stats.rx_length_errors++; - tunnel->dev->stats.rx_errors++; - goto drop; - } - - ipv6h = ipv6_hdr(skb); - skb->protocol = eth_type_trans(skb, tunnel->dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - } - - __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); + return PACKET_RCVD; + } - skb_reset_network_header(skb); + return PACKET_REJECT; +} - err = IP6_ECN_decapsulate(ipv6h, skb); - if (unlikely(err)) { - if (log_ecn_error) - net_info_ratelimited("non-ECT from %pI6 with dsfield=%#x\n", - &ipv6h->saddr, - ipv6_get_dsfield(ipv6h)); - if (err > 1) { - ++tunnel->dev->stats.rx_frame_errors; - ++tunnel->dev->stats.rx_errors; - goto drop; - } - } +static int gre_rcv(struct sk_buff *skb) +{ + struct tnl_ptk_info tpi; + bool csum_err = false; + int hdr_len; - tstats = this_cpu_ptr(tunnel->dev->tstats); - u64_stats_update_begin(&tstats->syncp); - tstats->rx_packets++; - tstats->rx_bytes += skb->len; - u64_stats_update_end(&tstats->syncp); + hdr_len = gre_parse_header(skb, &tpi, &csum_err); + if (hdr_len < 0) + goto drop; - netif_rx(skb); + if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false)) + goto drop; + if (ip6gre_rcv(skb, &tpi) == PACKET_RCVD) return 0; - } - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; @@ -584,199 +489,40 @@ struct ipv6_tel_txoption { __u8 dst_opt[8]; }; -static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) +static int gre_handle_offloads(struct sk_buff *skb, bool csum) { - memset(opt, 0, sizeof(struct ipv6_tel_txoption)); - - opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT; - opt->dst_opt[3] = 1; - opt->dst_opt[4] = encap_limit; - opt->dst_opt[5] = IPV6_TLV_PADN; - opt->dst_opt[6] = 1; - - opt->ops.dst0opt = (struct ipv6_opt_hdr *) opt->dst_opt; - opt->ops.opt_nflen = 8; + return iptunnel_handle_offloads(skb, + csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } -static __sum16 gre6_checksum(struct sk_buff *skb) -{ - __wsum csum; - - if (skb->ip_summed == CHECKSUM_PARTIAL) - csum = lco_csum(skb); - else - csum = skb_checksum(skb, sizeof(struct ipv6hdr), - skb->len - sizeof(struct ipv6hdr), 0); - return csum_fold(csum); -} - -static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, - struct net_device *dev, - __u8 dsfield, - struct flowi6 *fl6, - int encap_limit, - __u32 *pmtu) +static netdev_tx_t __gre6_xmit(struct sk_buff *skb, + struct net_device *dev, __u8 dsfield, + struct flowi6 *fl6, int encap_limit, + __u32 *pmtu, __be16 proto) { struct ip6_tnl *tunnel = netdev_priv(dev); - struct net *net = tunnel->net; - struct net_device *tdev; /* Device to other host */ - struct ipv6hdr *ipv6h; /* Our new IP header */ - unsigned int min_headroom = 0; /* The extra header space needed */ - int gre_hlen; - struct ipv6_tel_txoption opt; - int mtu; - struct dst_entry *dst = NULL, *ndst = NULL; - struct net_device_stats *stats = &tunnel->dev->stats; - int err = -1; - u8 proto; - __be16 protocol; + __be16 protocol = (dev->type == ARPHRD_ETHER) ? 
+ htons(ETH_P_TEB) : proto; if (dev->type == ARPHRD_ETHER) IPCB(skb)->flags = 0; - if (dev->header_ops && dev->type == ARPHRD_IP6GRE) { - gre_hlen = 0; - ipv6h = (struct ipv6hdr *)skb->data; - fl6->daddr = ipv6h->daddr; - } else { - gre_hlen = tunnel->hlen; + if (dev->header_ops && dev->type == ARPHRD_IP6GRE) + fl6->daddr = ((struct ipv6hdr *)skb->data)->daddr; + else fl6->daddr = tunnel->parms.raddr; - } - - if (!fl6->flowi6_mark) - dst = dst_cache_get(&tunnel->dst_cache); - - if (!dst) { - dst = ip6_route_output(net, NULL, fl6); - - if (dst->error) - goto tx_err_link_failure; - dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); - if (IS_ERR(dst)) { - err = PTR_ERR(dst); - dst = NULL; - goto tx_err_link_failure; - } - ndst = dst; - } - - tdev = dst->dev; - if (tdev == dev) { - stats->collisions++; - net_warn_ratelimited("%s: Local routing loop detected!\n", - tunnel->parms.name); - goto tx_err_dst_release; - } - - mtu = dst_mtu(dst) - sizeof(*ipv6h); - if (encap_limit >= 0) { - min_headroom += 8; - mtu -= 8; - } - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - if (skb->len > mtu && !skb_is_gso(skb)) { - *pmtu = mtu; - err = -EMSGSIZE; - goto tx_err_dst_release; - } - - if (tunnel->err_count > 0) { - if (time_before(jiffies, - tunnel->err_time + IP6TUNNEL_ERR_TIMEO)) { - tunnel->err_count--; - - dst_link_failure(skb); - } else - tunnel->err_count = 0; - } - - skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); - - min_headroom += LL_RESERVED_SPACE(tdev) + gre_hlen + dst->header_len; - - if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { - int head_delta = SKB_DATA_ALIGN(min_headroom - - skb_headroom(skb) + - 16); - - err = pskb_expand_head(skb, max_t(int, head_delta, 0), - 0, GFP_ATOMIC); - if (min_headroom > dev->needed_headroom) - dev->needed_headroom = min_headroom; - if (unlikely(err)) - goto tx_err_dst_release; - } - - if (!fl6->flowi6_mark && ndst) - dst_cache_set_ip6(&tunnel->dst_cache, ndst, &fl6->saddr); - skb_dst_set(skb, dst); + if (tunnel->parms.o_flags & TUNNEL_SEQ) + tunnel->o_seqno++; - proto = NEXTHDR_GRE; - if (encap_limit >= 0) { - init_tel_txopt(&opt, encap_limit); - ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL); - } - - err = iptunnel_handle_offloads(skb, - (tunnel->parms.o_flags & GRE_CSUM) ? - SKB_GSO_GRE_CSUM : SKB_GSO_GRE); - if (err) - goto tx_err_dst_release; - - skb_push(skb, gre_hlen); - skb_reset_network_header(skb); - skb_set_transport_header(skb, sizeof(*ipv6h)); - - /* - * Push down and install the IP header. - */ - ipv6h = ipv6_hdr(skb); - ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), - ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); - ipv6h->hop_limit = tunnel->parms.hop_limit; - ipv6h->nexthdr = proto; - ipv6h->saddr = fl6->saddr; - ipv6h->daddr = fl6->daddr; - - ((__be16 *)(ipv6h + 1))[0] = tunnel->parms.o_flags; - protocol = (dev->type == ARPHRD_ETHER) ? 
- htons(ETH_P_TEB) : skb->protocol; - ((__be16 *)(ipv6h + 1))[1] = protocol; - - if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { - __be32 *ptr = (__be32 *)(((u8 *)ipv6h) + tunnel->hlen - 4); - - if (tunnel->parms.o_flags&GRE_SEQ) { - ++tunnel->o_seqno; - *ptr = htonl(tunnel->o_seqno); - ptr--; - } - if (tunnel->parms.o_flags&GRE_KEY) { - *ptr = tunnel->parms.o_key; - ptr--; - } - if ((tunnel->parms.o_flags & GRE_CSUM) && - !(skb_shinfo(skb)->gso_type & - (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { - *ptr = 0; - *(__sum16 *)ptr = gre6_checksum(skb); - } - } + /* Push GRE header. */ + gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, + protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); skb_set_inner_protocol(skb, protocol); - ip6tunnel_xmit(NULL, skb, dev); - return 0; -tx_err_link_failure: - stats->tx_carrier_errors++; - dst_link_failure(skb); -tx_err_dst_release: - dst_release(dst); - return err; + return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, + NEXTHDR_GRE); } static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) @@ -795,7 +541,6 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) encap_limit = t->parms.encap_limit; memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_GRE; dsfield = ipv4_get_dsfield(iph); @@ -805,7 +550,12 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; - err = ip6gre_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); + err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); + if (err) + return -1; + + err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, + skb->protocol); if (err != 0) { /* XXX: send ICMP error even if DF is not set. 
*/ if (err == -EMSGSIZE) @@ -845,7 +595,6 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) encap_limit = t->parms.encap_limit; memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_GRE; dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) @@ -855,7 +604,11 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; - err = ip6gre_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); + if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM))) + return -1; + + err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, + &mtu, skb->protocol); if (err != 0) { if (err == -EMSGSIZE) icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); @@ -899,7 +652,11 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = skb->protocol; - err = ip6gre_xmit2(skb, dev, 0, &fl6, encap_limit, &mtu); + err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); + if (err) + return err; + + err = __gre6_xmit(skb, dev, 0, &fl6, encap_limit, &mtu, skb->protocol); return err; } @@ -943,7 +700,7 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) struct net_device *dev = t->dev; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; - int addend = sizeof(struct ipv6hdr) + 4; + int t_hlen; if (dev->type != ARPHRD_ETHER) { memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); @@ -970,16 +727,11 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) else dev->flags &= ~IFF_POINTOPOINT; - /* Precalculate GRE options length */ - if (t->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { - if (t->parms.o_flags&GRE_CSUM) - addend += 4; - if (t->parms.o_flags&GRE_KEY) - addend += 4; - if (t->parms.o_flags&GRE_SEQ) - addend += 4; - } - t->hlen = addend; + t->tun_hlen = gre_calc_hlen(t->parms.o_flags); + + t->hlen = t->tun_hlen; + + t_hlen = t->hlen + sizeof(struct ipv6hdr); if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & @@ -993,10 +745,11 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) return; if (rt->dst.dev) { - dev->hard_header_len = rt->dst.dev->hard_header_len + addend; + dev->hard_header_len = rt->dst.dev->hard_header_len + + t_hlen; if (set_mtu) { - dev->mtu = rt->dst.dev->mtu - addend; + dev->mtu = rt->dst.dev->mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; if (dev->type == ARPHRD_ETHER) @@ -1042,8 +795,8 @@ static void ip6gre_tnl_parm_from_user(struct __ip6_tnl_parm *p, p->link = u->link; p->i_key = u->i_key; p->o_key = u->o_key; - p->i_flags = u->i_flags; - p->o_flags = u->o_flags; + p->i_flags = gre_flags_to_tnl_flags(u->i_flags); + p->o_flags = gre_flags_to_tnl_flags(u->o_flags); memcpy(p->name, u->name, sizeof(u->name)); } @@ -1060,8 +813,8 @@ static void ip6gre_tnl_parm_to_user(struct ip6_tnl_parm2 *u, u->link = p->link; u->i_key = p->i_key; u->o_key = p->o_key; - u->i_flags = p->i_flags; - u->o_flags = p->o_flags; + u->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); + u->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); memcpy(u->name, p->name, sizeof(u->name)); } @@ -1075,6 +828,8 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, struct net *net = t->net; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + memset(&p1, 0, sizeof(p1)); + switch (cmd) { case SIOCGETTUNNEL: if (dev == ign->fb_tunnel_dev) { @@ 
-1174,15 +929,6 @@ done: return err; } -static int ip6gre_tunnel_change_mtu(struct net_device *dev, int new_mtu) -{ - if (new_mtu < 68 || - new_mtu > 0xFFF8 - dev->hard_header_len) - return -EINVAL; - dev->mtu = new_mtu; - return 0; -} - static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) @@ -1226,7 +972,7 @@ static const struct net_device_ops ip6gre_netdev_ops = { .ndo_uninit = ip6gre_tunnel_uninit, .ndo_start_xmit = ip6gre_tunnel_xmit, .ndo_do_ioctl = ip6gre_tunnel_ioctl, - .ndo_change_mtu = ip6gre_tunnel_change_mtu, + .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1242,17 +988,11 @@ static void ip6gre_dev_free(struct net_device *dev) static void ip6gre_tunnel_setup(struct net_device *dev) { - struct ip6_tnl *t; - dev->netdev_ops = &ip6gre_netdev_ops; dev->destructor = ip6gre_dev_free; dev->type = ARPHRD_IP6GRE; - dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr) + 4; - dev->mtu = ETH_DATA_LEN - sizeof(struct ipv6hdr) - 4; - t = netdev_priv(dev); - if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - dev->mtu -= 8; + dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); netif_keep_dst(dev); @@ -1262,6 +1002,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) { struct ip6_tnl *tunnel; int ret; + int t_hlen; tunnel = netdev_priv(dev); @@ -1280,6 +1021,17 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) return ret; } + tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); + + tunnel->hlen = tunnel->tun_hlen; + + t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); + + dev->hard_header_len = LL_MAX_HEADER + t_hlen; + dev->mtu = ETH_DATA_LEN - t_hlen; + if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu -= 8; + return 0; } @@ -1318,7 +1070,7 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev) static struct inet6_protocol ip6gre_protocol __read_mostly = { - .handler = ip6gre_rcv, + .handler = gre_rcv, .err_handler = ip6gre_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; @@ -1462,10 +1214,12 @@ static void ip6gre_netlink_parms(struct nlattr *data[], parms->link = nla_get_u32(data[IFLA_GRE_LINK]); if (data[IFLA_GRE_IFLAGS]) - parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]); + parms->i_flags = gre_flags_to_tnl_flags( + nla_get_be16(data[IFLA_GRE_IFLAGS])); if (data[IFLA_GRE_OFLAGS]) - parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]); + parms->o_flags = gre_flags_to_tnl_flags( + nla_get_be16(data[IFLA_GRE_OFLAGS])); if (data[IFLA_GRE_IKEY]) parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); @@ -1514,7 +1268,7 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_start_xmit = ip6gre_tunnel_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = ip6gre_tunnel_change_mtu, + .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1560,7 +1314,7 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, dev->features |= GRE6_FEATURES; dev->hw_features |= GRE6_FEATURES; - if (!(nt->parms.o_flags & GRE_SEQ)) { + if (!(nt->parms.o_flags & TUNNEL_SEQ)) { /* TCP segmentation offload is not supported when we * generate output sequences. 
*/ @@ -1657,8 +1411,10 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) struct __ip6_tnl_parm *p = &t->parms; if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || - nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) || - nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) || + nla_put_be16(skb, IFLA_GRE_IFLAGS, + gre_tnl_flags_to_gre_flags(p->i_flags)) || + nla_put_be16(skb, IFLA_GRE_OFLAGS, + gre_tnl_flags_to_gre_flags(p->o_flags)) || nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || nla_put_in6_addr(skb, IFLA_GRE_LOCAL, &p->laddr) || diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index c05c425c2389..f185cbcda114 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -49,6 +49,13 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_rcv(skb); + if (!skb) + return NET_RX_SUCCESS; + if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct inet6_protocol *ipprot; @@ -78,11 +85,11 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt idev = __in6_dev_get(skb->dev); - IP6_UPD_PO_STATS_BH(net, idev, IPSTATS_MIB_IN, skb->len); + __IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_IN, skb->len); if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL || !idev || unlikely(idev->cnf.disable_ipv6)) { - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } @@ -109,10 +116,10 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (hdr->version != 6) goto err; - IP6_ADD_STATS_BH(net, idev, - IPSTATS_MIB_NOECTPKTS + + __IP6_ADD_STATS(net, idev, + IPSTATS_MIB_NOECTPKTS + (ipv6_get_dsfield(hdr) & INET_ECN_MASK), - max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); + max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); /* * RFC4291 2.5.3 * A packet received on an interface with a destination address @@ -169,12 +176,12 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt /* pkt_len may be zero if Jumbo payload option is present */ if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { if (pkt_len + sizeof(struct ipv6hdr) > skb->len) { - IP6_INC_STATS_BH(net, - idev, IPSTATS_MIB_INTRUNCATEDPKTS); + __IP6_INC_STATS(net, + idev, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) { - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); goto drop; } hdr = ipv6_hdr(skb); @@ -182,7 +189,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (hdr->nexthdr == NEXTHDR_HOP) { if (ipv6_parse_hopopts(skb) < 0) { - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); rcu_read_unlock(); return NET_RX_DROP; } @@ -197,7 +204,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt net, NULL, skb, dev, NULL, ip6_rcv_finish); err: - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); drop: rcu_read_unlock(); kfree_skb(skb); @@ -259,18 +266,18 @@ resubmit: if (ret > 0) goto resubmit; else if (ret == 0) - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS); } else { if (!raw) { if 
(xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { - IP6_INC_STATS_BH(net, idev, - IPSTATS_MIB_INUNKNOWNPROTOS); + __IP6_INC_STATS(net, idev, + IPSTATS_MIB_INUNKNOWNPROTOS); icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_UNK_NEXTHDR, nhoff); } kfree_skb(skb); } else { - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS); consume_skb(skb); } } @@ -278,7 +285,7 @@ resubmit: return 0; discard: - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); rcu_read_unlock(); kfree_skb(skb); return 0; @@ -297,7 +304,7 @@ int ip6_mc_input(struct sk_buff *skb) const struct ipv6hdr *hdr; bool deliver; - IP6_UPD_PO_STATS_BH(dev_net(skb_dst(skb)->dev), + __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST, skb->len); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 171518e3ca21..cbf127ae7c67 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -395,8 +395,8 @@ int ip6_forward(struct sk_buff *skb) goto drop; if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { - IP6_INC_STATS_BH(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_INDISCARDS); goto drop; } @@ -427,8 +427,8 @@ int ip6_forward(struct sk_buff *skb) /* Force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); - IP6_INC_STATS_BH(net, ip6_dst_idev(dst), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -ETIMEDOUT; @@ -441,15 +441,15 @@ int ip6_forward(struct sk_buff *skb) if (proxied > 0) return ip6_input(skb); else if (proxied < 0) { - IP6_INC_STATS_BH(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_INDISCARDS); goto drop; } } if (!xfrm6_route_forward(skb)) { - IP6_INC_STATS_BH(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_INDISCARDS); goto drop; } dst = skb_dst(skb); @@ -505,17 +505,17 @@ int ip6_forward(struct sk_buff *skb) /* Again, force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP6_INC_STATS_BH(net, ip6_dst_idev(dst), - IPSTATS_MIB_INTOOBIGERRORS); - IP6_INC_STATS_BH(net, ip6_dst_idev(dst), - IPSTATS_MIB_FRAGFAILS); + __IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_INTOOBIGERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; } if (skb_cow(skb, dst->dev->hard_header_len)) { - IP6_INC_STATS_BH(net, ip6_dst_idev(dst), - IPSTATS_MIB_OUTDISCARDS); + __IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_OUTDISCARDS); goto drop; } @@ -525,14 +525,14 @@ int ip6_forward(struct sk_buff *skb) hdr->hop_limit--; - IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); - IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); + __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); + __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, net, NULL, skb, skb->dev, dst->dev, ip6_forward_finish); error: - IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); drop: kfree_skb(skb); return -EINVAL; @@ -1182,12 +1182,12 @@ static void 
ip6_append_data_mtu(unsigned int *mtu, } static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, - struct inet6_cork *v6_cork, - int hlimit, int tclass, struct ipv6_txoptions *opt, + struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6, struct rt6_info *rt, struct flowi6 *fl6) { struct ipv6_pinfo *np = inet6_sk(sk); unsigned int mtu; + struct ipv6_txoptions *opt = ipc6->opt; /* * setup for corking @@ -1229,8 +1229,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, dst_hold(&rt->dst); cork->base.dst = &rt->dst; cork->fl.u.ip6 = *fl6; - v6_cork->hop_limit = hlimit; - v6_cork->tclass = tclass; + v6_cork->hop_limit = ipc6->hlimit; + v6_cork->tclass = ipc6->tclass; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? rt->dst.dev->mtu : dst_mtu(&rt->dst); @@ -1258,7 +1258,7 @@ static int __ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, - unsigned int flags, int dontfrag, + unsigned int flags, struct ipcm6_cookie *ipc6, const struct sockcm_cookie *sockc) { struct sk_buff *skb, *skb_prev = NULL; @@ -1298,7 +1298,7 @@ static int __ip6_append_data(struct sock *sk, sizeof(struct frag_hdr) : 0) + rt->rt6i_nfheader_len; - if (cork->length + length > mtu - headersize && dontfrag && + if (cork->length + length > mtu - headersize && ipc6->dontfrag && (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_RAW)) { ipv6_local_rxpmtu(sk, fl6, mtu - headersize + @@ -1564,9 +1564,9 @@ error: int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), - void *from, int length, int transhdrlen, int hlimit, - int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6, - struct rt6_info *rt, unsigned int flags, int dontfrag, + void *from, int length, int transhdrlen, + struct ipcm6_cookie *ipc6, struct flowi6 *fl6, + struct rt6_info *rt, unsigned int flags, const struct sockcm_cookie *sockc) { struct inet_sock *inet = inet_sk(sk); @@ -1580,12 +1580,12 @@ int ip6_append_data(struct sock *sk, /* * setup for corking */ - err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit, - tclass, opt, rt, fl6); + err = ip6_setup_cork(sk, &inet->cork, &np->cork, + ipc6, rt, fl6); if (err) return err; - exthdrlen = (opt ? opt->opt_flen : 0); + exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0); length += exthdrlen; transhdrlen += exthdrlen; } else { @@ -1595,8 +1595,7 @@ int ip6_append_data(struct sock *sk, return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base, &np->cork, sk_page_frag(sk), getfrag, - from, length, transhdrlen, flags, dontfrag, - sockc); + from, length, transhdrlen, flags, ipc6, sockc); } EXPORT_SYMBOL_GPL(ip6_append_data); @@ -1752,15 +1751,14 @@ struct sk_buff *ip6_make_skb(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, - int hlimit, int tclass, - struct ipv6_txoptions *opt, struct flowi6 *fl6, + struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags, - int dontfrag, const struct sockcm_cookie *sockc) + const struct sockcm_cookie *sockc) { struct inet_cork_full cork; struct inet6_cork v6_cork; struct sk_buff_head queue; - int exthdrlen = (opt ? opt->opt_flen : 0); + int exthdrlen = (ipc6->opt ? 
ipc6->opt->opt_flen : 0); int err; if (flags & MSG_PROBE) @@ -1772,17 +1770,17 @@ struct sk_buff *ip6_make_skb(struct sock *sk, cork.base.addr = 0; cork.base.opt = NULL; v6_cork.opt = NULL; - err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6); + err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6); if (err) return ERR_PTR(err); - if (dontfrag < 0) - dontfrag = inet6_sk(sk)->dontfrag; + if (ipc6->dontfrag < 0) + ipc6->dontfrag = inet6_sk(sk)->dontfrag; err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork, ¤t->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, - flags, dontfrag, sockc); + flags, ipc6, sockc); if (err) { __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork); return ERR_PTR(err); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 1f20345cbc97..e79330f214bd 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -238,6 +238,7 @@ static void ip6_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); + gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); free_percpu(dev->tstats); free_netdev(dev); @@ -753,97 +754,157 @@ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, } EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl); -/** - * ip6_tnl_rcv - decapsulate IPv6 packet and retransmit it locally - * @skb: received socket buffer - * @protocol: ethernet protocol ID - * @dscp_ecn_decapsulate: the function to decapsulate DSCP code and ECN - * - * Return: 0 - **/ - -static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, - __u8 ipproto, - int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, - const struct ipv6hdr *ipv6h, - struct sk_buff *skb)) +static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, + const struct tnl_ptk_info *tpi, + struct metadata_dst *tun_dst, + int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb), + bool log_ecn_err) { - struct ip6_tnl *t; + struct pcpu_sw_netstats *tstats; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); - u8 tproto; int err; - rcu_read_lock(); - t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr); - if (t) { - struct pcpu_sw_netstats *tstats; + if ((!(tpi->flags & TUNNEL_CSUM) && + (tunnel->parms.i_flags & TUNNEL_CSUM)) || + ((tpi->flags & TUNNEL_CSUM) && + !(tunnel->parms.i_flags & TUNNEL_CSUM))) { + tunnel->dev->stats.rx_crc_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; + } - tproto = ACCESS_ONCE(t->parms.proto); - if (tproto != ipproto && tproto != 0) { - rcu_read_unlock(); - goto discard; + if (tunnel->parms.i_flags & TUNNEL_SEQ) { + if (!(tpi->flags & TUNNEL_SEQ) || + (tunnel->i_seqno && + (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { + tunnel->dev->stats.rx_fifo_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; } + tunnel->i_seqno = ntohl(tpi->seq) + 1; + } - if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { - rcu_read_unlock(); - goto discard; - } + skb->protocol = tpi->proto; - if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) { - t->dev->stats.rx_dropped++; - rcu_read_unlock(); - goto discard; + /* Warning: All skb pointers will be invalidated! 
*/ + if (tunnel->dev->type == ARPHRD_ETHER) { + if (!pskb_may_pull(skb, ETH_HLEN)) { + tunnel->dev->stats.rx_length_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; } - skb->mac_header = skb->network_header; - skb_reset_network_header(skb); - skb->protocol = htons(protocol); - memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); - - __skb_tunnel_rx(skb, t->dev, t->net); - - err = dscp_ecn_decapsulate(t, ipv6h, skb); - if (unlikely(err)) { - if (log_ecn_error) - net_info_ratelimited("non-ECT from %pI6 with dsfield=%#x\n", - &ipv6h->saddr, - ipv6_get_dsfield(ipv6h)); - if (err > 1) { - ++t->dev->stats.rx_frame_errors; - ++t->dev->stats.rx_errors; - rcu_read_unlock(); - goto discard; - } + + ipv6h = ipv6_hdr(skb); + skb->protocol = eth_type_trans(skb, tunnel->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } else { + skb->dev = tunnel->dev; + } + + skb_reset_network_header(skb); + memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); + + __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); + + err = dscp_ecn_decapsulate(tunnel, ipv6h, skb); + if (unlikely(err)) { + if (log_ecn_err) + net_info_ratelimited("non-ECT from %pI6 with DS=%#x\n", + &ipv6h->saddr, + ipv6_get_dsfield(ipv6h)); + if (err > 1) { + ++tunnel->dev->stats.rx_frame_errors; + ++tunnel->dev->stats.rx_errors; + goto drop; } + } - tstats = this_cpu_ptr(t->dev->tstats); - u64_stats_update_begin(&tstats->syncp); - tstats->rx_packets++; - tstats->rx_bytes += skb->len; - u64_stats_update_end(&tstats->syncp); + tstats = this_cpu_ptr(tunnel->dev->tstats); + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); - netif_rx(skb); + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); - rcu_read_unlock(); - return 0; + gro_cells_receive(&tunnel->gro_cells, skb); + return 0; + +drop: + kfree_skb(skb); + return 0; +} + +int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb, + const struct tnl_ptk_info *tpi, + struct metadata_dst *tun_dst, + bool log_ecn_err) +{ + return __ip6_tnl_rcv(t, skb, tpi, NULL, ip6ip6_dscp_ecn_decapsulate, + log_ecn_err); +} +EXPORT_SYMBOL(ip6_tnl_rcv); + +static const struct tnl_ptk_info tpi_v6 = { + /* no tunnel info required for ipxip6. */ + .proto = htons(ETH_P_IPV6), +}; + +static const struct tnl_ptk_info tpi_v4 = { + /* no tunnel info required for ipxip6. 
*/ + .proto = htons(ETH_P_IP), +}; + +static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, + const struct tnl_ptk_info *tpi, + int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb)) +{ + struct ip6_tnl *t; + const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + int ret = -1; + + rcu_read_lock(); + t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr); + + if (t) { + u8 tproto = ACCESS_ONCE(t->parms.proto); + + if (tproto != ipproto && tproto != 0) + goto drop; + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) + goto drop; + if (iptunnel_pull_header(skb, 0, tpi->proto, false)) + goto drop; + ret = __ip6_tnl_rcv(t, skb, tpi, NULL, dscp_ecn_decapsulate, + log_ecn_error); } + rcu_read_unlock(); - return 1; -discard: + return ret; + +drop: + rcu_read_unlock(); kfree_skb(skb); return 0; } static int ip4ip6_rcv(struct sk_buff *skb) { - return ip6_tnl_rcv(skb, ETH_P_IP, IPPROTO_IPIP, - ip4ip6_dscp_ecn_decapsulate); + return ipxip6_rcv(skb, IPPROTO_IPIP, &tpi_v4, + ip4ip6_dscp_ecn_decapsulate); } static int ip6ip6_rcv(struct sk_buff *skb) { - return ip6_tnl_rcv(skb, ETH_P_IPV6, IPPROTO_IPV6, - ip6ip6_dscp_ecn_decapsulate); + return ipxip6_rcv(skb, IPPROTO_IPV6, &tpi_v6, + ip6ip6_dscp_ecn_decapsulate); } struct ipv6_tel_txoption { @@ -918,13 +979,14 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl); /** - * ip6_tnl_xmit2 - encapsulate packet and send + * ip6_tnl_xmit - encapsulate packet and send * @skb: the outgoing socket buffer * @dev: the outgoing tunnel device * @dsfield: dscp code for outer header - * @fl: flow of tunneled packet + * @fl6: flow of tunneled packet * @encap_limit: encapsulation limit * @pmtu: Path MTU is stored if packet is too big + * @proto: next header value * * Description: * Build new header and do some sanity checks on the packet before sending @@ -936,12 +998,9 @@ EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl); * %-EMSGSIZE message too big. return mtu in this case. 
**/ -static int ip6_tnl_xmit2(struct sk_buff *skb, - struct net_device *dev, - __u8 dsfield, - struct flowi6 *fl6, - int encap_limit, - __u32 *pmtu) +int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, + struct flowi6 *fl6, int encap_limit, __u32 *pmtu, + __u8 proto) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; @@ -952,7 +1011,6 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, struct net_device *tdev; int mtu; unsigned int max_headroom = sizeof(struct ipv6hdr); - u8 proto; int err = -1; /* NBMA tunnel */ @@ -1014,12 +1072,23 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, mtu = IPV6_MIN_MTU; if (skb_dst(skb)) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - if (skb->len > mtu) { + if (skb->len > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; } + if (t->err_count > 0) { + if (time_before(jiffies, + t->err_time + IP6TUNNEL_ERR_TIMEO)) { + t->err_count--; + + dst_link_failure(skb); + } else { + t->err_count = 0; + } + } + skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); /* @@ -1045,9 +1114,6 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); skb_dst_set(skb, dst); - skb->transport_header = skb->network_header; - - proto = fl6->flowi6_proto; if (encap_limit >= 0) { init_tel_txopt(&opt, encap_limit); ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL); @@ -1058,6 +1124,11 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, skb->encapsulation = 1; } + max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr) + + dst->header_len; + if (max_headroom > dev->needed_headroom) + dev->needed_headroom = max_headroom; + skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); @@ -1076,6 +1147,7 @@ tx_err_dst_release: dst_release(dst); return err; } +EXPORT_SYMBOL(ip6_tnl_xmit); static inline int ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) @@ -1099,7 +1171,6 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) encap_limit = t->parms.encap_limit; memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPIP; dsfield = ipv4_get_dsfield(iph); @@ -1109,7 +1180,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; - err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); + err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, + IPPROTO_IPIP); if (err != 0) { /* XXX: send ICMP error even if DF is not set. 
*/ if (err == -EMSGSIZE) @@ -1153,7 +1225,6 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) encap_limit = t->parms.encap_limit; memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPV6; dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) @@ -1163,7 +1234,8 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; - err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); + err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, + IPPROTO_IPV6); if (err != 0) { if (err == -EMSGSIZE) icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); @@ -1174,7 +1246,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) } static netdev_tx_t -ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net_device_stats *stats = &t->dev->stats; @@ -1370,6 +1442,8 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + memset(&p1, 0, sizeof(p1)); + switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { @@ -1464,8 +1538,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) * %-EINVAL if mtu too small **/ -static int -ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) +int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); @@ -1481,6 +1554,7 @@ ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) dev->mtu = new_mtu; return 0; } +EXPORT_SYMBOL(ip6_tnl_change_mtu); int ip6_tnl_get_iflink(const struct net_device *dev) { @@ -1493,7 +1567,7 @@ EXPORT_SYMBOL(ip6_tnl_get_iflink); static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_init = ip6_tnl_dev_init, .ndo_uninit = ip6_tnl_dev_uninit, - .ndo_start_xmit = ip6_tnl_xmit, + .ndo_start_xmit = ip6_tnl_start_xmit, .ndo_do_ioctl = ip6_tnl_ioctl, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats = ip6_get_stats, @@ -1549,13 +1623,25 @@ ip6_tnl_dev_init_gen(struct net_device *dev) return -ENOMEM; ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); - if (ret) { - free_percpu(dev->tstats); - dev->tstats = NULL; - return ret; - } + if (ret) + goto free_stats; + + ret = gro_cells_init(&t->gro_cells, dev); + if (ret) + goto destroy_dst; + + t->hlen = 0; + t->tun_hlen = 0; return 0; + +destroy_dst: + dst_cache_destroy(&t->dst_cache); +free_stats: + free_percpu(dev->tstats); + dev->tstats = NULL; + + return ret; } /** diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index bf678324fd52..f2e2013f8346 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1984,10 +1984,10 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_OUTFORWDATAGRAMS); - IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_OUTOCTETS, skb->len); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTFORWDATAGRAMS); + __IP6_ADD_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTOCTETS, skb->len); return dst_output(net, sk, skb); } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 4ff4b29894eb..a9895e15ee9c 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -473,7 +473,7 @@ sticky_done: struct 
msghdr msg; struct flowi6 fl6; struct sockcm_cookie sockc_junk; - int junk; + struct ipcm6_cookie ipc6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = sk->sk_bound_dev_if; @@ -503,9 +503,9 @@ sticky_done: msg.msg_controllen = optlen; msg.msg_control = (void *)(opt+1); + ipc6.opt = opt; - retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, opt, &junk, - &junk, &junk, &sockc_junk); + retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6, &sockc_junk); if (retv) goto done; update: diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 73e606c719ef..63e06c3dd319 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -39,34 +39,12 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("IPv6 packet filter"); -/*#define DEBUG_IP_FIREWALL*/ -/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ -/*#define DEBUG_IP_FIREWALL_USER*/ - -#ifdef DEBUG_IP_FIREWALL -#define dprintf(format, args...) pr_info(format , ## args) -#else -#define dprintf(format, args...) -#endif - -#ifdef DEBUG_IP_FIREWALL_USER -#define duprintf(format, args...) pr_info(format , ## args) -#else -#define duprintf(format, args...) -#endif - #ifdef CONFIG_NETFILTER_DEBUG #define IP_NF_ASSERT(x) WARN_ON(!(x)) #else #define IP_NF_ASSERT(x) #endif -#if 0 -/* All the better to debug you with... */ -#define static -#define inline -#endif - void *ip6t_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(ip6t, IP6T); @@ -100,35 +78,18 @@ ip6_packet_match(const struct sk_buff *skb, if (FWINV(ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk, &ip6info->src), IP6T_INV_SRCIP) || FWINV(ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk, - &ip6info->dst), IP6T_INV_DSTIP)) { - dprintf("Source or dest mismatch.\n"); -/* - dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr, - ipinfo->smsk.s_addr, ipinfo->src.s_addr, - ipinfo->invflags & IP6T_INV_SRCIP ? " (INV)" : ""); - dprintf("DST: %u. Mask: %u. Target: %u.%s\n", ip->daddr, - ipinfo->dmsk.s_addr, ipinfo->dst.s_addr, - ipinfo->invflags & IP6T_INV_DSTIP ? " (INV)" : "");*/ + &ip6info->dst), IP6T_INV_DSTIP)) return false; - } ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask); - if (FWINV(ret != 0, IP6T_INV_VIA_IN)) { - dprintf("VIA in mismatch (%s vs %s).%s\n", - indev, ip6info->iniface, - ip6info->invflags & IP6T_INV_VIA_IN ? " (INV)" : ""); + if (FWINV(ret != 0, IP6T_INV_VIA_IN)) return false; - } ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask); - if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) { - dprintf("VIA out mismatch (%s vs %s).%s\n", - outdev, ip6info->outiface, - ip6info->invflags & IP6T_INV_VIA_OUT ? " (INV)" : ""); + if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) return false; - } /* ... might want to do something with class and flowlabel here ... */ @@ -145,11 +106,6 @@ ip6_packet_match(const struct sk_buff *skb, } *fragoff = _frag_off; - dprintf("Packet protocol %hi ?= %s%hi.\n", - protohdr, - ip6info->invflags & IP6T_INV_PROTO ? 
"!":"", - ip6info->proto); - if (ip6info->proto == protohdr) { if (ip6info->invflags & IP6T_INV_PROTO) return false; @@ -169,16 +125,11 @@ ip6_packet_match(const struct sk_buff *skb, static bool ip6_checkentry(const struct ip6t_ip6 *ipv6) { - if (ipv6->flags & ~IP6T_F_MASK) { - duprintf("Unknown flag bits set: %08X\n", - ipv6->flags & ~IP6T_F_MASK); + if (ipv6->flags & ~IP6T_F_MASK) return false; - } - if (ipv6->invflags & ~IP6T_INV_MASK) { - duprintf("Unknown invflag bits set: %08X\n", - ipv6->invflags & ~IP6T_INV_MASK); + if (ipv6->invflags & ~IP6T_INV_MASK) return false; - } + return true; } @@ -446,13 +397,9 @@ ip6t_do_table(struct sk_buff *skb, xt_write_recseq_end(addend); local_bh_enable(); -#ifdef DEBUG_ALLOW_ALL - return NF_ACCEPT; -#else if (acpar.hotdrop) return NF_DROP; else return verdict; -#endif } static bool find_jump_target(const struct xt_table_info *t, @@ -492,11 +439,9 @@ mark_source_chains(const struct xt_table_info *newinfo, = (void *)ip6t_get_target_c(e); int visited = e->comefrom & (1 << hook); - if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { - pr_err("iptables: loop hook %u pos %u %08X.\n", - hook, pos, e->comefrom); + if (e->comefrom & (1 << NF_INET_NUMHOOKS)) return 0; - } + e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ @@ -508,26 +453,13 @@ mark_source_chains(const struct xt_table_info *newinfo, if ((strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) { - duprintf("mark_source_chains: bad " - "negative verdict (%i)\n", - t->verdict); + t->verdict < -NF_MAX_VERDICT - 1) return 0; - } /* Return: backtrack through the last big jump. */ do { e->comefrom ^= (1<<NF_INET_NUMHOOKS); -#ifdef DEBUG_IP_FIREWALL_USER - if (e->comefrom - & (1 << NF_INET_NUMHOOKS)) { - duprintf("Back unset " - "on hook %u " - "rule %u\n", - hook, pos); - } -#endif oldpos = pos; pos = e->counters.pcnt; e->counters.pcnt = 0; @@ -555,8 +487,6 @@ mark_source_chains(const struct xt_table_info *newinfo, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. 
*/ - duprintf("Jump rule %u -> %u\n", - pos, newpos); e = (struct ip6t_entry *) (entry0 + newpos); if (!find_jump_target(newinfo, e)) @@ -573,8 +503,7 @@ mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } -next: - duprintf("Finished chain %u\n", hook); +next: ; } return 1; } @@ -595,19 +524,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net) static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { const struct ip6t_ip6 *ipv6 = par->entryinfo; - int ret; par->match = m->u.kernel.match; par->matchinfo = m->data; - ret = xt_check_match(par, m->u.match_size - sizeof(*m), - ipv6->proto, ipv6->invflags & IP6T_INV_PROTO); - if (ret < 0) { - duprintf("ip_tables: check failed for `%s'.\n", - par.match->name); - return ret; - } - return 0; + return xt_check_match(par, m->u.match_size - sizeof(*m), + ipv6->proto, ipv6->invflags & IP6T_INV_PROTO); } static int @@ -618,10 +540,9 @@ find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, m->u.user.revision); - if (IS_ERR(match)) { - duprintf("find_check_match: `%s' not found\n", m->u.user.name); + if (IS_ERR(match)) return PTR_ERR(match); - } + m->u.kernel.match = match; ret = check_match(m, par); @@ -646,17 +567,11 @@ static int check_target(struct ip6t_entry *e, struct net *net, const char *name) .hook_mask = e->comefrom, .family = NFPROTO_IPV6, }; - int ret; t = ip6t_get_target(e); - ret = xt_check_target(&par, t->u.target_size - sizeof(*t), - e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO); - if (ret < 0) { - duprintf("ip_tables: check failed for `%s'.\n", - t->u.kernel.target->name); - return ret; - } - return 0; + return xt_check_target(&par, t->u.target_size - sizeof(*t), + e->ipv6.proto, + e->ipv6.invflags & IP6T_INV_PROTO); } static int @@ -669,10 +584,12 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; + unsigned long pcnt; - e->counters.pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(e->counters.pcnt)) + pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(pcnt)) return -ENOMEM; + e->counters.pcnt = pcnt; j = 0; mtpar.net = net; @@ -691,7 +608,6 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("find_check_entry: `%s' not found\n", t->u.user.name); ret = PTR_ERR(target); goto cleanup_matches; } @@ -744,17 +660,12 @@ check_entry_size_and_hooks(struct ip6t_entry *e, if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct ip6t_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p\n", e); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset - < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) return -EINVAL; - } if (!ip6_checkentry(&e->ipv6)) return -EINVAL; @@ -771,12 +682,9 @@ check_entry_size_and_hooks(struct ip6t_entry *e, if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { - if (!check_underflow(e)) { - pr_debug("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + if 
(!check_underflow(e)) return -EINVAL; - } + newinfo->underflow[h] = underflows[h]; } } @@ -828,7 +736,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, newinfo->underflow[i] = 0xFFFFFFFF; } - duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { @@ -845,27 +752,18 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, ++newinfo->stacksize; } - if (i != repl->num_entries) { - duprintf("translate_table: %u not %u entries\n", - i, repl->num_entries); + if (i != repl->num_entries) return -EINVAL; - } /* Check hooks all assigned */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { /* Only hooks which are valid */ if (!(repl->valid_hooks & (1 << i))) continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) { - duprintf("Invalid hook entry %u %u\n", - i, repl->hook_entry[i]); + if (newinfo->hook_entry[i] == 0xFFFFFFFF) return -EINVAL; - } - if (newinfo->underflow[i] == 0xFFFFFFFF) { - duprintf("Invalid underflow %u %u\n", - i, repl->underflow[i]); + if (newinfo->underflow[i] == 0xFFFFFFFF) return -EINVAL; - } } if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) @@ -1093,11 +991,8 @@ static int get_info(struct net *net, void __user *user, struct xt_table *t; int ret; - if (*len != sizeof(struct ip6t_getinfo)) { - duprintf("length %u != %zu\n", *len, - sizeof(struct ip6t_getinfo)); + if (*len != sizeof(struct ip6t_getinfo)) return -EINVAL; - } if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; @@ -1155,31 +1050,24 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, struct ip6t_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct ip6t_get_entries) + get.size) { - duprintf("get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct ip6t_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR_OR_NULL(t)) { struct xt_table_info *private = t->private; - duprintf("t->private->number = %u\n", private->number); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); - else { - duprintf("get_entries: I've got %u not %u!\n", - private->size, get.size); + else ret = -EAGAIN; - } + module_put(t->me); xt_table_unlock(t); } else @@ -1215,8 +1103,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, /* You lied! 
*/ if (valid_hooks != t->valid_hooks) { - duprintf("Valid hook crap: %08X vs %08X\n", - valid_hooks, t->valid_hooks); ret = -EINVAL; goto put_module; } @@ -1226,8 +1112,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, goto put_module; /* Update module usage count based on number of rules */ - duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", - oldinfo->number, oldinfo->initial_entries, newinfo->number); if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); @@ -1296,8 +1180,6 @@ do_replace(struct net *net, const void __user *user, unsigned int len) if (ret != 0) goto free_newinfo; - duprintf("ip_tables: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) @@ -1422,11 +1304,9 @@ compat_find_calc_match(struct xt_entry_match *m, match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, m->u.user.revision); - if (IS_ERR(match)) { - duprintf("compat_check_calc_match: `%s' not found\n", - m->u.user.name); + if (IS_ERR(match)) return PTR_ERR(match); - } + m->u.kernel.match = match; *size += xt_compat_match_offset(match); return 0; @@ -1458,20 +1338,14 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, unsigned int j; int ret, off; - duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p, limit = %p\n", e, limit); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset < sizeof(struct compat_ip6t_entry) + - sizeof(struct compat_xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + sizeof(struct compat_xt_entry_target)) return -EINVAL; - } if (!ip6_checkentry(&e->ipv6)) return -EINVAL; @@ -1495,8 +1369,6 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", - t->u.user.name); ret = PTR_ERR(target); goto release_matches; } @@ -1575,7 +1447,6 @@ translate_compat_table(struct net *net, size = compatr->size; info->number = compatr->num_entries; - duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(AF_INET6); xt_compat_init_offsets(AF_INET6, compatr->num_entries); @@ -1590,11 +1461,8 @@ translate_compat_table(struct net *net, } ret = -EINVAL; - if (j != compatr->num_entries) { - duprintf("translate_compat_table: %u not %u entries\n", - j, compatr->num_entries); + if (j != compatr->num_entries) goto out_unlock; - } ret = -ENOMEM; newinfo = xt_alloc_table_info(size); @@ -1685,8 +1553,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) if (ret != 0) goto free_newinfo; - duprintf("compat_do_replace: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) @@ -1720,7 +1586,6 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, break; default: - duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1770,19 +1635,15 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, struct compat_ip6t_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - 
duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct compat_ip6t_get_entries) + get.size) { - duprintf("compat_get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct compat_ip6t_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET6); @@ -1790,16 +1651,13 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; - duprintf("t->private->number = %u\n", private->number); ret = compat_table_info(private, &info); - if (!ret && get.size == info.size) { + if (!ret && get.size == info.size) ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); - } else if (!ret) { - duprintf("compat_get_entries: I've got %u not %u!\n", - private->size, get.size); + else if (!ret) ret = -EAGAIN; - } + xt_compat_flush_offsets(AF_INET6); module_put(t->me); xt_table_unlock(t); @@ -1852,7 +1710,6 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) break; default: - duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1904,7 +1761,6 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } default: - duprintf("do_ip6t_get_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -2006,7 +1862,6 @@ icmp6_match(const struct sk_buff *skb, struct xt_action_param *par) /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ - duprintf("Dropping evil ICMP tinygram.\n"); par->hotdrop = true; return false; } diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 5d778dd11f66..06bed74cf5ee 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -60,7 +60,7 @@ synproxy_send_tcp(struct net *net, fl6.fl6_dport = nth->dest; security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6)); dst = ip6_route_output(net, NULL, &fl6); - if (dst == NULL || dst->error) { + if (dst->error) { dst_release(dst); goto free_nskb; } diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index da1cff79e447..3ee3e444a66b 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -58,11 +58,11 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int iif = 0; struct flowi6 fl6; int err; - int hlimit; struct dst_entry *dst; struct rt6_info *rt; struct pingfakehdr pfh; struct sockcm_cookie junk = {0}; + struct ipcm6_cookie ipc6; pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); @@ -139,13 +139,15 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) pfh.wcheck = 0; pfh.family = AF_INET6; - hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + ipc6.tclass = np->tclass; + ipc6.dontfrag = np->dontfrag; + ipc6.opt = NULL; lock_sock(sk); err = ip6_append_data(sk, ping_getfrag, &pfh, len, - 0, hlimit, - np->tclass, NULL, &fl6, rt, - MSG_DONTWAIT, np->dontfrag, &junk); + 0, &ipc6, &fl6, rt, + MSG_DONTWAIT, &junk); if (err) { ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index b07ce21983aa..896350df6423 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -746,10 +746,8 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct 
raw6_frag_vec rfv; struct flowi6 fl6; struct sockcm_cookie sockc; + struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; - int hlimit = -1; - int tclass = -1; - int dontfrag = -1; u16 proto; int err; @@ -770,6 +768,11 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = sk->sk_mark; + ipc6.hlimit = -1; + ipc6.tclass = -1; + ipc6.dontfrag = -1; + ipc6.opt = NULL; + if (sin6) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; @@ -827,10 +830,9 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); + ipc6.opt = opt; - err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt, - &hlimit, &tclass, &dontfrag, - &sockc); + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -846,7 +848,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!opt) { opt = txopt_get(np); opt_to_free = opt; - } + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -881,14 +883,14 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) err = PTR_ERR(dst); goto out; } - if (hlimit < 0) - hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + if (ipc6.hlimit < 0) + ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (tclass < 0) - tclass = np->tclass; + if (ipc6.tclass < 0) + ipc6.tclass = np->tclass; - if (dontfrag < 0) - dontfrag = np->dontfrag; + if (ipc6.dontfrag < 0) + ipc6.dontfrag = np->dontfrag; if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; @@ -897,10 +899,11 @@ back_from_confirm: if (inet->hdrincl) err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, msg->msg_flags); else { + ipc6.opt = opt; lock_sock(sk); err = ip6_append_data(sk, raw6_getfrag, &rfv, - len, 0, hlimit, tclass, opt, &fl6, (struct rt6_info *)dst, - msg->msg_flags, dontfrag, &sockc); + len, 0, &ipc6, &fl6, (struct rt6_info *)dst, + msg->msg_flags, &sockc); if (err) ip6_flush_pending_frames(sk); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index e2ea31175ef9..2160d5d009cb 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -145,12 +145,12 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, if (!dev) goto out_rcu_unlock; - IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); + __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); if (inet_frag_evicting(&fq->q)) goto out_rcu_unlock; - IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); + __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); /* Don't send error if the first segment did not arrive. */ if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !fq->q.fragments) @@ -223,8 +223,8 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((u8 *)&fhdr->frag_off - skb_network_header(skb))); @@ -258,8 +258,8 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, /* RFC2460 says always send parameter problem in * this case. 
-DaveM */ - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, offsetof(struct ipv6hdr, payload_len)); return -1; @@ -361,8 +361,8 @@ found: discard_fq: inet_frag_kill(&fq->q, &ip6_frags); err: - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_REASMFAILS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -1; } @@ -500,7 +500,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, skb_network_header_len(head)); rcu_read_lock(); - IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); + __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); rcu_read_unlock(); fq->q.fragments = NULL; fq->q.fragments_tail = NULL; @@ -513,7 +513,7 @@ out_oom: net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n"); out_fail: rcu_read_lock(); - IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); + __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); rcu_read_unlock(); return -1; } @@ -528,7 +528,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) goto fail_hdr; - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS); /* Jumbo payload inhibits frag. header */ if (hdr->payload_len == 0) @@ -544,8 +544,8 @@ static int ipv6_frag_rcv(struct sk_buff *skb) if (!(fhdr->frag_off & htons(0xFFF9))) { /* It is not a fragmented frame */ skb->transport_header += sizeof(struct frag_hdr); - IP6_INC_STATS_BH(net, - ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS); + __IP6_INC_STATS(net, + ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS); IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb); IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; @@ -566,13 +566,13 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return ret; } - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -1; fail_hdr: - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb)); return -1; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d916d6ab9ad2..c42fa1deb152 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1190,7 +1190,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct dst_entry *dst; bool any_src; - dst = l3mdev_rt6_dst_by_oif(net, fl6); + dst = l3mdev_get_rt6_dst(net, fl6); if (dst) return dst; @@ -1769,6 +1769,37 @@ static int ip6_convert_metrics(struct mx6_config *mxc, return -EINVAL; } +static struct rt6_info *ip6_nh_lookup_table(struct net *net, + struct fib6_config *cfg, + const struct in6_addr *gw_addr) +{ + struct flowi6 fl6 = { + .flowi6_oif = cfg->fc_ifindex, + .daddr = *gw_addr, + .saddr = cfg->fc_prefsrc, + }; + struct fib6_table *table; + struct rt6_info *rt; + int flags = 0; + + table = fib6_get_table(net, cfg->fc_table); + if (!table) + return NULL; + + if (!ipv6_addr_any(&cfg->fc_prefsrc)) + flags |= RT6_LOOKUP_F_HAS_SADDR; + + rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags); + + /* if table lookup failed, fall back to full lookup */ + if (rt == 
net->ipv6.ip6_null_entry) { + ip6_rt_put(rt); + rt = NULL; + } + + return rt; +} + static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) { struct net *net = cfg->fc_nlinfo.nl_net; @@ -1940,7 +1971,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) rt->rt6i_gateway = *gw_addr; if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { - struct rt6_info *grt; + struct rt6_info *grt = NULL; /* IPv6 strictly inhibits using not link-local addresses as nexthop address. @@ -1952,7 +1983,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) if (!(gwa_type & IPV6_ADDR_UNICAST)) goto out; - grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); + if (cfg->fc_table) + grt = ip6_nh_lookup_table(net, cfg, gw_addr); + + if (!grt) + grt = rt6_lookup(net, gw_addr, NULL, + cfg->fc_ifindex, 1); err = -EHOSTUNREACH; if (!grt) diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index aab91fa86c5e..59c483937aec 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -155,11 +155,11 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) mss = __cookie_v6_check(ipv6_hdr(skb), th, cookie); if (mss == 0) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 800265c7fd3f..c4efaa97280c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -336,8 +336,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, skb->dev->ifindex); if (!sk) { - ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), - ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); return; } @@ -352,13 +352,13 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) - NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); + __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) goto out; if (ipv6_hdr(skb)->hop_limit < inet6_sk(sk)->min_hopcount) { - NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto out; } @@ -368,7 +368,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, snd_una = fastopen ? 
tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -649,12 +649,12 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk, return false; if (hash_expected && !hash_location) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); return true; } if (!hash_expected && hash_location) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); return true; } @@ -810,8 +810,13 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.flowi6_proto = IPPROTO_TCP; if (rt6_need_strict(&fl6.daddr) && !oif) fl6.flowi6_oif = tcp_v6_iif(skb); - else + else { + if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) + oif = skb->skb_iif; + fl6.flowi6_oif = oif; + } + fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; @@ -825,9 +830,9 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (!IS_ERR(dst)) { skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass); - TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); + TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) - TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); + TCP_INC_STATS(net, TCP_MIB_OUTRSTS); return; } @@ -1165,7 +1170,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * return newsk; out_overflow: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); out_nonewsk: dst_release(dst); out: @@ -1276,8 +1281,8 @@ discard: kfree_skb(skb); return 0; csum_err: - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1359,7 +1364,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) /* * Count it even if it's bad. 
*/ - TCP_INC_STATS_BH(net, TCP_MIB_INSEGS); + __TCP_INC_STATS(net, TCP_MIB_INSEGS); if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; @@ -1421,7 +1426,7 @@ process: } } if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { - NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; } @@ -1454,7 +1459,7 @@ process: } else if (unlikely(sk_add_backlog(sk, skb, sk->sk_rcvbuf + sk->sk_sndbuf))) { bh_unlock_sock(sk); - NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); + __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP); goto discard_and_relse; } bh_unlock_sock(sk); @@ -1472,9 +1477,9 @@ no_tcp_socket: if (tcp_checksum_complete(skb)) { csum_error: - TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); + __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: - TCP_INC_STATS_BH(net, TCP_MIB_INERRS); + __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { tcp_v6_send_reset(NULL, skb); } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 8d8b2cd8ec5b..aca06094110f 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -423,24 +423,22 @@ try_again: if (!peeked) { atomic_inc(&sk->sk_drops); if (is_udp4) - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_INERRORS, - is_udplite); + UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, + is_udplite); else - UDP6_INC_STATS_USER(sock_net(sk), - UDP_MIB_INERRORS, - is_udplite); + UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, + is_udplite); } skb_free_datagram_locked(sk, skb); return err; } if (!peeked) { if (is_udp4) - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_INDATAGRAMS, is_udplite); + UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, + is_udplite); else - UDP6_INC_STATS_USER(sock_net(sk), - UDP_MIB_INDATAGRAMS, is_udplite); + UDP6_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, + is_udplite); } sock_recv_ts_and_drops(msg, sk, skb); @@ -487,15 +485,15 @@ csum_copy_err: slow = lock_sock_fast(sk); if (!skb_kill_datagram(sk, skb, flags)) { if (is_udp4) { - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_CSUMERRORS, is_udplite); - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS(sock_net(sk), + UDP_MIB_CSUMERRORS, is_udplite); + UDP_INC_STATS(sock_net(sk), + UDP_MIB_INERRORS, is_udplite); } else { - UDP6_INC_STATS_USER(sock_net(sk), - UDP_MIB_CSUMERRORS, is_udplite); - UDP6_INC_STATS_USER(sock_net(sk), - UDP_MIB_INERRORS, is_udplite); + UDP6_INC_STATS(sock_net(sk), + UDP_MIB_CSUMERRORS, is_udplite); + UDP6_INC_STATS(sock_net(sk), + UDP_MIB_INERRORS, is_udplite); } } unlock_sock_fast(sk, slow); @@ -523,8 +521,8 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, inet6_iif(skb), udptable, skb); if (!sk) { - ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), - ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); return; } @@ -572,9 +570,9 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) - UDP6_INC_STATS_BH(sock_net(sk), - UDP_MIB_RCVBUFERRORS, is_udplite); - UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + UDP6_INC_STATS(sock_net(sk), + UDP_MIB_RCVBUFERRORS, is_udplite); + UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); kfree_skb(skb); return -1; } @@ -630,9 +628,9 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) ret = encap_rcv(sk, skb); if (ret <= 0) { - UDP_INC_STATS_BH(sock_net(sk), - UDP_MIB_INDATAGRAMS, - is_udplite); + 
__UDP_INC_STATS(sock_net(sk), + UDP_MIB_INDATAGRAMS, + is_udplite); return -ret; } } @@ -666,8 +664,8 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) udp_csum_pull_header(skb); if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { - UDP6_INC_STATS_BH(sock_net(sk), - UDP_MIB_RCVBUFERRORS, is_udplite); + __UDP6_INC_STATS(sock_net(sk), + UDP_MIB_RCVBUFERRORS, is_udplite); goto drop; } @@ -686,9 +684,9 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return rc; csum_error: - UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); + __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: - UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); kfree_skb(skb); return -1; @@ -771,10 +769,10 @@ start_lookup: nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { atomic_inc(&sk->sk_drops); - UDP6_INC_STATS_BH(net, UDP_MIB_RCVBUFERRORS, - IS_UDPLITE(sk)); - UDP6_INC_STATS_BH(net, UDP_MIB_INERRORS, - IS_UDPLITE(sk)); + __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS, + IS_UDPLITE(sk)); + __UDP6_INC_STATS(net, UDP_MIB_INERRORS, + IS_UDPLITE(sk)); continue; } @@ -793,8 +791,8 @@ start_lookup: consume_skb(skb); } else { kfree_skb(skb); - UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, - proto == IPPROTO_UDPLITE); + __UDP6_INC_STATS(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); } return 0; } @@ -887,7 +885,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp_lib_checksum_complete(skb)) goto csum_error; - UDP6_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); + __UDP6_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); kfree_skb(skb); @@ -901,9 +899,9 @@ short_packet: daddr, ntohs(uh->dest)); goto discard; csum_error: - UDP6_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); + __UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); discard: - UDP6_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); + __UDP6_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); kfree_skb(skb); return 0; } @@ -1015,13 +1013,14 @@ send: err = ip6_send_skb(skb); if (err) { if (err == -ENOBUFS && !inet6_sk(sk)->recverr) { - UDP6_INC_STATS_USER(sock_net(sk), - UDP_MIB_SNDBUFERRORS, is_udplite); + UDP6_INC_STATS(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); err = 0; } - } else - UDP6_INC_STATS_USER(sock_net(sk), - UDP_MIB_OUTDATAGRAMS, is_udplite); + } else { + UDP6_INC_STATS(sock_net(sk), + UDP_MIB_OUTDATAGRAMS, is_udplite); + } return err; } @@ -1065,11 +1064,9 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ip6_flowlabel *flowlabel = NULL; struct flowi6 fl6; struct dst_entry *dst; + struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; int ulen = len; - int hlimit = -1; - int tclass = -1; - int dontfrag = -1; int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; int err; int connected = 0; @@ -1077,6 +1074,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sockcm_cookie sockc; + ipc6.hlimit = -1; + ipc6.tclass = -1; + ipc6.dontfrag = -1; + /* destination address check */ if (sin6) { if (addr_len < offsetof(struct sockaddr, sa_data)) @@ -1201,10 +1202,9 @@ do_udp_sendmsg: opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(*opt); + ipc6.opt = opt; - err = 
ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt, - &hlimit, &tclass, &dontfrag, - &sockc); + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -1225,6 +1225,7 @@ do_udp_sendmsg: if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); + ipc6.opt = opt; fl6.flowi6_proto = sk->sk_protocol; if (!ipv6_addr_any(daddr)) @@ -1254,11 +1255,11 @@ do_udp_sendmsg: goto out; } - if (hlimit < 0) - hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + if (ipc6.hlimit < 0) + ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (tclass < 0) - tclass = np->tclass; + if (ipc6.tclass < 0) + ipc6.tclass = np->tclass; if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; @@ -1269,9 +1270,9 @@ back_from_confirm: struct sk_buff *skb; skb = ip6_make_skb(sk, getfrag, msg, ulen, - sizeof(struct udphdr), hlimit, tclass, opt, + sizeof(struct udphdr), &ipc6, &fl6, (struct rt6_info *)dst, - msg->msg_flags, dontfrag, &sockc); + msg->msg_flags, &sockc); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) err = udp_v6_send_skb(skb, &fl6); @@ -1292,14 +1293,12 @@ back_from_confirm: up->pending = AF_INET6; do_append_data: - if (dontfrag < 0) - dontfrag = np->dontfrag; + if (ipc6.dontfrag < 0) + ipc6.dontfrag = np->dontfrag; up->len += ulen; - err = ip6_append_data(sk, getfrag, msg, ulen, - sizeof(struct udphdr), hlimit, tclass, opt, &fl6, - (struct rt6_info *)dst, - corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, dontfrag, - &sockc); + err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), + &ipc6, &fl6, (struct rt6_info *)dst, + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, &sockc); if (err) udp_v6_flush_pending_frames(sk); else if (!corkreq) @@ -1342,8 +1341,8 @@ out: * seems like overkill. */ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - UDP6_INC_STATS_USER(sock_net(sk), - UDP_MIB_SNDBUFERRORS, is_udplite); + UDP6_INC_STATS(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); } return err; diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c index fcfbe579434a..d8b7267280c3 100644 --- a/net/irda/irlan/irlan_eth.c +++ b/net/irda/irlan/irlan_eth.c @@ -181,7 +181,7 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb, skb = new_skb; } - dev->trans_start = jiffies; + netif_trans_update(dev); len = skb->len; /* Now queue the packet in the transport layer */ diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index afca2eb4dfa7..6edfa9980314 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1376,9 +1376,9 @@ static int l2tp_tunnel_sock_create(struct net *net, memcpy(&udp_conf.peer_ip6, cfg->peer_ip6, sizeof(udp_conf.peer_ip6)); udp_conf.use_udp6_tx_checksums = - cfg->udp6_zero_tx_checksums; + ! cfg->udp6_zero_tx_checksums; udp_conf.use_udp6_rx_checksums = - cfg->udp6_zero_rx_checksums; + ! 
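The udpv6_sendmsg() changes replace three loose locals (hlimit, tclass, dontfrag) plus a separate opt pointer with a single struct ipcm6_cookie: it is initialised to "unset" (-1), filled by ip6_datagram_send_ctl() from ancillary data, and completed from per-socket defaults before the skb is built. A compact userspace sketch of the same pattern; only the field names are taken from the patch, the parsing and the defaults are stand-ins:

#include <stdio.h>

struct ipcm6_cookie {
        int hlimit;
        int tclass;
        int dontfrag;
};

static void parse_cmsgs(struct ipcm6_cookie *ipc6)
{
        /* pretend the caller passed IPV6_TCLASS = 0x10 in ancillary data */
        ipc6->tclass = 0x10;
}

int main(void)
{
        struct ipcm6_cookie ipc6 = { .hlimit = -1, .tclass = -1, .dontfrag = -1 };
        int sk_default_hlimit = 64, sk_default_tclass = 0, sk_default_dontfrag = 0;

        parse_cmsgs(&ipc6);

        /* fall back to socket defaults for anything still unset */
        if (ipc6.hlimit < 0)
                ipc6.hlimit = sk_default_hlimit;
        if (ipc6.tclass < 0)
                ipc6.tclass = sk_default_tclass;
        if (ipc6.dontfrag < 0)
                ipc6.dontfrag = sk_default_dontfrag;

        printf("hlimit=%d tclass=%d dontfrag=%d\n",
               ipc6.hlimit, ipc6.tclass, ipc6.dontfrag);
        return 0;
}
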
cfg->udp6_zero_rx_checksums; } else #endif { diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 46e07267e503..c6f5df1bed12 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -495,10 +495,8 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct dst_entry *dst = NULL; struct flowi6 fl6; struct sockcm_cookie sockc_unused = {0}; + struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; - int hlimit = -1; - int tclass = -1; - int dontfrag = -1; int transhdrlen = 4; /* zero session-id */ int ulen = len + transhdrlen; int err; @@ -520,6 +518,10 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = sk->sk_mark; + ipc6.hlimit = -1; + ipc6.tclass = -1; + ipc6.dontfrag = -1; + if (lsa) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; @@ -564,11 +566,11 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); + ipc6.opt = opt; - err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt, - &hlimit, &tclass, &dontfrag, - &sockc_unused); - if (err < 0) { + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, + &sockc_unused); + if (err < 0) { fl6_sock_release(flowlabel); return err; } @@ -588,6 +590,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); + ipc6.opt = opt; fl6.flowi6_proto = sk->sk_protocol; if (!ipv6_addr_any(daddr)) @@ -612,14 +615,14 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto out; } - if (hlimit < 0) - hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + if (ipc6.hlimit < 0) + ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (tclass < 0) - tclass = np->tclass; + if (ipc6.tclass < 0) + ipc6.tclass = np->tclass; - if (dontfrag < 0) - dontfrag = np->dontfrag; + if (ipc6.dontfrag < 0) + ipc6.dontfrag = np->dontfrag; if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; @@ -627,9 +630,9 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) back_from_confirm: lock_sock(sk); err = ip6_append_data(sk, ip_generic_getfrag, msg, - ulen, transhdrlen, hlimit, tclass, opt, + ulen, transhdrlen, &ipc6, &fl6, (struct rt6_info *)dst, - msg->msg_flags, dontfrag, &sockc_unused); + msg->msg_flags, &sockc_unused); if (err) ip6_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index e925037fa0df..6651a78e100c 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -97,3 +97,66 @@ u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) return tb_id; } EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); + +/** + * l3mdev_get_rt6_dst - IPv6 route lookup based on flow. 
Returns + * cached route for L3 master device if relevant + * to flow + * @net: network namespace for device index lookup + * @fl6: IPv6 flow struct for lookup + */ + +struct dst_entry *l3mdev_get_rt6_dst(struct net *net, + const struct flowi6 *fl6) +{ + struct dst_entry *dst = NULL; + struct net_device *dev; + + if (fl6->flowi6_oif) { + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); + if (dev && netif_is_l3_slave(dev)) + dev = netdev_master_upper_dev_get_rcu(dev); + + if (dev && netif_is_l3_master(dev) && + dev->l3mdev_ops->l3mdev_get_rt6_dst) + dst = dev->l3mdev_ops->l3mdev_get_rt6_dst(dev, fl6); + + rcu_read_unlock(); + } + + return dst; +} +EXPORT_SYMBOL_GPL(l3mdev_get_rt6_dst); + +/** + * l3mdev_get_saddr - get source address for a flow based on an interface + * enslaved to an L3 master device + * @net: network namespace for device index lookup + * @ifindex: Interface index + * @fl4: IPv4 flow struct + */ + +int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4) +{ + struct net_device *dev; + int rc = 0; + + if (ifindex) { + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, ifindex); + if (dev && netif_is_l3_slave(dev)) + dev = netdev_master_upper_dev_get_rcu(dev); + + if (dev && netif_is_l3_master(dev) && + dev->l3mdev_ops->l3mdev_get_saddr) + rc = dev->l3mdev_ops->l3mdev_get_saddr(dev, fl4); + + rcu_read_unlock(); + } + + return rc; +} +EXPORT_SYMBOL_GPL(l3mdev_get_saddr); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index b3c52e3f689a..8ae3ed97d95c 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -626,6 +626,7 @@ static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb) if (llc->cmsg_flags & LLC_CMSG_PKTINFO) { struct llc_pktinfo info; + memset(&info, 0, sizeof(info)); info.lpi_ifindex = llc_sk(skb->sk)->dev->ifindex; llc_pdu_decode_dsap(skb, &info.lpi_sap); llc_pdu_decode_da(skb, info.lpi_mac); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 6a33f0b4d839..c59af3eb9fa4 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1761,7 +1761,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, ret = dev_alloc_name(ndev, ndev->name); if (ret < 0) { - free_netdev(ndev); + ieee80211_if_free(ndev); return ret; } @@ -1847,7 +1847,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, ret = register_netdevice(ndev); if (ret) { - free_netdev(ndev); + ieee80211_if_free(ndev); return ret; } } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 85ca189bdc3d..2cb3c626cd43 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -104,6 +104,7 @@ static inline void ct_write_unlock_bh(unsigned int key) spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } +static void ip_vs_conn_expire(unsigned long data); /* * Returns hash value for IPVS connection entry @@ -453,10 +454,16 @@ ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af, } EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); +static void __ip_vs_conn_put_notimer(struct ip_vs_conn *cp) +{ + __ip_vs_conn_put(cp); + ip_vs_conn_expire((unsigned long)cp); +} + /* * Put back the conn and restart its timer with its timeout */ -void ip_vs_conn_put(struct ip_vs_conn *cp) +static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp) { unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ? 
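Both new l3mdev helpers use the same walk: resolve the flow's interface index, hop from an L3 slave to its master device, and, if the master provides the relevant l3mdev_ops callback, delegate the lookup to it. A schematic userspace model of that delegation; the struct layout and names below are simplified placeholders, not the kernel definitions:

#include <stdio.h>
#include <stddef.h>

struct netdev;                              /* forward declaration for the ops table */

struct l3mdev_ops {
        int (*get_saddr)(struct netdev *dev, unsigned int *saddr);
};

struct netdev {
        const char *name;
        int is_l3_slave;
        int is_l3_master;
        struct netdev *master;              /* set when is_l3_slave */
        const struct l3mdev_ops *ops;       /* set when is_l3_master */
};

static int vrf_get_saddr(struct netdev *dev, unsigned int *saddr)
{
        (void)dev;
        *saddr = 0x0a000001;                /* pretend the master picked 10.0.0.1 */
        return 0;
}

static const struct l3mdev_ops vrf_ops = { .get_saddr = vrf_get_saddr };

/* mirrors the shape of l3mdev_get_saddr(): slave -> master -> ops callback */
static int l3mdev_get_saddr_model(struct netdev *dev, unsigned int *saddr)
{
        if (dev && dev->is_l3_slave)
                dev = dev->master;

        if (dev && dev->is_l3_master && dev->ops && dev->ops->get_saddr)
                return dev->ops->get_saddr(dev, saddr);

        return 0;                           /* nothing to do for plain devices */
}

int main(void)
{
        struct netdev vrf = { .name = "vrf0", .is_l3_master = 1, .ops = &vrf_ops };
        struct netdev eth = { .name = "eth0", .is_l3_slave = 1, .master = &vrf };
        unsigned int saddr = 0;

        l3mdev_get_saddr_model(&eth, &saddr);
        printf("source 0x%08x via %s\n", saddr, vrf.name);
        return 0;
}
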
0 : cp->timeout; @@ -465,6 +472,16 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) __ip_vs_conn_put(cp); } +void ip_vs_conn_put(struct ip_vs_conn *cp) +{ + if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && + (atomic_read(&cp->refcnt) == 1) && + !timer_pending(&cp->timer)) + /* expire connection immediately */ + __ip_vs_conn_put_notimer(cp); + else + __ip_vs_conn_put_timer(cp); +} /* * Fill a no_client_port connection with a client port number @@ -819,7 +836,8 @@ static void ip_vs_conn_expire(unsigned long data) if (cp->control) ip_vs_control_del(cp); - if (cp->flags & IP_VS_CONN_F_NFCT) { + if ((cp->flags & IP_VS_CONN_F_NFCT) && + !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) { /* Do not access conntracks during subsys cleanup * because nf_conntrack_find_get can not be used after * conntrack cleanup for the net. @@ -834,7 +852,10 @@ static void ip_vs_conn_expire(unsigned long data) ip_vs_unbind_dest(cp); if (cp->flags & IP_VS_CONN_F_NO_CPORT) atomic_dec(&ip_vs_conn_no_cport_cnt); - call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + ip_vs_conn_rcu_free(&cp->rcu_head); + else + call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); atomic_dec(&ipvs->conn_count); return; } @@ -850,7 +871,7 @@ static void ip_vs_conn_expire(unsigned long data) if (ipvs->sync_state & IP_VS_STATE_MASTER) ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs)); - ip_vs_conn_put(cp); + __ip_vs_conn_put_timer(cp); } /* Modify timer, so that it expires as soon as possible. @@ -1240,6 +1261,16 @@ static inline int todrop_entry(struct ip_vs_conn *cp) return 1; } +static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp) +{ + struct ip_vs_service *svc; + + if (!cp->dest) + return false; + svc = rcu_dereference(cp->dest->svc); + return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET); +} + /* Called from keventd and must protect itself from softirqs */ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) { @@ -1254,11 +1285,16 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask; hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->flags & IP_VS_CONN_F_TEMPLATE) - /* connection template */ - continue; if (cp->ipvs != ipvs) continue; + if (cp->flags & IP_VS_CONN_F_TEMPLATE) { + if (atomic_read(&cp->n_control) || + !ip_vs_conn_ops_mode(cp)) + continue; + else + /* connection template of OPS */ + goto try_drop; + } if (cp->protocol == IPPROTO_TCP) { switch(cp->state) { case IP_VS_TCP_S_SYN_RECV: @@ -1286,6 +1322,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) continue; } } else { +try_drop: if (!todrop_entry(cp)) continue; } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index b9a4082afa3a..1207f20d24e4 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -68,6 +68,7 @@ EXPORT_SYMBOL(ip_vs_conn_put); #ifdef CONFIG_IP_VS_DEBUG EXPORT_SYMBOL(ip_vs_get_debug_level); #endif +EXPORT_SYMBOL(ip_vs_new_conn_out); static int ip_vs_net_id __read_mostly; /* netns cnt used for uniqueness */ @@ -611,7 +612,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ret = cp->packet_xmit(skb, cp, pd->pp, iph); /* do not touch skb anymore */ - atomic_inc(&cp->in_pkts); + if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) + atomic_inc(&cp->control->in_pkts); + else + atomic_inc(&cp->in_pkts); ip_vs_conn_put(cp); return ret; } @@ -1100,6 +1104,143 @@ static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, } } +/* Generic function to create new 
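The reworked ip_vs_conn_put() splits the old behaviour in two: normally the connection timer is re-armed (__ip_vs_conn_put_timer), but a one-packet-scheduling (OPS) connection whose last reference is being dropped and whose timer is not pending is expired on the spot, so short-lived UDP/OPS entries do not linger for a full timeout. A small sketch of just that decision, with stand-in types and helpers rather than the IPVS structures:

#include <stdio.h>
#include <stdbool.h>

#define F_ONE_PACKET 0x1

struct conn {
        unsigned int flags;
        int refcnt;
        bool timer_pending;
};

static void expire_now(struct conn *cp)  { (void)cp; printf("expire immediately\n"); }
static void rearm_timer(struct conn *cp) { (void)cp; printf("re-arm timer\n"); }

/* mirrors the new ip_vs_conn_put() decision */
static void conn_put(struct conn *cp)
{
        if ((cp->flags & F_ONE_PACKET) && cp->refcnt == 1 && !cp->timer_pending)
                expire_now(cp);             /* OPS: drop the entry right away */
        else
                rearm_timer(cp);            /* everything else keeps its timeout */
        cp->refcnt--;
}

int main(void)
{
        struct conn ops  = { .flags = F_ONE_PACKET, .refcnt = 1 };
        struct conn tcpc = { .flags = 0, .refcnt = 2, .timer_pending = true };

        conn_put(&ops);
        conn_put(&tcpc);
        return 0;
}
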
connections for outgoing RS packets + * + * Pre-requisites for successful connection creation: + * 1) Virtual Service is NOT fwmark based: + * In fwmark-VS actual vaddr and vport are unknown to IPVS + * 2) Real Server and Virtual Service were NOT configured without port: + * This is to allow match of different VS to the same RS ip-addr + */ +struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + __be16 dport, + __be16 cport) +{ + struct ip_vs_conn_param param; + struct ip_vs_conn *ct = NULL, *cp = NULL; + const union nf_inet_addr *vaddr, *daddr, *caddr; + union nf_inet_addr snet; + __be16 vport; + unsigned int flags; + + EnterFunction(12); + vaddr = &svc->addr; + vport = svc->port; + daddr = &iph->saddr; + caddr = &iph->daddr; + + /* check pre-requisites are satisfied */ + if (svc->fwmark) + return NULL; + if (!vport || !dport) + return NULL; + + /* for persistent service first create connection template */ + if (svc->flags & IP_VS_SVC_F_PERSISTENT) { + /* apply netmask the same way ingress-side does */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + ipv6_addr_prefix(&snet.in6, &caddr->in6, + (__force __u32)svc->netmask); + else +#endif + snet.ip = caddr->ip & svc->netmask; + /* fill params and create template if not existent */ + if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol, + &snet, 0, vaddr, + vport, ¶m) < 0) + return NULL; + ct = ip_vs_ct_in_get(¶m); + if (!ct) { + ct = ip_vs_conn_new(¶m, dest->af, daddr, dport, + IP_VS_CONN_F_TEMPLATE, dest, 0); + if (!ct) { + kfree(param.pe_data); + return NULL; + } + ct->timeout = svc->timeout; + } else { + kfree(param.pe_data); + } + } + + /* connection flags */ + flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) && + iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; + /* create connection */ + ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, + caddr, cport, vaddr, vport, ¶m); + cp = ip_vs_conn_new(¶m, dest->af, daddr, dport, flags, dest, 0); + if (!cp) { + if (ct) + ip_vs_conn_put(ct); + return NULL; + } + if (ct) { + ip_vs_control_add(cp, ct); + ip_vs_conn_put(ct); + } + ip_vs_conn_stats(cp, svc); + + /* return connection (will be used to handle outgoing packet) */ + IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u " + "d:%s:%u conn->flags:%X conn->refcnt:%d\n", + ip_vs_fwd_tag(cp), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + cp->flags, atomic_read(&cp->refcnt)); + LeaveFunction(12); + return cp; +} + +/* Handle outgoing packets which are considered requests initiated by + * real servers, so that subsequent responses from external client can be + * routed to the right real server. + * Used also for outgoing responses in OPS mode. + * + * Connection management is handled by persistent-engine specific callback. 
+ */ +static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum, + struct netns_ipvs *ipvs, + int af, struct sk_buff *skb, + const struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_conn *cp = NULL; + __be16 _ports[2], *pptr; + + if (hooknum == NF_INET_LOCAL_IN) + return NULL; + + pptr = frag_safe_skb_hp(skb, iph->len, + sizeof(_ports), _ports, iph); + if (!pptr) + return NULL; + + rcu_read_lock(); + dest = ip_vs_find_real_service(ipvs, af, iph->protocol, + &iph->saddr, pptr[0]); + if (dest) { + struct ip_vs_service *svc; + struct ip_vs_pe *pe; + + svc = rcu_dereference(dest->svc); + if (svc) { + pe = rcu_dereference(svc->pe); + if (pe && pe->conn_out) + cp = pe->conn_out(svc, dest, skb, iph, + pptr[0], pptr[1]); + } + } + rcu_read_unlock(); + + return cp; +} + /* Handle response packets: rewrite addresses and send away... */ static unsigned int @@ -1245,6 +1386,22 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in if (likely(cp)) return handle_response(af, skb, pd, cp, &iph, hooknum); + + /* Check for real-server-started requests */ + if (atomic_read(&ipvs->conn_out_counter)) { + /* Currently only for UDP: + * connection oriented protocols typically use + * ephemeral ports for outgoing connections, so + * related incoming responses would not match any VS + */ + if (pp->protocol == IPPROTO_UDP) { + cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph); + if (likely(cp)) + return handle_response(af, skb, pd, cp, &iph, + hooknum); + } + } + if (sysctl_nat_icmp_send(ipvs) && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || @@ -1837,6 +1994,9 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int if (ipvs->sync_state & IP_VS_STATE_MASTER) ip_vs_sync_conn(ipvs, cp, pkts); + else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) + /* increment is done inside ip_vs_sync_conn too */ + atomic_inc(&cp->control->in_pkts); ip_vs_conn_put(cp); return ret; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index f35ebc02fa5c..c3c809b2e712 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -567,6 +567,36 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, return false; } +/* Find real service record by <proto,addr,port>. + * In case of multiple records with the same <proto,addr,port>, only + * the first found record is returned. + * + * To be called under RCU lock. + */ +struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, + __u16 protocol, + const union nf_inet_addr *daddr, + __be16 dport) +{ + unsigned int hash; + struct ip_vs_dest *dest; + + /* Check for "full" addressed entries */ + hash = ip_vs_rs_hashkey(af, daddr, dport); + + hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { + if (dest->port == dport && + dest->af == af && + ip_vs_addr_equal(af, &dest->addr, daddr) && + (dest->protocol == protocol || dest->vfwmark)) { + /* HIT */ + return dest; + } + } + + return NULL; +} + /* Lookup destination by {addr,port} in the given service * Called under RCU lock. 
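The new ip_vs_find_real_service()/__ip_vs_rs_conn_out() pair lets IPVS handle packets that a real server originates: the outgoing packet's source address/port is looked up in the real-server hash, the owning virtual service and its persistence engine are fetched under RCU, and if the pe exposes a conn_out method (only the SIP pe does here, and only for UDP) it builds a connection so the external peer's replies map back to the same real server. A rough userspace model of that dispatch chain; all structures and names below are invented for illustration:

#include <stdio.h>
#include <stddef.h>

struct conn { int id; };
struct service;

struct pe {
        struct conn *(*conn_out)(struct service *svc, unsigned short sport);
};

struct service {
        const char *name;
        const struct pe *pe;
};

struct dest {                               /* a real-server record */
        unsigned int addr;
        unsigned short port;
        struct service *svc;
};

static struct conn sip_conn = { .id = 42 };

static struct conn *sip_conn_out(struct service *svc, unsigned short sport)
{
        (void)svc; (void)sport;
        return &sip_conn;                   /* pretend we created a connection */
}

static const struct pe sip_pe = { .conn_out = sip_conn_out };
static struct service sip_svc = { .name = "sip", .pe = &sip_pe };
static struct dest rs = { .addr = 0x0a000002, .port = 5060, .svc = &sip_svc };

/* find the real server matching the packet's source addr/port */
static struct dest *find_real_service(unsigned int saddr, unsigned short sport)
{
        if (rs.addr == saddr && rs.port == sport)
                return &rs;
        return NULL;
}

/* mirrors the shape of __ip_vs_rs_conn_out(): dest -> svc -> pe->conn_out */
static struct conn *rs_conn_out(unsigned int saddr, unsigned short sport)
{
        struct dest *d = find_real_service(saddr, sport);

        if (d && d->svc && d->svc->pe && d->svc->pe->conn_out)
                return d->svc->pe->conn_out(d->svc, sport);
        return NULL;
}

int main(void)
{
        struct conn *cp = rs_conn_out(0x0a000002, 5060);

        printf("conn %s (id=%d)\n", cp ? "created" : "not created",
               cp ? cp->id : -1);
        return 0;
}
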
*/ @@ -1253,6 +1283,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, atomic_inc(&ipvs->ftpsvc_counter); else if (svc->port == 0) atomic_inc(&ipvs->nullsvc_counter); + if (svc->pe && svc->pe->conn_out) + atomic_inc(&ipvs->conn_out_counter); ip_vs_start_estimator(ipvs, &svc->stats); @@ -1293,6 +1325,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) struct ip_vs_scheduler *sched = NULL, *old_sched; struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; + bool new_pe_conn_out, old_pe_conn_out; /* * Lookup the scheduler, by 'u->sched_name' @@ -1355,8 +1388,16 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) svc->netmask = u->netmask; old_pe = rcu_dereference_protected(svc->pe, 1); - if (pe != old_pe) + if (pe != old_pe) { rcu_assign_pointer(svc->pe, pe); + /* check for optional methods in new pe */ + new_pe_conn_out = (pe && pe->conn_out) ? true : false; + old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false; + if (new_pe_conn_out && !old_pe_conn_out) + atomic_inc(&svc->ipvs->conn_out_counter); + if (old_pe_conn_out && !new_pe_conn_out) + atomic_dec(&svc->ipvs->conn_out_counter); + } out: ip_vs_scheduler_put(old_sched); @@ -1389,6 +1430,8 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) /* Unbind persistence engine, keep svc->pe */ old_pe = rcu_dereference_protected(svc->pe, 1); + if (old_pe && old_pe->conn_out) + atomic_dec(&ipvs->conn_out_counter); ip_vs_pe_put(old_pe); /* @@ -3969,6 +4012,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) (unsigned long) ipvs); atomic_set(&ipvs->ftpsvc_counter, 0); atomic_set(&ipvs->nullsvc_counter, 0); + atomic_set(&ipvs->conn_out_counter, 0); /* procfs stats */ ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 30434fb133df..f04fd8df210b 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -93,6 +93,10 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) return; + /* Never alter conntrack for OPS conns (no reply is expected) */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return; + /* Alter reply only in original direction */ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) return; diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 0a6eb5c0d9e9..d07ef9e31c12 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -143,6 +143,20 @@ static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf) return cp->pe_data_len; } +static struct ip_vs_conn * +ip_vs_sip_conn_out(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + __be16 dport, + __be16 cport) +{ + if (likely(iph->protocol == IPPROTO_UDP)) + return ip_vs_new_conn_out(svc, dest, skb, iph, dport, cport); + /* currently no need to handle other than UDP */ + return NULL; +} + static struct ip_vs_pe ip_vs_sip_pe = { .name = "sip", @@ -153,6 +167,7 @@ static struct ip_vs_pe ip_vs_sip_pe = .ct_match = ip_vs_sip_ct_match, .hashkey_raw = ip_vs_sip_hashkey_raw, .show_pe_data = ip_vs_sip_show_pe_data, + .conn_out = ip_vs_sip_conn_out, }; static int __init ip_vs_sip_init(void) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 2fd607408998..566c64e3ec50 100644 --- 
a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -54,6 +54,7 @@ #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_helper.h> +#include <net/netns/hash.h> #define NF_CONNTRACK_VERSION "0.5.0" @@ -68,7 +69,12 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks); __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); +struct hlist_nulls_head *nf_conntrack_hash __read_mostly; +EXPORT_SYMBOL_GPL(nf_conntrack_hash); + +static __read_mostly struct kmem_cache *nf_conntrack_cachep; static __read_mostly spinlock_t nf_conntrack_locks_all_lock; +static __read_mostly seqcount_t nf_conntrack_generation; static __read_mostly bool nf_conntrack_locks_all; void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) @@ -107,7 +113,7 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, spin_lock_nested(&nf_conntrack_locks[h1], SINGLE_DEPTH_NESTING); } - if (read_seqcount_retry(&net->ct.generation, sequence)) { + if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { nf_conntrack_double_unlock(h1, h2); return true; } @@ -141,43 +147,43 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max); DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); -unsigned int nf_conntrack_hash_rnd __read_mostly; -EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd); +static unsigned int nf_conntrack_hash_rnd __read_mostly; -static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple) +static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, + const struct net *net) { unsigned int n; + u32 seed; + + get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); /* The direction must be ignored, so we hash everything up to the * destination ports (which is a multiple of 4) and treat the last * three bytes manually. 
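Because the conntrack table becomes shared by all network namespaces in this series, hash_conntrack_raw() mixes a per-namespace value (net_hash_mix()) into the lazily initialised random seed, and scale_hash()/reciprocal_scale() maps the 32-bit hash onto the current table size without a modulo. A simplified userspace sketch of that seeding and scaling, using a toy mixer in place of jhash2 (the reciprocal_scale multiply-shift is the same trick the kernel uses):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

static uint32_t hash_rnd;                   /* boot-time random seed */

/* toy stand-in for jhash2() over the tuple words */
static uint32_t mix(const uint32_t *words, int n, uint32_t seed)
{
        uint32_t h = seed;
        int i;

        for (i = 0; i < n; i++) {
                h ^= words[i];
                h *= 0x9e3779b1u;           /* multiplicative mixing step */
        }
        return h;
}

/* same multiply-shift mapping as the kernel's reciprocal_scale() */
static uint32_t reciprocal_scale(uint32_t val, uint32_t ep_ro)
{
        return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

static uint32_t hash_tuple(const uint32_t *tuple, int n,
                           uint32_t net_hash_mix, uint32_t htable_size)
{
        uint32_t seed = hash_rnd ^ net_hash_mix;  /* per-netns perturbation */

        return reciprocal_scale(mix(tuple, n, seed), htable_size);
}

int main(void)
{
        uint32_t tuple[4] = { 0x0a000001, 0x0a000002, 0x1f900050, 0x06 };

        hash_rnd = (uint32_t)random();
        printf("netns A -> bucket %u\n", hash_tuple(tuple, 4, 0x1111, 16384));
        printf("netns B -> bucket %u\n", hash_tuple(tuple, 4, 0x2222, 16384));
        return 0;
}

Without the per-netns mix, identical tuples in different namespaces would always collide in the shared table; with it, the same tuple lands in unrelated buckets per namespace.
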
*/ + seed = nf_conntrack_hash_rnd ^ net_hash_mix(net); n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); - return jhash2((u32 *)tuple, n, nf_conntrack_hash_rnd ^ + return jhash2((u32 *)tuple, n, seed ^ (((__force __u16)tuple->dst.u.all << 16) | tuple->dst.protonum)); } -static u32 __hash_bucket(u32 hash, unsigned int size) -{ - return reciprocal_scale(hash, size); -} - -static u32 hash_bucket(u32 hash, const struct net *net) +static u32 scale_hash(u32 hash) { - return __hash_bucket(hash, net->ct.htable_size); + return reciprocal_scale(hash, nf_conntrack_htable_size); } -static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, - unsigned int size) +static u32 __hash_conntrack(const struct net *net, + const struct nf_conntrack_tuple *tuple, + unsigned int size) { - return __hash_bucket(hash_conntrack_raw(tuple), size); + return reciprocal_scale(hash_conntrack_raw(tuple, net), size); } -static inline u_int32_t hash_conntrack(const struct net *net, - const struct nf_conntrack_tuple *tuple) +static u32 hash_conntrack(const struct net *net, + const struct nf_conntrack_tuple *tuple) { - return __hash_conntrack(tuple, net->ct.htable_size); + return scale_hash(hash_conntrack_raw(tuple, net)); } bool @@ -358,7 +364,7 @@ destroy_conntrack(struct nf_conntrack *nfct) } rcu_read_lock(); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); - if (l4proto && l4proto->destroy) + if (l4proto->destroy) l4proto->destroy(ct); rcu_read_unlock(); @@ -393,7 +399,7 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) local_bh_disable(); do { - sequence = read_seqcount_begin(&net->ct.generation); + sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); reply_hash = hash_conntrack(net, @@ -445,7 +451,8 @@ static void death_by_timeout(unsigned long ul_conntrack) static inline bool nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const struct nf_conntrack_zone *zone, + const struct net *net) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); @@ -454,7 +461,8 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, */ return nf_ct_tuple_equal(tuple, &h->tuple) && nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) && - nf_ct_is_confirmed(ct); + nf_ct_is_confirmed(ct) && + net_eq(net, nf_ct_net(ct)); } /* @@ -467,21 +475,23 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_head *ct_hash; struct hlist_nulls_node *n; - unsigned int bucket = hash_bucket(hash, net); + unsigned int bucket, sequence; - /* Disable BHs the entire time since we normally need to disable them - * at least once for the stats anyway. 
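____nf_conntrack_find() no longer pins a per-netns table; it samples the global nf_conntrack_generation seqcount, derives the bucket and table pointer, and retries the sampling if a resize ran in between, while nf_ct_key_equal() additionally checks net_eq() so entries from other namespaces sharing a bucket are skipped. The read-side retry idiom, reduced to a userspace toy (a real seqcount also enforces memory ordering, which is omitted here):

#include <stdio.h>

static unsigned int generation;             /* even = stable, odd = writer active */
static unsigned int table_size = 16384;

static unsigned int read_begin(void)    { return generation; }
static int read_retry(unsigned int seq) { return (seq & 1) || seq != generation; }

static unsigned int pick_bucket(unsigned int hash)
{
        unsigned int seq, bucket, size;

        do {
                seq = read_begin();         /* sample the generation counter */
                size = table_size;          /* read values a resize may change */
                bucket = hash % size;
        } while (read_retry(seq));          /* redo if a resize ran concurrently */

        return bucket;
}

int main(void)
{
        printf("bucket=%u\n", pick_bucket(0xdeadbeef));
        return 0;
}
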
- */ - local_bh_disable(); begin: - hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) { - if (nf_ct_key_equal(h, tuple, zone)) { - NF_CT_STAT_INC(net, found); - local_bh_enable(); + do { + sequence = read_seqcount_begin(&nf_conntrack_generation); + bucket = scale_hash(hash); + ct_hash = nf_conntrack_hash; + } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + + hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { + if (nf_ct_key_equal(h, tuple, zone, net)) { + NF_CT_STAT_INC_ATOMIC(net, found); return h; } - NF_CT_STAT_INC(net, searched); + NF_CT_STAT_INC_ATOMIC(net, searched); } /* * if the nulls value we got at the end of this lookup is @@ -489,10 +499,9 @@ begin: * We probably met an item that was moved to another chain. */ if (get_nulls_value(n) != bucket) { - NF_CT_STAT_INC(net, search_restart); + NF_CT_STAT_INC_ATOMIC(net, search_restart); goto begin; } - local_bh_enable(); return NULL; } @@ -514,7 +523,7 @@ begin: !atomic_inc_not_zero(&ct->ct_general.use))) h = NULL; else { - if (unlikely(!nf_ct_key_equal(h, tuple, zone))) { + if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) { nf_ct_put(ct); goto begin; } @@ -530,7 +539,7 @@ nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { return __nf_conntrack_find_get(net, zone, tuple, - hash_conntrack_raw(tuple)); + hash_conntrack_raw(tuple, net)); } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -538,12 +547,10 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, unsigned int hash, unsigned int reply_hash) { - struct net *net = nf_ct_net(ct); - hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &net->ct.hash[hash]); + &nf_conntrack_hash[hash]); hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, - &net->ct.hash[reply_hash]); + &nf_conntrack_hash[reply_hash]); } int @@ -560,7 +567,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) local_bh_disable(); do { - sequence = read_seqcount_begin(&net->ct.generation); + sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); reply_hash = hash_conntrack(net, @@ -568,17 +575,14 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* See if there's one in the list already, including reverse */ - hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + zone, net)) goto out; - hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + zone, net)) goto out; add_timer(&ct->timeout); @@ -599,6 +603,62 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); +static inline void nf_ct_acct_update(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int len) +{ + struct nf_conn_acct *acct; + + acct = nf_conn_acct_find(ct); + if (acct) { + struct nf_conn_counter *counter = acct->counter; + + 
atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); + atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes); + } +} + +static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + const struct nf_conn *loser_ct) +{ + struct nf_conn_acct *acct; + + acct = nf_conn_acct_find(loser_ct); + if (acct) { + struct nf_conn_counter *counter = acct->counter; + unsigned int bytes; + + /* u32 should be fine since we must have seen one packet. */ + bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes); + nf_ct_acct_update(ct, ctinfo, bytes); + } +} + +/* Resolve race on insertion if this protocol allows this. */ +static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + struct nf_conntrack_tuple_hash *h) +{ + /* This is the conntrack entry already in hashes that won race. */ + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + struct nf_conntrack_l4proto *l4proto; + + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + if (l4proto->allow_clash && + !nf_ct_is_dying(ct) && + atomic_inc_not_zero(&ct->ct_general.use)) { + nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct); + nf_conntrack_put(skb->nfct); + /* Assign conntrack already in hashes to this skbuff. Don't + * modify skb->nfctinfo to ensure consistent stateful filtering. + */ + skb->nfct = &ct->ct_general; + return NF_ACCEPT; + } + NF_CT_STAT_INC(net, drop); + return NF_DROP; +} + /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff *skb) @@ -613,6 +673,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) enum ip_conntrack_info ctinfo; struct net *net; unsigned int sequence; + int ret = NF_DROP; ct = nf_ct_get(skb, &ctinfo); net = nf_ct_net(ct); @@ -628,10 +689,10 @@ __nf_conntrack_confirm(struct sk_buff *skb) local_bh_disable(); do { - sequence = read_seqcount_begin(&net->ct.generation); + sequence = read_seqcount_begin(&nf_conntrack_generation); /* reuse the hash saved before */ hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; - hash = hash_bucket(hash, net); + hash = scale_hash(hash); reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); @@ -655,23 +716,22 @@ __nf_conntrack_confirm(struct sk_buff *skb) */ nf_ct_del_from_dying_or_unconfirmed_list(ct); - if (unlikely(nf_ct_is_dying(ct))) - goto out; + if (unlikely(nf_ct_is_dying(ct))) { + nf_ct_add_to_dying_list(ct); + goto dying; + } /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. 
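nf_ct_resolve_clash() handles the case where two packets of the same new flow were processed in parallel and the second one loses the race to confirm its conntrack: if the protocol sets allow_clash (UDP and UDP-lite later in this series) and the winning entry is still alive, the losing packet is re-pointed at the winner and the loser's accounted bytes are folded in via nf_ct_acct_merge(), instead of being dropped. A stripped-down model of that merge, with made-up structures standing in for nf_conn and its accounting extension:

#include <stdio.h>
#include <stdbool.h>

struct conn {
        bool dying;
        bool allow_clash;
        unsigned long packets;
        unsigned long bytes;
};

/* fold the loser's counters into the entry that won the insertion race */
static void acct_merge(struct conn *winner, const struct conn *loser)
{
        winner->packets += loser->packets;
        winner->bytes   += loser->bytes;
}

/* returns true if the packet may continue using "winner" instead of dropping */
static bool resolve_clash(struct conn *winner, struct conn *loser)
{
        if (!winner->allow_clash || winner->dying)
                return false;               /* NF_DROP in the real code */

        acct_merge(winner, loser);
        return true;                        /* NF_ACCEPT, packet now uses "winner" */
}

int main(void)
{
        struct conn winner = { .allow_clash = true, .packets = 1, .bytes = 120 };
        struct conn loser  = { .allow_clash = true, .packets = 1, .bytes = 80 };

        printf("%s, winner now %lu pkts / %lu bytes\n",
               resolve_clash(&winner, &loser) ? "accepted" : "dropped",
               winner.packets, winner.bytes);
        return 0;
}
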
*/ - hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + zone, net)) goto out; - hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + zone, net)) goto out; /* Timer relative to confirmation time, not original @@ -710,10 +770,12 @@ __nf_conntrack_confirm(struct sk_buff *skb) out: nf_ct_add_to_dying_list(ct); + ret = nf_ct_resolve_clash(net, skb, ctinfo, h); +dying: nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert_failed); local_bh_enable(); - return NF_DROP; + return ret; } EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); @@ -726,29 +788,31 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, struct net *net = nf_ct_net(ignored_conntrack); const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_head *ct_hash; + unsigned int hash, sequence; struct hlist_nulls_node *n; struct nf_conn *ct; - unsigned int hash; zone = nf_ct_zone(ignored_conntrack); - hash = hash_conntrack(net, tuple); - /* Disable BHs the entire time since we need to disable them at - * least once for the stats anyway. - */ - rcu_read_lock_bh(); - hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { + rcu_read_lock(); + do { + sequence = read_seqcount_begin(&nf_conntrack_generation); + hash = hash_conntrack(net, tuple); + ct_hash = nf_conntrack_hash; + } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + + hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (ct != ignored_conntrack && - nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h))) { - NF_CT_STAT_INC(net, found); - rcu_read_unlock_bh(); + nf_ct_key_equal(h, tuple, zone, net)) { + NF_CT_STAT_INC_ATOMIC(net, found); + rcu_read_unlock(); return 1; } - NF_CT_STAT_INC(net, searched); + NF_CT_STAT_INC_ATOMIC(net, searched); } - rcu_read_unlock_bh(); + rcu_read_unlock(); return 0; } @@ -762,71 +826,63 @@ static noinline int early_drop(struct net *net, unsigned int _hash) { /* Use oldest entry, which is roughly LRU */ struct nf_conntrack_tuple_hash *h; - struct nf_conn *ct = NULL, *tmp; + struct nf_conn *tmp; struct hlist_nulls_node *n; - unsigned int i = 0, cnt = 0; - int dropped = 0; - unsigned int hash, sequence; + unsigned int i, hash, sequence; + struct nf_conn *ct = NULL; spinlock_t *lockp; + bool ret = false; + + i = 0; local_bh_disable(); restart: - sequence = read_seqcount_begin(&net->ct.generation); - hash = hash_bucket(_hash, net); - for (; i < net->ct.htable_size; i++) { + sequence = read_seqcount_begin(&nf_conntrack_generation); + for (; i < NF_CT_EVICTION_RANGE; i++) { + hash = scale_hash(_hash++); lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; nf_conntrack_lock(lockp); - if (read_seqcount_retry(&net->ct.generation, sequence)) { + if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { spin_unlock(lockp); goto restart; } - hlist_nulls_for_each_entry_rcu(h, 
n, &net->ct.hash[hash], - hnnode) { + hlist_nulls_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], + hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); - if (!test_bit(IPS_ASSURED_BIT, &tmp->status) && - !nf_ct_is_dying(tmp) && - atomic_inc_not_zero(&tmp->ct_general.use)) { + + if (test_bit(IPS_ASSURED_BIT, &tmp->status) || + !net_eq(nf_ct_net(tmp), net) || + nf_ct_is_dying(tmp)) + continue; + + if (atomic_inc_not_zero(&tmp->ct_general.use)) { ct = tmp; break; } - cnt++; } - hash = (hash + 1) % net->ct.htable_size; spin_unlock(lockp); - - if (ct || cnt >= NF_CT_EVICTION_RANGE) + if (ct) break; - } + local_bh_enable(); if (!ct) - return dropped; + return false; - if (del_timer(&ct->timeout)) { + /* kill only if in same netns -- might have moved due to + * SLAB_DESTROY_BY_RCU rules + */ + if (net_eq(nf_ct_net(ct), net) && del_timer(&ct->timeout)) { if (nf_ct_delete(ct, 0, 0)) { - dropped = 1; NF_CT_STAT_INC_ATOMIC(net, early_drop); + ret = true; } } - nf_ct_put(ct); - return dropped; -} - -void init_nf_conntrack_hash_rnd(void) -{ - unsigned int rand; - /* - * Why not initialize nf_conntrack_rnd in a "init()" function ? - * Because there isn't enough entropy when system initializing, - * and we initialize it as late as possible. - */ - do { - get_random_bytes(&rand, sizeof(rand)); - } while (!rand); - cmpxchg(&nf_conntrack_hash_rnd, 0, rand); + nf_ct_put(ct); + return ret; } static struct nf_conn * @@ -838,12 +894,6 @@ __nf_conntrack_alloc(struct net *net, { struct nf_conn *ct; - if (unlikely(!nf_conntrack_hash_rnd)) { - init_nf_conntrack_hash_rnd(); - /* recompute the hash as nf_conntrack_hash_rnd is initialized */ - hash = hash_conntrack_raw(orig); - } - /* We don't want any race condition at early drop stage */ atomic_inc(&net->ct.count); @@ -860,7 +910,7 @@ __nf_conntrack_alloc(struct net *net, * Do not use kmem_cache_zalloc(), as this cache uses * SLAB_DESTROY_BY_RCU. 
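The rewritten early_drop() walks at most NF_CT_EVICTION_RANGE buckets starting from a position derived from the new entry's hash, skips entries that are assured, already dying, or belong to another namespace (the table is shared now), and evicts the first remaining candidate it can take a reference on. A condensed userspace sketch of that bounded scan over a bucketed list, with simplified types; eviction here just means unlinking the node:

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

#define NBUCKETS        8
#define EVICTION_RANGE  4

struct entry {
        struct entry *next;
        bool assured;
        bool dying;
        int netns_id;
};

static struct entry *buckets[NBUCKETS];

/* scan up to EVICTION_RANGE buckets from "start" for a droppable entry */
static struct entry *early_drop(unsigned int start, int netns_id)
{
        unsigned int i;

        for (i = 0; i < EVICTION_RANGE; i++) {
                unsigned int b = (start + i) % NBUCKETS;
                struct entry **pp;

                for (pp = &buckets[b]; *pp; pp = &(*pp)->next) {
                        struct entry *e = *pp;

                        if (e->assured || e->dying || e->netns_id != netns_id)
                                continue;   /* not a valid eviction candidate */

                        *pp = e->next;      /* unlink: the "drop" */
                        return e;
                }
        }
        return NULL;                        /* nothing evictable in range */
}

int main(void)
{
        struct entry keep = { .assured = true, .netns_id = 1 };
        struct entry victim = { .netns_id = 1 };

        keep.next = &victim;
        buckets[3] = &keep;

        printf("evicted: %s\n", early_drop(3, 1) ? "yes" : "no");
        return 0;
}
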
*/ - ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp); + ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); if (ct == NULL) goto out; @@ -887,7 +937,7 @@ __nf_conntrack_alloc(struct net *net, atomic_set(&ct->ct_general.use, 0); return ct; out_free: - kmem_cache_free(net->ct.nf_conntrack_cachep, ct); + kmem_cache_free(nf_conntrack_cachep, ct); out: atomic_dec(&net->ct.count); return ERR_PTR(-ENOMEM); @@ -914,7 +964,7 @@ void nf_conntrack_free(struct nf_conn *ct) nf_ct_ext_destroy(ct); nf_ct_ext_free(ct); - kmem_cache_free(net->ct.nf_conntrack_cachep, ct); + kmem_cache_free(nf_conntrack_cachep, ct); smp_mb__before_atomic(); atomic_dec(&net->ct.count); } @@ -1061,7 +1111,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); - hash = hash_conntrack_raw(&tuple); + hash = hash_conntrack_raw(&tuple, net); h = __nf_conntrack_find_get(net, zone, &tuple, hash); if (!h) { h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, @@ -1270,17 +1320,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, } acct: - if (do_acct) { - struct nf_conn_acct *acct; - - acct = nf_conn_acct_find(ct); - if (acct) { - struct nf_conn_counter *counter = acct->counter; - - atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); - atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes); - } - } + if (do_acct) + nf_ct_acct_update(ct, ctinfo, skb->len); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); @@ -1289,18 +1330,8 @@ bool __nf_ct_kill_acct(struct nf_conn *ct, const struct sk_buff *skb, int do_acct) { - if (do_acct) { - struct nf_conn_acct *acct; - - acct = nf_conn_acct_find(ct); - if (acct) { - struct nf_conn_counter *counter = acct->counter; - - atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); - atomic64_add(skb->len - skb_network_offset(skb), - &counter[CTINFO2DIR(ctinfo)].bytes); - } - } + if (do_acct) + nf_ct_acct_update(ct, ctinfo, skb->len); if (del_timer(&ct->timeout)) { ct->timeout.function((unsigned long)ct); @@ -1396,16 +1427,17 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), int cpu; spinlock_t *lockp; - for (; *bucket < net->ct.htable_size; (*bucket)++) { + for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; local_bh_disable(); nf_conntrack_lock(lockp); - if (*bucket < net->ct.htable_size) { - hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { + if (*bucket < nf_conntrack_htable_size) { + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = nf_ct_tuplehash_to_ctrack(h); - if (iter(ct, data)) + if (net_eq(nf_ct_net(ct), net) && + iter(ct, data)) goto found; } } @@ -1443,6 +1475,9 @@ void nf_ct_iterate_cleanup(struct net *net, might_sleep(); + if (atomic_read(&net->ct.count) == 0) + return; + while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... 
*/ if (del_timer(&ct->timeout)) @@ -1494,6 +1529,8 @@ void nf_conntrack_cleanup_end(void) while (untrack_refs() > 0) schedule(); + nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); + #ifdef CONFIG_NF_CONNTRACK_ZONES nf_ct_extend_unregister(&nf_ct_zone_extend); #endif @@ -1544,15 +1581,12 @@ i_see_dead_people: } list_for_each_entry(net, net_exit_list, exit_list) { - nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); nf_conntrack_proto_pernet_fini(net); nf_conntrack_helper_pernet_fini(net); nf_conntrack_ecache_pernet_fini(net); nf_conntrack_tstamp_pernet_fini(net); nf_conntrack_acct_pernet_fini(net); nf_conntrack_expect_pernet_fini(net); - kmem_cache_destroy(net->ct.nf_conntrack_cachep); - kfree(net->ct.slabname); free_percpu(net->ct.stat); free_percpu(net->ct.pcpu_lists); } @@ -1607,7 +1641,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) local_bh_disable(); nf_conntrack_all_lock(); - write_seqcount_begin(&init_net.ct.generation); + write_seqcount_begin(&nf_conntrack_generation); /* Lookups in the old hash might happen in parallel, which means we * might get false negatives during connection lookup. New connections @@ -1615,26 +1649,28 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) * though since that required taking the locks. */ - for (i = 0; i < init_net.ct.htable_size; i++) { - while (!hlist_nulls_empty(&init_net.ct.hash[i])) { - h = hlist_nulls_entry(init_net.ct.hash[i].first, - struct nf_conntrack_tuple_hash, hnnode); + for (i = 0; i < nf_conntrack_htable_size; i++) { + while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { + h = hlist_nulls_entry(nf_conntrack_hash[i].first, + struct nf_conntrack_tuple_hash, hnnode); ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&h->hnnode); - bucket = __hash_conntrack(&h->tuple, hashsize); + bucket = __hash_conntrack(nf_ct_net(ct), + &h->tuple, hashsize); hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } - old_size = init_net.ct.htable_size; - old_hash = init_net.ct.hash; + old_size = nf_conntrack_htable_size; + old_hash = nf_conntrack_hash; - init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; - init_net.ct.hash = hash; + nf_conntrack_hash = hash; + nf_conntrack_htable_size = hashsize; - write_seqcount_end(&init_net.ct.generation); + write_seqcount_end(&nf_conntrack_generation); nf_conntrack_all_unlock(); local_bh_enable(); + synchronize_net(); nf_ct_free_hashtable(old_hash, old_size); return 0; } @@ -1655,7 +1691,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or); int nf_conntrack_init_start(void) { int max_factor = 8; - int i, ret, cpu; + int ret = -ENOMEM; + int i, cpu; + + seqcount_init(&nf_conntrack_generation); for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_conntrack_locks[i]); @@ -1682,8 +1721,19 @@ int nf_conntrack_init_start(void) * entries. 
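nf_conntrack_set_hashsize() can now resize the single global table: with all bucket locks held and the nf_conntrack_generation seqcount bumped, every entry is pulled off the old table, rehashed against the new size with the netns-aware hash, and inserted into the new array; only then are nf_conntrack_hash and nf_conntrack_htable_size switched, and the old array is freed after synchronize_net() so late readers cannot trip over it. The core rehash loop looks roughly like the userspace sketch below (single-threaded, so the locking and the RCU grace period are left out):

#include <stdio.h>

struct node {
        struct node *next;
        unsigned int hash;                  /* full 32-bit hash of the tuple */
};

/* move every node from old[] (old_size buckets) into new_tbl[] (new_size) */
static void rehash(struct node **old, unsigned int old_size,
                   struct node **new_tbl, unsigned int new_size)
{
        unsigned int i;

        for (i = 0; i < old_size; i++) {
                while (old[i]) {
                        struct node *n = old[i];
                        unsigned int b = n->hash % new_size;

                        old[i] = n->next;   /* unlink from the old bucket */
                        n->next = new_tbl[b];
                        new_tbl[b] = n;     /* push onto the new bucket */
                }
        }
}

int main(void)
{
        struct node a = { .hash = 7 }, b = { .hash = 21 };
        struct node *old[4] = { NULL }, *new_tbl[8] = { NULL };

        a.next = &b;
        old[3] = &a;                        /* both start in old bucket 3 */

        rehash(old, 4, new_tbl, 8);
        printf("bucket7=%p bucket5=%p\n", (void *)new_tbl[7], (void *)new_tbl[5]);
        return 0;
}
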
*/ max_factor = 4; } + + nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); + if (!nf_conntrack_hash) + return -ENOMEM; + nf_conntrack_max = max_factor * nf_conntrack_htable_size; + nf_conntrack_cachep = kmem_cache_create("nf_conntrack", + sizeof(struct nf_conn), 0, + SLAB_DESTROY_BY_RCU, NULL); + if (!nf_conntrack_cachep) + goto err_cachep; + printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n", NF_CONNTRACK_VERSION, nf_conntrack_htable_size, nf_conntrack_max); @@ -1760,6 +1810,9 @@ err_tstamp: err_acct: nf_conntrack_expect_fini(); err_expect: + kmem_cache_destroy(nf_conntrack_cachep); +err_cachep: + nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); return ret; } @@ -1783,7 +1836,6 @@ int nf_conntrack_init_net(struct net *net) int cpu; atomic_set(&net->ct.count, 0); - seqcount_init(&net->ct.generation); net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); if (!net->ct.pcpu_lists) @@ -1801,24 +1853,6 @@ int nf_conntrack_init_net(struct net *net) if (!net->ct.stat) goto err_pcpu_lists; - net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net); - if (!net->ct.slabname) - goto err_slabname; - - net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname, - sizeof(struct nf_conn), 0, - SLAB_DESTROY_BY_RCU, NULL); - if (!net->ct.nf_conntrack_cachep) { - printk(KERN_ERR "Unable to create nf_conn slab cache\n"); - goto err_cache; - } - - net->ct.htable_size = nf_conntrack_htable_size; - net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1); - if (!net->ct.hash) { - printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); - goto err_hash; - } ret = nf_conntrack_expect_pernet_init(net); if (ret < 0) goto err_expect; @@ -1850,12 +1884,6 @@ err_tstamp: err_acct: nf_conntrack_expect_pernet_fini(net); err_expect: - nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); -err_hash: - kmem_cache_destroy(net->ct.nf_conntrack_cachep); -err_cache: - kfree(net->ct.slabname); -err_slabname: free_percpu(net->ct.stat); err_pcpu_lists: free_percpu(net->ct.pcpu_lists); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 278927ab0948..9e3693128313 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -24,6 +24,7 @@ #include <linux/moduleparam.h> #include <linux/export.h> #include <net/net_namespace.h> +#include <net/netns/hash.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> @@ -35,9 +36,13 @@ unsigned int nf_ct_expect_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); +struct hlist_head *nf_ct_expect_hash __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_expect_hash); + unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; +static unsigned int nf_ct_expect_hashrnd __read_mostly; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, @@ -72,21 +77,32 @@ static void nf_ct_expectation_timed_out(unsigned long ul_expect) nf_ct_expect_put(exp); } -static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple) +static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple) { - unsigned int hash; + unsigned int hash, seed; - if (unlikely(!nf_conntrack_hash_rnd)) { - init_nf_conntrack_hash_rnd(); - } + get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd)); + + seed = nf_ct_expect_hashrnd ^ net_hash_mix(n); hash = jhash2(tuple->dst.u3.all, 
ARRAY_SIZE(tuple->dst.u3.all), (((tuple->dst.protonum ^ tuple->src.l3num) << 16) | - (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd); + (__force __u16)tuple->dst.u.all) ^ seed); return reciprocal_scale(hash, nf_ct_expect_hsize); } +static bool +nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_expect *i, + const struct nf_conntrack_zone *zone, + const struct net *net) +{ + return nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && + net_eq(net, nf_ct_net(i->master)) && + nf_ct_zone_equal_any(i->master, zone); +} + struct nf_conntrack_expect * __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, @@ -98,10 +114,9 @@ __nf_ct_expect_find(struct net *net, if (!net->ct.expect_count) return NULL; - h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) { - if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone_equal_any(i->master, zone)) + h = nf_ct_expect_dst_hash(net, tuple); + hlist_for_each_entry_rcu(i, &nf_ct_expect_hash[h], hnode) { + if (nf_ct_exp_equal(tuple, i, zone, net)) return i; } return NULL; @@ -139,11 +154,10 @@ nf_ct_find_expectation(struct net *net, if (!net->ct.expect_count) return NULL; - h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) { + h = nf_ct_expect_dst_hash(net, tuple); + hlist_for_each_entry(i, &nf_ct_expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && - nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone_equal_any(i->master, zone)) { + nf_ct_exp_equal(tuple, i, zone, net)) { exp = i; break; } @@ -223,6 +237,7 @@ static inline int expect_clash(const struct nf_conntrack_expect *a, } return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) && + net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } @@ -232,6 +247,7 @@ static inline int expect_matches(const struct nf_conntrack_expect *a, return a->master == b->master && a->class == b->class && nf_ct_tuple_equal(&a->tuple, &b->tuple) && nf_ct_tuple_mask_equal(&a->mask, &b->mask) && + net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } @@ -342,7 +358,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) struct nf_conn_help *master_help = nfct_help(exp->master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(exp); - unsigned int h = nf_ct_expect_dst_hash(&exp->tuple); + unsigned int h = nf_ct_expect_dst_hash(net, &exp->tuple); /* two references : one for hash insert, one for the timer */ atomic_add(2, &exp->use); @@ -350,7 +366,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) hlist_add_head(&exp->lnode, &master_help->expectations); master_help->expecting[exp->class]++; - hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]); + hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); net->ct.expect_count++; setup_timer(&exp->timeout, nf_ct_expectation_timed_out, @@ -401,8 +417,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) ret = -ESHUTDOWN; goto out; } - h = nf_ct_expect_dst_hash(&expect->tuple); - hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) { + h = nf_ct_expect_dst_hash(net, &expect->tuple); + hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) { if (expect_matches(i, expect)) { if (del_timer(&i->timeout)) { nf_ct_unlink_expect(i); @@ -468,12 +484,11 @@ struct ct_expect_iter_state { 
static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { - struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + n = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); if (n) return n; } @@ -483,14 +498,13 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { - struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + head = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); } return head; } @@ -623,28 +637,13 @@ module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); int nf_conntrack_expect_pernet_init(struct net *net) { - int err = -ENOMEM; - net->ct.expect_count = 0; - net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); - if (net->ct.expect_hash == NULL) - goto err1; - - err = exp_proc_init(net); - if (err < 0) - goto err2; - - return 0; -err2: - nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); -err1: - return err; + return exp_proc_init(net); } void nf_conntrack_expect_pernet_fini(struct net *net) { exp_proc_remove(net); - nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); } int nf_conntrack_expect_init(void) @@ -660,6 +659,13 @@ int nf_conntrack_expect_init(void) 0, 0, NULL); if (!nf_ct_expect_cachep) return -ENOMEM; + + nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); + if (!nf_ct_expect_hash) { + kmem_cache_destroy(nf_ct_expect_cachep); + return -ENOMEM; + } + return 0; } @@ -667,4 +673,5 @@ void nf_conntrack_expect_fini(void) { rcu_barrier(); /* Wait for call_rcu() before destroy */ kmem_cache_destroy(nf_ct_expect_cachep); + nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_hsize); } diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 3b40ec575cd5..f703adb7e5f7 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -38,10 +38,10 @@ unsigned int nf_ct_helper_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_helper_hsize); static unsigned int nf_ct_helper_count __read_mostly; -static bool nf_ct_auto_assign_helper __read_mostly = true; +static bool nf_ct_auto_assign_helper __read_mostly = false; module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644); MODULE_PARM_DESC(nf_conntrack_helper, - "Enable automatic conntrack helper assignment (default 1)"); + "Enable automatic conntrack helper assignment (default 0)"); #ifdef CONFIG_SYSCTL static struct ctl_table helper_sysctl_table[] = { @@ -400,7 +400,7 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, - &net->ct.expect_hash[i], hnode) { + &nf_ct_expect_hash[i], hnode) { struct nf_conn_help *help = nfct_help(exp->master); if ((rcu_dereference_protected( help->helper, @@ -424,10 +424,10 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, spin_unlock_bh(&pcpu->lock); } local_bh_disable(); - for (i = 0; i < 
net->ct.htable_size; i++) { + for (i = 0; i < nf_conntrack_htable_size; i++) { nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); - if (i < net->ct.htable_size) { - hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + if (i < nf_conntrack_htable_size) { + hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) unhelp(h, me); } spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 294a8e28cec4..a18d1ceabad5 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -824,19 +824,22 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) last = (struct nf_conn *)cb->args[1]; local_bh_disable(); - for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) { + for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { restart: lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS]; nf_conntrack_lock(lockp); - if (cb->args[0] >= net->ct.htable_size) { + if (cb->args[0] >= nf_conntrack_htable_size) { spin_unlock(lockp); goto out; } - hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]], - hnnode) { + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]], + hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = nf_ct_tuplehash_to_ctrack(h); + if (!net_eq(net, nf_ct_net(ct))) + continue; + /* Dump entries of a given L3 protocol number. * If it is not specified, ie. l3proto == 0, * then dump everything. */ @@ -2629,10 +2632,14 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) last = (struct nf_conntrack_expect *)cb->args[1]; for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { restart: - hlist_for_each_entry(exp, &net->ct.expect_hash[cb->args[0]], + hlist_for_each_entry(exp, &nf_ct_expect_hash[cb->args[0]], hnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; + + if (!net_eq(nf_ct_net(exp->master), net)) + continue; + if (cb->args[1]) { if (exp != last) continue; @@ -2883,8 +2890,12 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, - &net->ct.expect_hash[i], + &nf_ct_expect_hash[i], hnode) { + + if (!net_eq(nf_ct_exp_net(exp), net)) + continue; + m_help = nfct_help(exp->master); if (!strcmp(m_help->helper->name, name) && del_timer(&exp->timeout)) { @@ -2901,8 +2912,12 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, - &net->ct.expect_hash[i], + &nf_ct_expect_hash[i], hnode) { + + if (!net_eq(nf_ct_exp_net(exp), net)) + continue; + if (del_timer(&exp->timeout)) { nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 478f92f834b6..4fd040575ffe 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -309,6 +309,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly = .l3proto = PF_INET, .l4proto = IPPROTO_UDP, .name = "udp", + .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, .print_tuple = udp_print_tuple, @@ -341,6 +342,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = .l3proto = PF_INET6, .l4proto = IPPROTO_UDP, .name = "udp", + 
.allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, .print_tuple = udp_print_tuple, diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 1ac8ee13a873..9d692f5adb94 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -274,6 +274,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = .l3proto = PF_INET, .l4proto = IPPROTO_UDPLITE, .name = "udplite", + .allow_clash = true, .pkt_to_tuple = udplite_pkt_to_tuple, .invert_tuple = udplite_invert_tuple, .print_tuple = udplite_print_tuple, @@ -306,6 +307,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = .l3proto = PF_INET6, .l4proto = IPPROTO_UDPLITE, .name = "udplite", + .allow_clash = true, .pkt_to_tuple = udplite_pkt_to_tuple, .invert_tuple = udplite_invert_tuple, .print_tuple = udplite_print_tuple, diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 0f1a45bcacb2..f87e84ebcec3 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -54,14 +54,13 @@ struct ct_iter_state { static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) { - struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < net->ct.htable_size; + st->bucket < nf_conntrack_htable_size; st->bucket++) { - n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); + n = rcu_dereference(hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -71,18 +70,17 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct hlist_nulls_node *head) { - struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= net->ct.htable_size) + if (++st->bucket >= nf_conntrack_htable_size) return NULL; } head = rcu_dereference( hlist_nulls_first_rcu( - &net->ct.hash[st->bucket])); + &nf_conntrack_hash[st->bucket])); } return head; } @@ -458,7 +456,7 @@ static struct ctl_table nf_ct_sysctl_table[] = { }, { .procname = "nf_conntrack_buckets", - .data = &init_net.ct.htable_size, + .data = &nf_conntrack_htable_size, .maxlen = sizeof(unsigned int), .mode = 0444, .proc_handler = proc_dointvec, @@ -512,7 +510,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) goto out_kmemdup; table[1].data = &net->ct.count; - table[2].data = &net->ct.htable_size; table[3].data = &net->ct.sysctl_checksum; table[4].data = &net->ct.sysctl_log_invalid; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 06a9f45771ab..6877a396f8fc 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -38,6 +38,9 @@ static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] __read_mostly; +static struct hlist_head *nf_nat_bysource __read_mostly; +static unsigned int nf_nat_htable_size __read_mostly; +static unsigned int nf_nat_hash_rnd __read_mostly; inline const struct nf_nat_l3proto * __nf_nat_l3proto_find(u8 family) @@ -118,15 +121,17 @@ EXPORT_SYMBOL(nf_xfrm_me_harder); /* We keep an extra hash for each 
conntrack, for fast searching. */ static inline unsigned int -hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple) +hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) { unsigned int hash; + get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); + /* Original src, to ensure we map it consistently if poss. */ hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ nf_conntrack_hash_rnd); + tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); - return reciprocal_scale(hash, net->ct.nat_htable_size); + return reciprocal_scale(hash, nf_nat_htable_size); } /* Is this tuple already taken? (not by us) */ @@ -196,9 +201,10 @@ find_appropriate_src(struct net *net, const struct nf_conn_nat *nat; const struct nf_conn *ct; - hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) { + hlist_for_each_entry_rcu(nat, &nf_nat_bysource[h], bysource) { ct = nat->ct; if (same_src(ct, tuple) && + net_eq(net, nf_ct_net(ct)) && nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { /* Copy source part from reply tuple. */ nf_ct_invert_tuplepr(result, @@ -431,7 +437,7 @@ nf_nat_setup_info(struct nf_conn *ct, nat = nfct_nat(ct); nat->ct = ct; hlist_add_head_rcu(&nat->bysource, - &net->ct.nat_bysource[srchash]); + &nf_nat_bysource[srchash]); spin_unlock_bh(&nf_nat_lock); } @@ -819,27 +825,14 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, } #endif -static int __net_init nf_nat_net_init(struct net *net) -{ - /* Leave them the same for the moment. */ - net->ct.nat_htable_size = net->ct.htable_size; - net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0); - if (!net->ct.nat_bysource) - return -ENOMEM; - return 0; -} - static void __net_exit nf_nat_net_exit(struct net *net) { struct nf_nat_proto_clean clean = {}; nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0); - synchronize_rcu(); - nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size); } static struct pernet_operations nf_nat_net_ops = { - .init = nf_nat_net_init, .exit = nf_nat_net_exit, }; @@ -852,8 +845,16 @@ static int __init nf_nat_init(void) { int ret; + /* Leave them the same for the moment. 
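The rewritten hash_by_src() above folds a per-boot random seed (nf_nat_hash_rnd, initialised lazily with get_random_once()) and the namespace hash (net_hash_mix()) into the jhash initval, then maps the 32-bit result onto the now-global nf_nat_bysource table with reciprocal_scale(). The following is only a minimal userspace sketch of that bucket-selection pattern; jhash2() is stubbed with a trivial mixer, so it illustrates the seeding and scaling, not the kernel's real hash.

#include <stdint.h>

/* kernel's reciprocal_scale(): map a 32-bit value onto [0, ep_ro)
 * with a multiply-shift instead of an expensive modulo */
static inline uint32_t reciprocal_scale(uint32_t val, uint32_t ep_ro)
{
	return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

/* stand-in for jhash2(): any reasonable 32-bit mixer will do for a sketch */
static uint32_t mix32(const uint32_t *words, unsigned int n, uint32_t initval)
{
	uint32_t h = initval;
	unsigned int i;

	for (i = 0; i < n; i++) {
		h ^= words[i];
		h *= 0x9e3779b1u;	/* golden-ratio multiplier */
		h ^= h >> 16;
	}
	return h;
}

/* pick a bucket for a source tuple; 'seed' plays the role of
 * nf_nat_hash_rnd and 'ns_mix' the role of net_hash_mix(), so identical
 * tuples in different namespaces still land in different buckets */
static uint32_t src_bucket(const uint32_t *tuple_words, unsigned int n,
			   uint32_t seed, uint32_t ns_mix,
			   uint32_t htable_size)
{
	return reciprocal_scale(mix32(tuple_words, n, seed ^ ns_mix),
				htable_size);
}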
*/ + nf_nat_htable_size = nf_conntrack_htable_size; + + nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); + if (!nf_nat_bysource) + return -ENOMEM; + ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); return ret; } @@ -877,6 +878,7 @@ static int __init nf_nat_init(void) return 0; cleanup_extend: + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); nf_ct_extend_unregister(&nat_extend); return ret; } @@ -895,6 +897,7 @@ static void __exit nf_nat_cleanup(void) for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); synchronize_net(); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); } MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7a85a9dd37ad..4d292b933b5c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2317,7 +2317,7 @@ nft_select_set_ops(const struct nlattr * const nla[], static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_TABLE] = { .type = NLA_STRING }, [NFTA_SET_NAME] = { .type = NLA_STRING, - .len = IFNAMSIZ - 1 }, + .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_SET_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 }, [NFTA_SET_KEY_LEN] = { .type = NLA_U32 }, @@ -2401,7 +2401,7 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, unsigned long *inuse; unsigned int n = 0, min = 0; - p = strnchr(name, IFNAMSIZ, '%'); + p = strnchr(name, NFT_SET_MAXNAMELEN, '%'); if (p != NULL) { if (p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; @@ -2696,7 +2696,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; - char name[IFNAMSIZ]; + char name[NFT_SET_MAXNAMELEN]; unsigned int size; bool create; u64 timeout; @@ -3375,6 +3375,22 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem) } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); +static int nft_setelem_parse_flags(const struct nft_set *set, + const struct nlattr *attr, u32 *flags) +{ + if (attr == NULL) + return 0; + + *flags = ntohl(nla_get_be32(attr)); + if (*flags & ~NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + if (!(set->flags & NFT_SET_INTERVAL) && + *flags & NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + + return 0; +} + static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { @@ -3388,8 +3404,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_data data; enum nft_registers dreg; struct nft_trans *trans; + u32 flags = 0; u64 timeout; - u32 flags; u8 ulen; int err; @@ -3403,17 +3419,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_prepare(&tmpl); - flags = 0; - if (nla[NFTA_SET_ELEM_FLAGS] != NULL) { - flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS])); - if (flags & ~NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; - if (!(set->flags & NFT_SET_INTERVAL) && - flags & NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; - if (flags != 0) - nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); - } + err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); + if (err < 0) + return err; + if (flags != 0) + nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); if (set->flags & NFT_SET_MAP) { if (nla[NFTA_SET_ELEM_DATA] == NULL && @@ -3582,9 +3592,13 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, const struct 
nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; + struct nft_set_ext_tmpl tmpl; struct nft_data_desc desc; struct nft_set_elem elem; + struct nft_set_ext *ext; struct nft_trans *trans; + u32 flags = 0; + void *priv; int err; err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, @@ -3596,6 +3610,14 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_KEY] == NULL) goto err1; + nft_set_ext_prepare(&tmpl); + + err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); + if (err < 0) + return err; + if (flags != 0) + nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); + err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, nla[NFTA_SET_ELEM_KEY]); if (err < 0) @@ -3605,24 +3627,40 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) goto err2; + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, desc.len); + + err = -ENOMEM; + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0, + GFP_KERNEL); + if (elem.priv == NULL) + goto err2; + + ext = nft_set_elem_ext(set, elem.priv); + if (flags) + *nft_set_ext_flags(ext) = flags; + trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); if (trans == NULL) { err = -ENOMEM; - goto err2; + goto err3; } - elem.priv = set->ops->deactivate(set, &elem); - if (elem.priv == NULL) { + priv = set->ops->deactivate(set, &elem); + if (priv == NULL) { err = -ENOENT; - goto err3; + goto err4; } + kfree(elem.priv); + elem.priv = priv; nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; -err3: +err4: kfree(trans); +err3: + kfree(elem.priv); err2: nft_data_uninit(&elem.key.val, desc.type); err1: diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 2671b9deb103..3c84f14326f5 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -306,10 +306,10 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout) int i; local_bh_disable(); - for (i = 0; i < net->ct.htable_size; i++) { + for (i = 0; i < nf_conntrack_htable_size; i++) { nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); - if (i < net->ct.htable_size) { - hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + if (i < nf_conntrack_htable_size) { + hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) untimeout(h, timeout); } spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 25998facefd0..137e308d5b24 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -198,6 +198,14 @@ static void nft_ct_set_eval(const struct nft_expr *expr, } break; #endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: + nf_connlabels_replace(ct, + ®s->data[priv->sreg], + ®s->data[priv->sreg], + NF_CT_LABELS_MAX_SIZE / sizeof(u32)); + break; +#endif default: break; } @@ -365,6 +373,16 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, len = FIELD_SIZEOF(struct nf_conn, mark); break; #endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: + if (tb[NFTA_CT_DIRECTION]) + return -EINVAL; + len = NF_CT_LABELS_MAX_SIZE; + err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); + if (err) + return err; + break; +#endif default: return -EOPNOTSUPP; } @@ -384,6 +402,18 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, static void nft_ct_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { + struct nft_ct 
*priv = nft_expr_priv(expr); + + switch (priv->key) { +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: + nf_connlabels_put(ctx->net); + break; +#endif + default: + break; + } + nft_ct_l3proto_module_put(ctx->afi->family); } diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index 1c30f41cff5b..f762094af7c1 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -29,6 +29,17 @@ struct nft_rbtree_elem { struct nft_set_ext ext; }; +static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe) +{ + return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) && + (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END); +} + +static bool nft_rbtree_equal(const struct nft_set *set, const void *this, + const struct nft_rbtree_elem *interval) +{ + return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0; +} static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) @@ -37,6 +48,7 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, const struct nft_rbtree_elem *rbe, *interval = NULL; const struct rb_node *parent; u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); + const void *this; int d; spin_lock_bh(&nft_rbtree_lock); @@ -44,9 +56,16 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); - d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); + this = nft_set_ext_key(&rbe->ext); + d = memcmp(this, key, set->klen); if (d < 0) { parent = parent->rb_left; + /* In case of adjacent ranges, we always see the high + * part of the range in first place, before the low one. + * So don't update interval if the keys are equal. + */ + if (interval && nft_rbtree_equal(set, this, interval)) + continue; interval = rbe; } else if (d > 0) parent = parent->rb_right; @@ -56,9 +75,7 @@ found: parent = parent->rb_left; continue; } - if (nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) && - *nft_set_ext_flags(&rbe->ext) & - NFT_SET_ELEM_INTERVAL_END) + if (nft_rbtree_interval_end(rbe)) goto out; spin_unlock_bh(&nft_rbtree_lock); @@ -98,9 +115,16 @@ static int __nft_rbtree_insert(const struct nft_set *set, else if (d > 0) p = &parent->rb_right; else { - if (nft_set_elem_active(&rbe->ext, genmask)) - return -EEXIST; - p = &parent->rb_left; + if (nft_set_elem_active(&rbe->ext, genmask)) { + if (nft_rbtree_interval_end(rbe) && + !nft_rbtree_interval_end(new)) + p = &parent->rb_left; + else if (!nft_rbtree_interval_end(rbe) && + nft_rbtree_interval_end(new)) + p = &parent->rb_right; + else + return -EEXIST; + } } } rb_link_node(&new->node, parent, p); @@ -145,7 +169,7 @@ static void *nft_rbtree_deactivate(const struct nft_set *set, { const struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; - struct nft_rbtree_elem *rbe; + struct nft_rbtree_elem *rbe, *this = elem->priv; u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); int d; @@ -163,6 +187,15 @@ static void *nft_rbtree_deactivate(const struct nft_set *set, parent = parent->rb_left; continue; } + if (nft_rbtree_interval_end(rbe) && + !nft_rbtree_interval_end(this)) { + parent = parent->rb_left; + continue; + } else if (!nft_rbtree_interval_end(rbe) && + nft_rbtree_interval_end(this)) { + parent = parent->rb_right; + continue; + } nft_set_elem_change_active(set, &rbe->ext); return rbe; } diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index fbb7a2b57b44..61fff422424f 100644 --- 
a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -64,18 +64,26 @@ struct nci_conn_info *nci_get_conn_info_by_conn_id(struct nci_dev *ndev, return NULL; } -int nci_get_conn_info_by_id(struct nci_dev *ndev, u8 id) +int nci_get_conn_info_by_dest_type_params(struct nci_dev *ndev, u8 dest_type, + struct dest_spec_params *params) { struct nci_conn_info *conn_info; list_for_each_entry(conn_info, &ndev->conn_info_list, list) { - if (conn_info->id == id) - return conn_info->conn_id; + if (conn_info->dest_type == dest_type) { + if (!params) + return conn_info->conn_id; + if (conn_info) { + if (params->id == conn_info->dest_params->id && + params->protocol == conn_info->dest_params->protocol) + return conn_info->conn_id; + } + } } return -EINVAL; } -EXPORT_SYMBOL(nci_get_conn_info_by_id); +EXPORT_SYMBOL(nci_get_conn_info_by_dest_type_params); /* ---- NCI requests ---- */ @@ -392,6 +400,83 @@ int nci_core_init(struct nci_dev *ndev) } EXPORT_SYMBOL(nci_core_init); +struct nci_loopback_data { + u8 conn_id; + struct sk_buff *data; +}; + +static void nci_send_data_req(struct nci_dev *ndev, unsigned long opt) +{ + struct nci_loopback_data *data = (struct nci_loopback_data *)opt; + + nci_send_data(ndev, data->conn_id, data->data); +} + +static void nci_nfcc_loopback_cb(void *context, struct sk_buff *skb, int err) +{ + struct nci_dev *ndev = (struct nci_dev *)context; + struct nci_conn_info *conn_info; + + conn_info = nci_get_conn_info_by_conn_id(ndev, ndev->cur_conn_id); + if (!conn_info) { + nci_req_complete(ndev, NCI_STATUS_REJECTED); + return; + } + + conn_info->rx_skb = skb; + + nci_req_complete(ndev, NCI_STATUS_OK); +} + +int nci_nfcc_loopback(struct nci_dev *ndev, void *data, size_t data_len, + struct sk_buff **resp) +{ + int r; + struct nci_loopback_data loopback_data; + struct nci_conn_info *conn_info; + struct sk_buff *skb; + int conn_id = nci_get_conn_info_by_dest_type_params(ndev, + NCI_DESTINATION_NFCC_LOOPBACK, NULL); + + if (conn_id < 0) { + r = nci_core_conn_create(ndev, NCI_DESTINATION_NFCC_LOOPBACK, + 0, 0, NULL); + if (r != NCI_STATUS_OK) + return r; + + conn_id = nci_get_conn_info_by_dest_type_params(ndev, + NCI_DESTINATION_NFCC_LOOPBACK, + NULL); + } + + conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id); + if (!conn_info) + return -EPROTO; + + /* store cb and context to be used on receiving data */ + conn_info->data_exchange_cb = nci_nfcc_loopback_cb; + conn_info->data_exchange_cb_context = ndev; + + skb = nci_skb_alloc(ndev, NCI_DATA_HDR_SIZE + data_len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + skb_reserve(skb, NCI_DATA_HDR_SIZE); + memcpy(skb_put(skb, data_len), data, data_len); + + loopback_data.conn_id = conn_id; + loopback_data.data = skb; + + ndev->cur_conn_id = conn_id; + r = nci_request(ndev, nci_send_data_req, (unsigned long)&loopback_data, + msecs_to_jiffies(NCI_DATA_TIMEOUT)); + if (r == NCI_STATUS_OK && resp) + *resp = conn_info->rx_skb; + + return r; +} +EXPORT_SYMBOL(nci_nfcc_loopback); + static int nci_open_device(struct nci_dev *ndev) { int rc = 0; @@ -610,9 +695,6 @@ int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type, struct nci_core_conn_create_cmd *cmd; struct core_conn_create_data data; - if (!number_destination_params) - return -EINVAL; - data.length = params_len + sizeof(struct nci_core_conn_create_cmd); cmd = kzalloc(data.length, GFP_KERNEL); if (!cmd) @@ -620,17 +702,23 @@ int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type, cmd->destination_type = destination_type; cmd->number_destination_params = 
number_destination_params; - memcpy(cmd->params, params, params_len); data.cmd = cmd; - if (params->length > 0) - ndev->cur_id = params->value[DEST_SPEC_PARAMS_ID_INDEX]; - else - ndev->cur_id = 0; + if (params) { + memcpy(cmd->params, params, params_len); + if (params->length > 0) + memcpy(&ndev->cur_params, + ¶ms->value[DEST_SPEC_PARAMS_ID_INDEX], + sizeof(struct dest_spec_params)); + else + ndev->cur_params.id = 0; + } else { + ndev->cur_params.id = 0; + } + ndev->cur_dest_type = destination_type; - r = __nci_request(ndev, nci_core_conn_create_req, - (unsigned long)&data, + r = __nci_request(ndev, nci_core_conn_create_req, (unsigned long)&data, msecs_to_jiffies(NCI_CMD_TIMEOUT)); kfree(cmd); return r; @@ -646,6 +734,7 @@ static void nci_core_conn_close_req(struct nci_dev *ndev, unsigned long opt) int nci_core_conn_close(struct nci_dev *ndev, u8 conn_id) { + ndev->cur_conn_id = conn_id; return __nci_request(ndev, nci_core_conn_close_req, conn_id, msecs_to_jiffies(NCI_CMD_TIMEOUT)); } diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 2ada2b39e355..1e8c1a12aaec 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -734,7 +734,7 @@ static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, * “HCI Access”, even if the HCI Network contains multiple NFCEEs. */ ndev->hci_dev->nfcee_id = nfcee_ntf->nfcee_id; - ndev->cur_id = nfcee_ntf->nfcee_id; + ndev->cur_params.id = nfcee_ntf->nfcee_id; nci_req_complete(ndev, status); } diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c index 9b6eb913d801..e3bbf1937d0e 100644 --- a/net/nfc/nci/rsp.c +++ b/net/nfc/nci/rsp.c @@ -226,7 +226,7 @@ static void nci_core_conn_create_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) { __u8 status = skb->data[0]; - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info = NULL; struct nci_core_conn_create_rsp *rsp; pr_debug("status 0x%x\n", status); @@ -241,7 +241,17 @@ static void nci_core_conn_create_rsp_packet(struct nci_dev *ndev, goto exit; } - conn_info->id = ndev->cur_id; + conn_info->dest_params = devm_kzalloc(&ndev->nfc_dev->dev, + sizeof(struct dest_spec_params), + GFP_KERNEL); + if (!conn_info->dest_params) { + status = NCI_STATUS_REJECTED; + goto free_conn_info; + } + + conn_info->dest_type = ndev->cur_dest_type; + conn_info->dest_params->id = ndev->cur_params.id; + conn_info->dest_params->protocol = ndev->cur_params.protocol; conn_info->conn_id = rsp->conn_id; /* Note: data_exchange_cb and data_exchange_cb_context need to @@ -251,7 +261,7 @@ static void nci_core_conn_create_rsp_packet(struct nci_dev *ndev, INIT_LIST_HEAD(&conn_info->list); list_add(&conn_info->list, &ndev->conn_info_list); - if (ndev->cur_id == ndev->hci_dev->nfcee_id) + if (ndev->cur_params.id == ndev->hci_dev->nfcee_id) ndev->hci_dev->conn_info = conn_info; conn_info->conn_id = rsp->conn_id; @@ -259,7 +269,11 @@ static void nci_core_conn_create_rsp_packet(struct nci_dev *ndev, atomic_set(&conn_info->credits_cnt, rsp->credits_cnt); } +free_conn_info: + if (status == NCI_STATUS_REJECTED) + devm_kfree(&ndev->nfc_dev->dev, conn_info); exit: + nci_req_complete(ndev, status); } @@ -271,7 +285,8 @@ static void nci_core_conn_close_rsp_packet(struct nci_dev *ndev, pr_debug("status 0x%x\n", status); if (status == NCI_STATUS_OK) { - conn_info = nci_get_conn_info_by_conn_id(ndev, ndev->cur_id); + conn_info = nci_get_conn_info_by_conn_id(ndev, + ndev->cur_conn_id); if (conn_info) { list_del(&conn_info->list); devm_kfree(&ndev->nfc_dev->dev, conn_info); diff --git a/net/openvswitch/conntrack.c 
b/net/openvswitch/conntrack.c index 9741a76c7405..9f0bc49fa969 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -439,20 +439,12 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, u8 protonum; l3proto = __nf_ct_l3proto_find(l3num); - if (!l3proto) { - pr_debug("ovs_ct_find_existing: Can't get l3proto\n"); - return NULL; - } if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, &protonum) <= 0) { pr_debug("ovs_ct_find_existing: Can't get protonum\n"); return NULL; } l4proto = __nf_ct_l4proto_find(l3num, protonum); - if (!l4proto) { - pr_debug("ovs_ct_find_existing: Can't get l4proto\n"); - return NULL; - } if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, protonum, net, &tuple, l3proto, l4proto)) { pr_debug("ovs_ct_find_existing: Can't get tuple\n"); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 22d9a5316304..856bd8dba676 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -738,9 +738,9 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts, len += nla_total_size(acts->orig_len); return len - + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + + nla_total_size_64bit(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ - + nla_total_size(8); /* OVS_FLOW_ATTR_USED */ + + nla_total_size_64bit(8); /* OVS_FLOW_ATTR_USED */ } /* Called with ovs_mutex or RCU read lock. */ @@ -759,7 +759,9 @@ static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow, return -EMSGSIZE; if (stats.n_packets && - nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats)) + nla_put_64bit(skb, OVS_FLOW_ATTR_STATS, + sizeof(struct ovs_flow_stats), &stats, + OVS_FLOW_ATTR_PAD)) return -EMSGSIZE; if ((u8)ntohs(tcp_flags) && @@ -1435,8 +1437,8 @@ static size_t ovs_dp_cmd_msg_size(void) size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header)); msgsize += nla_total_size(IFNAMSIZ); - msgsize += nla_total_size(sizeof(struct ovs_dp_stats)); - msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats)); + msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_stats)); + msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_megaflow_stats)); msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ return msgsize; @@ -1463,13 +1465,13 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, goto nla_put_failure; get_dp_stats(dp, &dp_stats, &dp_megaflow_stats); - if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), - &dp_stats)) + if (nla_put_64bit(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), + &dp_stats, OVS_DP_ATTR_PAD)) goto nla_put_failure; - if (nla_put(skb, OVS_DP_ATTR_MEGAFLOW_STATS, - sizeof(struct ovs_dp_megaflow_stats), - &dp_megaflow_stats)) + if (nla_put_64bit(skb, OVS_DP_ATTR_MEGAFLOW_STATS, + sizeof(struct ovs_dp_megaflow_stats), + &dp_megaflow_stats, OVS_DP_ATTR_PAD)) goto nla_put_failure; if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) @@ -1838,8 +1840,9 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, goto nla_put_failure; ovs_vport_get_stats(vport, &vport_stats); - if (nla_put(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats), - &vport_stats)) + if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS, + sizeof(struct ovs_vport_stats), &vport_stats, + OVS_VPORT_ATTR_PAD)) goto nla_put_failure; if (ovs_vport_get_upcall_portids(vport, skb)) diff --git a/net/qrtr/Kconfig 
b/net/qrtr/Kconfig new file mode 100644 index 000000000000..673fd1f86ebe --- /dev/null +++ b/net/qrtr/Kconfig @@ -0,0 +1,24 @@ +# Qualcomm IPC Router configuration +# + +config QRTR + tristate "Qualcomm IPC Router support" + depends on ARCH_QCOM || COMPILE_TEST + ---help--- + Say Y if you intend to use Qualcomm IPC router protocol. The + protocol is used to communicate with services provided by other + hardware blocks in the system. + + In order to do service lookups, a userspace daemon is required to + maintain a service listing. + +if QRTR + +config QRTR_SMD + tristate "SMD IPC Router channels" + depends on QCOM_SMD || COMPILE_TEST + ---help--- + Say Y here to support SMD based ipcrouter channels. SMD is the + most common transport for IPC Router. + +endif # QRTR diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile new file mode 100644 index 000000000000..6c00dc623b7e --- /dev/null +++ b/net/qrtr/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_QRTR) := qrtr.o +obj-$(CONFIG_QRTR_SMD) += smd.o diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c new file mode 100644 index 000000000000..c985ecbe9bd6 --- /dev/null +++ b/net/qrtr/qrtr.c @@ -0,0 +1,1007 @@ +/* + * Copyright (c) 2015, Sony Mobile Communications Inc. + * Copyright (c) 2013, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/qrtr.h> +#include <linux/termios.h> /* For TIOCINQ/OUTQ */ + +#include <net/sock.h> + +#include "qrtr.h" + +#define QRTR_PROTO_VER 1 + +/* auto-bind range */ +#define QRTR_MIN_EPH_SOCKET 0x4000 +#define QRTR_MAX_EPH_SOCKET 0x7fff + +enum qrtr_pkt_type { + QRTR_TYPE_DATA = 1, + QRTR_TYPE_HELLO = 2, + QRTR_TYPE_BYE = 3, + QRTR_TYPE_NEW_SERVER = 4, + QRTR_TYPE_DEL_SERVER = 5, + QRTR_TYPE_DEL_CLIENT = 6, + QRTR_TYPE_RESUME_TX = 7, + QRTR_TYPE_EXIT = 8, + QRTR_TYPE_PING = 9, +}; + +/** + * struct qrtr_hdr - (I|R)PCrouter packet header + * @version: protocol version + * @type: packet type; one of QRTR_TYPE_* + * @src_node_id: source node + * @src_port_id: source port + * @confirm_rx: boolean; whether a resume-tx packet should be send in reply + * @size: length of packet, excluding this header + * @dst_node_id: destination node + * @dst_port_id: destination port + */ +struct qrtr_hdr { + __le32 version; + __le32 type; + __le32 src_node_id; + __le32 src_port_id; + __le32 confirm_rx; + __le32 size; + __le32 dst_node_id; + __le32 dst_port_id; +} __packed; + +#define QRTR_HDR_SIZE sizeof(struct qrtr_hdr) +#define QRTR_NODE_BCAST ((unsigned int)-1) +#define QRTR_PORT_CTRL ((unsigned int)-2) + +struct qrtr_sock { + /* WARNING: sk must be the first member */ + struct sock sk; + struct sockaddr_qrtr us; + struct sockaddr_qrtr peer; +}; + +static inline struct qrtr_sock *qrtr_sk(struct sock *sk) +{ + BUILD_BUG_ON(offsetof(struct qrtr_sock, sk) != 0); + return container_of(sk, struct qrtr_sock, sk); +} + +static unsigned int qrtr_local_nid = -1; + +/* for node ids */ +static RADIX_TREE(qrtr_nodes, GFP_KERNEL); +/* broadcast list */ +static LIST_HEAD(qrtr_all_nodes); +/* lock for qrtr_nodes, qrtr_all_nodes and node 
reference */ +static DEFINE_MUTEX(qrtr_node_lock); + +/* local port allocation management */ +static DEFINE_IDR(qrtr_ports); +static DEFINE_MUTEX(qrtr_port_lock); + +/** + * struct qrtr_node - endpoint node + * @ep_lock: lock for endpoint management and callbacks + * @ep: endpoint + * @ref: reference count for node + * @nid: node id + * @rx_queue: receive queue + * @work: scheduled work struct for recv work + * @item: list item for broadcast list + */ +struct qrtr_node { + struct mutex ep_lock; + struct qrtr_endpoint *ep; + struct kref ref; + unsigned int nid; + + struct sk_buff_head rx_queue; + struct work_struct work; + struct list_head item; +}; + +/* Release node resources and free the node. + * + * Do not call directly, use qrtr_node_release. To be used with + * kref_put_mutex. As such, the node mutex is expected to be locked on call. + */ +static void __qrtr_node_release(struct kref *kref) +{ + struct qrtr_node *node = container_of(kref, struct qrtr_node, ref); + + if (node->nid != QRTR_EP_NID_AUTO) + radix_tree_delete(&qrtr_nodes, node->nid); + + list_del(&node->item); + mutex_unlock(&qrtr_node_lock); + + skb_queue_purge(&node->rx_queue); + kfree(node); +} + +/* Increment reference to node. */ +static struct qrtr_node *qrtr_node_acquire(struct qrtr_node *node) +{ + if (node) + kref_get(&node->ref); + return node; +} + +/* Decrement reference to node and release as necessary. */ +static void qrtr_node_release(struct qrtr_node *node) +{ + if (!node) + return; + kref_put_mutex(&node->ref, __qrtr_node_release, &qrtr_node_lock); +} + +/* Pass an outgoing packet socket buffer to the endpoint driver. */ +static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb) +{ + int rc = -ENODEV; + + mutex_lock(&node->ep_lock); + if (node->ep) + rc = node->ep->xmit(node->ep, skb); + else + kfree_skb(skb); + mutex_unlock(&node->ep_lock); + + return rc; +} + +/* Lookup node by id. + * + * callers must release with qrtr_node_release() + */ +static struct qrtr_node *qrtr_node_lookup(unsigned int nid) +{ + struct qrtr_node *node; + + mutex_lock(&qrtr_node_lock); + node = radix_tree_lookup(&qrtr_nodes, nid); + node = qrtr_node_acquire(node); + mutex_unlock(&qrtr_node_lock); + + return node; +} + +/* Assign node id to node. + * + * This is mostly useful for automatic node id assignment, based on + * the source id in the incoming packet. 
+ */ +static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid) +{ + if (node->nid != QRTR_EP_NID_AUTO || nid == QRTR_EP_NID_AUTO) + return; + + mutex_lock(&qrtr_node_lock); + radix_tree_insert(&qrtr_nodes, nid, node); + node->nid = nid; + mutex_unlock(&qrtr_node_lock); +} + +/** + * qrtr_endpoint_post() - post incoming data + * @ep: endpoint handle + * @data: data pointer + * @len: size of data in bytes + * + * Return: 0 on success; negative error code on failure + */ +int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) +{ + struct qrtr_node *node = ep->node; + const struct qrtr_hdr *phdr = data; + struct sk_buff *skb; + unsigned int psize; + unsigned int size; + unsigned int type; + unsigned int ver; + unsigned int dst; + + if (len < QRTR_HDR_SIZE || len & 3) + return -EINVAL; + + ver = le32_to_cpu(phdr->version); + size = le32_to_cpu(phdr->size); + type = le32_to_cpu(phdr->type); + dst = le32_to_cpu(phdr->dst_port_id); + + psize = (size + 3) & ~3; + + if (ver != QRTR_PROTO_VER) + return -EINVAL; + + if (len != psize + QRTR_HDR_SIZE) + return -EINVAL; + + if (dst != QRTR_PORT_CTRL && type != QRTR_TYPE_DATA) + return -EINVAL; + + skb = netdev_alloc_skb(NULL, len); + if (!skb) + return -ENOMEM; + + skb_reset_transport_header(skb); + memcpy(skb_put(skb, len), data, len); + + skb_queue_tail(&node->rx_queue, skb); + schedule_work(&node->work); + + return 0; +} +EXPORT_SYMBOL_GPL(qrtr_endpoint_post); + +/* Allocate and construct a resume-tx packet. */ +static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node, + u32 dst_node, u32 port) +{ + const int pkt_len = 20; + struct qrtr_hdr *hdr; + struct sk_buff *skb; + u32 *buf; + + skb = alloc_skb(QRTR_HDR_SIZE + pkt_len, GFP_KERNEL); + if (!skb) + return NULL; + skb_reset_transport_header(skb); + + hdr = (struct qrtr_hdr *)skb_put(skb, QRTR_HDR_SIZE); + hdr->version = cpu_to_le32(QRTR_PROTO_VER); + hdr->type = cpu_to_le32(QRTR_TYPE_RESUME_TX); + hdr->src_node_id = cpu_to_le32(src_node); + hdr->src_port_id = cpu_to_le32(QRTR_PORT_CTRL); + hdr->confirm_rx = cpu_to_le32(0); + hdr->size = cpu_to_le32(pkt_len); + hdr->dst_node_id = cpu_to_le32(dst_node); + hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL); + + buf = (u32 *)skb_put(skb, pkt_len); + memset(buf, 0, pkt_len); + buf[0] = cpu_to_le32(QRTR_TYPE_RESUME_TX); + buf[1] = cpu_to_le32(src_node); + buf[2] = cpu_to_le32(port); + + return skb; +} + +static struct qrtr_sock *qrtr_port_lookup(int port); +static void qrtr_port_put(struct qrtr_sock *ipc); + +/* Handle and route a received packet. + * + * This will auto-reply with resume-tx packet as necessary. 
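qrtr_endpoint_post() above accepts a packet only when it carries the full 32-byte little-endian header, protocol version 1, a payload padded to a 4-byte boundary, and either a control destination port or a DATA packet type. Below is a small userspace sketch of the same checks; the header struct and constants are re-declared locally (they mirror, but are not, the kernel's struct qrtr_hdr), and a little-endian host is assumed in place of le32_to_cpu().

#include <stdint.h>
#include <stddef.h>

#define QRTR_EX_PROTO_VER	1
#define QRTR_EX_TYPE_DATA	1
#define QRTR_EX_PORT_CTRL	((uint32_t)-2)

/* local mirror of the wire header: eight little-endian 32-bit words */
struct qrtr_hdr_ex {
	uint32_t version, type, src_node_id, src_port_id;
	uint32_t confirm_rx, size, dst_node_id, dst_port_id;
};

/* return 0 if data/len form a plausible qrtr packet, -1 otherwise */
static int qrtr_frame_ok(const void *data, size_t len)
{
	const struct qrtr_hdr_ex *hdr = data;
	size_t psize;

	if (len < sizeof(*hdr) || (len & 3))
		return -1;
	if (hdr->version != QRTR_EX_PROTO_VER)
		return -1;
	psize = ((size_t)hdr->size + 3) & ~(size_t)3; /* payload padded to 4 */
	if (len != sizeof(*hdr) + psize)
		return -1;
	if (hdr->dst_port_id != QRTR_EX_PORT_CTRL &&
	    hdr->type != QRTR_EX_TYPE_DATA)
		return -1;
	return 0;
}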
+ */ +static void qrtr_node_rx_work(struct work_struct *work) +{ + struct qrtr_node *node = container_of(work, struct qrtr_node, work); + struct sk_buff *skb; + + while ((skb = skb_dequeue(&node->rx_queue)) != NULL) { + const struct qrtr_hdr *phdr; + u32 dst_node, dst_port; + struct qrtr_sock *ipc; + u32 src_node; + int confirm; + + phdr = (const struct qrtr_hdr *)skb_transport_header(skb); + src_node = le32_to_cpu(phdr->src_node_id); + dst_node = le32_to_cpu(phdr->dst_node_id); + dst_port = le32_to_cpu(phdr->dst_port_id); + confirm = !!phdr->confirm_rx; + + qrtr_node_assign(node, src_node); + + ipc = qrtr_port_lookup(dst_port); + if (!ipc) { + kfree_skb(skb); + } else { + if (sock_queue_rcv_skb(&ipc->sk, skb)) + kfree_skb(skb); + + qrtr_port_put(ipc); + } + + if (confirm) { + skb = qrtr_alloc_resume_tx(dst_node, node->nid, dst_port); + if (!skb) + break; + if (qrtr_node_enqueue(node, skb)) + break; + } + } +} + +/** + * qrtr_endpoint_register() - register a new endpoint + * @ep: endpoint to register + * @nid: desired node id; may be QRTR_EP_NID_AUTO for auto-assignment + * Return: 0 on success; negative error code on failure + * + * The specified endpoint must have the xmit function pointer set on call. + */ +int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid) +{ + struct qrtr_node *node; + + if (!ep || !ep->xmit) + return -EINVAL; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + INIT_WORK(&node->work, qrtr_node_rx_work); + kref_init(&node->ref); + mutex_init(&node->ep_lock); + skb_queue_head_init(&node->rx_queue); + node->nid = QRTR_EP_NID_AUTO; + node->ep = ep; + + qrtr_node_assign(node, nid); + + mutex_lock(&qrtr_node_lock); + list_add(&node->item, &qrtr_all_nodes); + mutex_unlock(&qrtr_node_lock); + ep->node = node; + + return 0; +} +EXPORT_SYMBOL_GPL(qrtr_endpoint_register); + +/** + * qrtr_endpoint_unregister - unregister endpoint + * @ep: endpoint to unregister + */ +void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) +{ + struct qrtr_node *node = ep->node; + + mutex_lock(&node->ep_lock); + node->ep = NULL; + mutex_unlock(&node->ep_lock); + + qrtr_node_release(node); + ep->node = NULL; +} +EXPORT_SYMBOL_GPL(qrtr_endpoint_unregister); + +/* Lookup socket by port. + * + * Callers must release with qrtr_port_put() + */ +static struct qrtr_sock *qrtr_port_lookup(int port) +{ + struct qrtr_sock *ipc; + + if (port == QRTR_PORT_CTRL) + port = 0; + + mutex_lock(&qrtr_port_lock); + ipc = idr_find(&qrtr_ports, port); + if (ipc) + sock_hold(&ipc->sk); + mutex_unlock(&qrtr_port_lock); + + return ipc; +} + +/* Release acquired socket. */ +static void qrtr_port_put(struct qrtr_sock *ipc) +{ + sock_put(&ipc->sk); +} + +/* Remove port assignment. */ +static void qrtr_port_remove(struct qrtr_sock *ipc) +{ + int port = ipc->us.sq_port; + + if (port == QRTR_PORT_CTRL) + port = 0; + + __sock_put(&ipc->sk); + + mutex_lock(&qrtr_port_lock); + idr_remove(&qrtr_ports, port); + mutex_unlock(&qrtr_port_lock); +} + +/* Assign port number to socket. + * + * Specify port in the integer pointed to by port, and it will be adjusted + * on return as necesssary. 
+ * + * Port may be: + * 0: Assign ephemeral port in [QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET] + * <QRTR_MIN_EPH_SOCKET: Specified; requires CAP_NET_ADMIN + * >QRTR_MIN_EPH_SOCKET: Specified; available to all + */ +static int qrtr_port_assign(struct qrtr_sock *ipc, int *port) +{ + int rc; + + mutex_lock(&qrtr_port_lock); + if (!*port) { + rc = idr_alloc(&qrtr_ports, ipc, + QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET + 1, + GFP_ATOMIC); + if (rc >= 0) + *port = rc; + } else if (*port < QRTR_MIN_EPH_SOCKET && !capable(CAP_NET_ADMIN)) { + rc = -EACCES; + } else if (*port == QRTR_PORT_CTRL) { + rc = idr_alloc(&qrtr_ports, ipc, 0, 1, GFP_ATOMIC); + } else { + rc = idr_alloc(&qrtr_ports, ipc, *port, *port + 1, GFP_ATOMIC); + if (rc >= 0) + *port = rc; + } + mutex_unlock(&qrtr_port_lock); + + if (rc == -ENOSPC) + return -EADDRINUSE; + else if (rc < 0) + return rc; + + sock_hold(&ipc->sk); + + return 0; +} + +/* Bind socket to address. + * + * Socket should be locked upon call. + */ +static int __qrtr_bind(struct socket *sock, + const struct sockaddr_qrtr *addr, int zapped) +{ + struct qrtr_sock *ipc = qrtr_sk(sock->sk); + struct sock *sk = sock->sk; + int port; + int rc; + + /* rebinding ok */ + if (!zapped && addr->sq_port == ipc->us.sq_port) + return 0; + + port = addr->sq_port; + rc = qrtr_port_assign(ipc, &port); + if (rc) + return rc; + + /* unbind previous, if any */ + if (!zapped) + qrtr_port_remove(ipc); + ipc->us.sq_port = port; + + sock_reset_flag(sk, SOCK_ZAPPED); + + return 0; +} + +/* Auto bind to an ephemeral port. */ +static int qrtr_autobind(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct sockaddr_qrtr addr; + + if (!sock_flag(sk, SOCK_ZAPPED)) + return 0; + + addr.sq_family = AF_QIPCRTR; + addr.sq_node = qrtr_local_nid; + addr.sq_port = 0; + + return __qrtr_bind(sock, &addr, 1); +} + +/* Bind socket to specified sockaddr. */ +static int qrtr_bind(struct socket *sock, struct sockaddr *saddr, int len) +{ + DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr); + struct qrtr_sock *ipc = qrtr_sk(sock->sk); + struct sock *sk = sock->sk; + int rc; + + if (len < sizeof(*addr) || addr->sq_family != AF_QIPCRTR) + return -EINVAL; + + if (addr->sq_node != ipc->us.sq_node) + return -EINVAL; + + lock_sock(sk); + rc = __qrtr_bind(sock, addr, sock_flag(sk, SOCK_ZAPPED)); + release_sock(sk); + + return rc; +} + +/* Queue packet to local peer socket. */ +static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb) +{ + const struct qrtr_hdr *phdr; + struct qrtr_sock *ipc; + + phdr = (const struct qrtr_hdr *)skb_transport_header(skb); + + ipc = qrtr_port_lookup(le32_to_cpu(phdr->dst_port_id)); + if (!ipc || &ipc->sk == skb->sk) { /* do not send to self */ + kfree_skb(skb); + return -ENODEV; + } + + if (sock_queue_rcv_skb(&ipc->sk, skb)) { + qrtr_port_put(ipc); + kfree_skb(skb); + return -ENOSPC; + } + + qrtr_port_put(ipc); + + return 0; +} + +/* Queue packet for broadcast. 
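The port rules above map directly onto the userspace API: binding an AF_QIPCRTR datagram socket with sq_port set to 0 requests an ephemeral port, while ports below QRTR_MIN_EPH_SOCKET require CAP_NET_ADMIN, and bind() rejects any sq_node other than the local one. A hedged sketch, assuming the uapi header <linux/qrtr.h> (which provides struct sockaddr_qrtr) is installed:

#include <stdio.h>
#include <sys/socket.h>
#include <linux/qrtr.h>		/* struct sockaddr_qrtr */

#ifndef AF_QIPCRTR
#define AF_QIPCRTR 42		/* kernel value for this address family */
#endif

int main(void)
{
	struct sockaddr_qrtr addr;
	socklen_t alen = sizeof(addr);
	int fd;

	fd = socket(AF_QIPCRTR, SOCK_DGRAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* learn our local node id first; bind() rejects any other sq_node */
	if (getsockname(fd, (struct sockaddr *)&addr, &alen) < 0) {
		perror("getsockname");
		return 1;
	}

	addr.sq_port = 0;	/* 0: let the kernel assign an ephemeral port */
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("bind");
		return 1;
	}

	alen = sizeof(addr);
	if (getsockname(fd, (struct sockaddr *)&addr, &alen) == 0)
		printf("bound to node %u, port %u\n",
		       addr.sq_node, addr.sq_port);

	return 0;
}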
*/ +static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb) +{ + struct sk_buff *skbn; + + mutex_lock(&qrtr_node_lock); + list_for_each_entry(node, &qrtr_all_nodes, item) { + skbn = skb_clone(skb, GFP_KERNEL); + if (!skbn) + break; + skb_set_owner_w(skbn, skb->sk); + qrtr_node_enqueue(node, skbn); + } + mutex_unlock(&qrtr_node_lock); + + qrtr_local_enqueue(node, skb); + + return 0; +} + +static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) +{ + DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); + int (*enqueue_fn)(struct qrtr_node *, struct sk_buff *); + struct qrtr_sock *ipc = qrtr_sk(sock->sk); + struct sock *sk = sock->sk; + struct qrtr_node *node; + struct qrtr_hdr *hdr; + struct sk_buff *skb; + size_t plen; + int rc; + + if (msg->msg_flags & ~(MSG_DONTWAIT)) + return -EINVAL; + + if (len > 65535) + return -EMSGSIZE; + + lock_sock(sk); + + if (addr) { + if (msg->msg_namelen < sizeof(*addr)) { + release_sock(sk); + return -EINVAL; + } + + if (addr->sq_family != AF_QIPCRTR) { + release_sock(sk); + return -EINVAL; + } + + rc = qrtr_autobind(sock); + if (rc) { + release_sock(sk); + return rc; + } + } else if (sk->sk_state == TCP_ESTABLISHED) { + addr = &ipc->peer; + } else { + release_sock(sk); + return -ENOTCONN; + } + + node = NULL; + if (addr->sq_node == QRTR_NODE_BCAST) { + enqueue_fn = qrtr_bcast_enqueue; + } else if (addr->sq_node == ipc->us.sq_node) { + enqueue_fn = qrtr_local_enqueue; + } else { + enqueue_fn = qrtr_node_enqueue; + node = qrtr_node_lookup(addr->sq_node); + if (!node) { + release_sock(sk); + return -ECONNRESET; + } + } + + plen = (len + 3) & ~3; + skb = sock_alloc_send_skb(sk, plen + QRTR_HDR_SIZE, + msg->msg_flags & MSG_DONTWAIT, &rc); + if (!skb) + goto out_node; + + skb_reset_transport_header(skb); + skb_put(skb, len + QRTR_HDR_SIZE); + + hdr = (struct qrtr_hdr *)skb_transport_header(skb); + hdr->version = cpu_to_le32(QRTR_PROTO_VER); + hdr->src_node_id = cpu_to_le32(ipc->us.sq_node); + hdr->src_port_id = cpu_to_le32(ipc->us.sq_port); + hdr->confirm_rx = cpu_to_le32(0); + hdr->size = cpu_to_le32(len); + hdr->dst_node_id = cpu_to_le32(addr->sq_node); + hdr->dst_port_id = cpu_to_le32(addr->sq_port); + + rc = skb_copy_datagram_from_iter(skb, QRTR_HDR_SIZE, + &msg->msg_iter, len); + if (rc) { + kfree_skb(skb); + goto out_node; + } + + if (plen != len) { + skb_pad(skb, plen - len); + skb_put(skb, plen - len); + } + + if (ipc->us.sq_port == QRTR_PORT_CTRL) { + if (len < 4) { + rc = -EINVAL; + kfree_skb(skb); + goto out_node; + } + + /* control messages already require the type as 'command' */ + skb_copy_bits(skb, QRTR_HDR_SIZE, &hdr->type, 4); + } else { + hdr->type = cpu_to_le32(QRTR_TYPE_DATA); + } + + rc = enqueue_fn(node, skb); + if (rc >= 0) + rc = len; + +out_node: + qrtr_node_release(node); + release_sock(sk); + + return rc; +} + +static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); + const struct qrtr_hdr *phdr; + struct sock *sk = sock->sk; + struct sk_buff *skb; + int copied, rc; + + lock_sock(sk); + + if (sock_flag(sk, SOCK_ZAPPED)) { + release_sock(sk); + return -EADDRNOTAVAIL; + } + + skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, + flags & MSG_DONTWAIT, &rc); + if (!skb) { + release_sock(sk); + return rc; + } + + phdr = (const struct qrtr_hdr *)skb_transport_header(skb); + copied = le32_to_cpu(phdr->size); + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + + rc = 
skb_copy_datagram_msg(skb, QRTR_HDR_SIZE, msg, copied); + if (rc < 0) + goto out; + rc = copied; + + if (addr) { + addr->sq_family = AF_QIPCRTR; + addr->sq_node = le32_to_cpu(phdr->src_node_id); + addr->sq_port = le32_to_cpu(phdr->src_port_id); + msg->msg_namelen = sizeof(*addr); + } + +out: + skb_free_datagram(sk, skb); + release_sock(sk); + + return rc; +} + +static int qrtr_connect(struct socket *sock, struct sockaddr *saddr, + int len, int flags) +{ + DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr); + struct qrtr_sock *ipc = qrtr_sk(sock->sk); + struct sock *sk = sock->sk; + int rc; + + if (len < sizeof(*addr) || addr->sq_family != AF_QIPCRTR) + return -EINVAL; + + lock_sock(sk); + + sk->sk_state = TCP_CLOSE; + sock->state = SS_UNCONNECTED; + + rc = qrtr_autobind(sock); + if (rc) { + release_sock(sk); + return rc; + } + + ipc->peer = *addr; + sock->state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; + + release_sock(sk); + + return 0; +} + +static int qrtr_getname(struct socket *sock, struct sockaddr *saddr, + int *len, int peer) +{ + struct qrtr_sock *ipc = qrtr_sk(sock->sk); + struct sockaddr_qrtr qaddr; + struct sock *sk = sock->sk; + + lock_sock(sk); + if (peer) { + if (sk->sk_state != TCP_ESTABLISHED) { + release_sock(sk); + return -ENOTCONN; + } + + qaddr = ipc->peer; + } else { + qaddr = ipc->us; + } + release_sock(sk); + + *len = sizeof(qaddr); + qaddr.sq_family = AF_QIPCRTR; + + memcpy(saddr, &qaddr, sizeof(qaddr)); + + return 0; +} + +static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct qrtr_sock *ipc = qrtr_sk(sock->sk); + struct sock *sk = sock->sk; + struct sockaddr_qrtr *sq; + struct sk_buff *skb; + struct ifreq ifr; + long len = 0; + int rc = 0; + + lock_sock(sk); + + switch (cmd) { + case TIOCOUTQ: + len = sk->sk_sndbuf - sk_wmem_alloc_get(sk); + if (len < 0) + len = 0; + rc = put_user(len, (int __user *)argp); + break; + case TIOCINQ: + skb = skb_peek(&sk->sk_receive_queue); + if (skb) + len = skb->len - QRTR_HDR_SIZE; + rc = put_user(len, (int __user *)argp); + break; + case SIOCGIFADDR: + if (copy_from_user(&ifr, argp, sizeof(ifr))) { + rc = -EFAULT; + break; + } + + sq = (struct sockaddr_qrtr *)&ifr.ifr_addr; + *sq = ipc->us; + if (copy_to_user(argp, &ifr, sizeof(ifr))) { + rc = -EFAULT; + break; + } + break; + case SIOCGSTAMP: + rc = sock_get_timestamp(sk, argp); + break; + case SIOCADDRT: + case SIOCDELRT: + case SIOCSIFADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + rc = -EINVAL; + break; + default: + rc = -ENOIOCTLCMD; + break; + } + + release_sock(sk); + + return rc; +} + +static int qrtr_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct qrtr_sock *ipc; + + if (!sk) + return 0; + + lock_sock(sk); + + ipc = qrtr_sk(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + + sock_set_flag(sk, SOCK_DEAD); + sock->sk = NULL; + + if (!sock_flag(sk, SOCK_ZAPPED)) + qrtr_port_remove(ipc); + + skb_queue_purge(&sk->sk_receive_queue); + + release_sock(sk); + sock_put(sk); + + return 0; +} + +static const struct proto_ops qrtr_proto_ops = { + .owner = THIS_MODULE, + .family = AF_QIPCRTR, + .bind = qrtr_bind, + .connect = qrtr_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .listen = sock_no_listen, + .sendmsg = qrtr_sendmsg, + .recvmsg = qrtr_recvmsg, + .getname = qrtr_getname, + .ioctl = 
qrtr_ioctl, + .poll = datagram_poll, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .release = qrtr_release, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto qrtr_proto = { + .name = "QIPCRTR", + .owner = THIS_MODULE, + .obj_size = sizeof(struct qrtr_sock), +}; + +static int qrtr_create(struct net *net, struct socket *sock, + int protocol, int kern) +{ + struct qrtr_sock *ipc; + struct sock *sk; + + if (sock->type != SOCK_DGRAM) + return -EPROTOTYPE; + + sk = sk_alloc(net, AF_QIPCRTR, GFP_KERNEL, &qrtr_proto, kern); + if (!sk) + return -ENOMEM; + + sock_set_flag(sk, SOCK_ZAPPED); + + sock_init_data(sock, sk); + sock->ops = &qrtr_proto_ops; + + ipc = qrtr_sk(sk); + ipc->us.sq_family = AF_QIPCRTR; + ipc->us.sq_node = qrtr_local_nid; + ipc->us.sq_port = 0; + + return 0; +} + +static const struct nla_policy qrtr_policy[IFA_MAX + 1] = { + [IFA_LOCAL] = { .type = NLA_U32 }, +}; + +static int qrtr_addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct nlattr *tb[IFA_MAX + 1]; + struct ifaddrmsg *ifm; + int rc; + + if (!netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + if (!netlink_capable(skb, CAP_SYS_ADMIN)) + return -EPERM; + + ASSERT_RTNL(); + + rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, qrtr_policy); + if (rc < 0) + return rc; + + ifm = nlmsg_data(nlh); + if (!tb[IFA_LOCAL]) + return -EINVAL; + + qrtr_local_nid = nla_get_u32(tb[IFA_LOCAL]); + return 0; +} + +static const struct net_proto_family qrtr_family = { + .owner = THIS_MODULE, + .family = AF_QIPCRTR, + .create = qrtr_create, +}; + +static int __init qrtr_proto_init(void) +{ + int rc; + + rc = proto_register(&qrtr_proto, 1); + if (rc) + return rc; + + rc = sock_register(&qrtr_family); + if (rc) { + proto_unregister(&qrtr_proto); + return rc; + } + + rtnl_register(PF_QIPCRTR, RTM_NEWADDR, qrtr_addr_doit, NULL, NULL); + + return 0; +} +module_init(qrtr_proto_init); + +static void __exit qrtr_proto_fini(void) +{ + rtnl_unregister(PF_QIPCRTR, RTM_NEWADDR); + sock_unregister(qrtr_family.family); + proto_unregister(&qrtr_proto); +} +module_exit(qrtr_proto_fini); + +MODULE_DESCRIPTION("Qualcomm IPC-router driver"); +MODULE_LICENSE("GPL v2"); diff --git a/net/qrtr/qrtr.h b/net/qrtr/qrtr.h new file mode 100644 index 000000000000..2b848718f8fe --- /dev/null +++ b/net/qrtr/qrtr.h @@ -0,0 +1,31 @@ +#ifndef __QRTR_H_ +#define __QRTR_H_ + +#include <linux/types.h> + +struct sk_buff; + +/* endpoint node id auto assignment */ +#define QRTR_EP_NID_AUTO (-1) + +/** + * struct qrtr_endpoint - endpoint handle + * @xmit: Callback for outgoing packets + * + * The socket buffer passed to the xmit function becomes owned by the endpoint + * driver. As such, when the driver is done with the buffer, it should + * call kfree_skb() on failure, or consume_skb() on success. + */ +struct qrtr_endpoint { + int (*xmit)(struct qrtr_endpoint *ep, struct sk_buff *skb); + /* private: not for endpoint use */ + struct qrtr_node *node; +}; + +int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid); + +void qrtr_endpoint_unregister(struct qrtr_endpoint *ep); + +int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len); + +#endif diff --git a/net/qrtr/smd.c b/net/qrtr/smd.c new file mode 100644 index 000000000000..84ebce73aa23 --- /dev/null +++ b/net/qrtr/smd.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2015, Sony Mobile Communications Inc. + * Copyright (c) 2013, The Linux Foundation. All rights reserved. 
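To exercise the qrtr_sendmsg()/qrtr_recvmsg() paths above from userspace, a datagram can be sent to a specific node/port with an explicit sockaddr_qrtr destination; the first send auto-binds the socket to an ephemeral local port. A sketch, where the destination node and port are made-up placeholders rather than real service addresses:

#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <linux/qrtr.h>

#ifndef AF_QIPCRTR
#define AF_QIPCRTR 42
#endif

int main(void)
{
	struct sockaddr_qrtr dst = {
		.sq_family = AF_QIPCRTR,
		.sq_node = 1,		/* placeholder: remote node id */
		.sq_port = 0x4001,	/* placeholder: remote service port */
	};
	socklen_t dlen = sizeof(dst);
	char buf[256];
	ssize_t n;
	int fd;

	fd = socket(AF_QIPCRTR, SOCK_DGRAM, 0);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* sending with an explicit destination auto-binds the socket;
	 * payloads larger than 65535 bytes are rejected with EMSGSIZE */
	n = sendto(fd, "ping", 4, 0, (struct sockaddr *)&dst, sizeof(dst));
	if (n < 0) {
		perror("sendto");
		return 1;
	}

	n = recvfrom(fd, buf, sizeof(buf), 0, (struct sockaddr *)&dst, &dlen);
	if (n >= 0)
		printf("got %zd bytes from node %u port %u\n",
		       n, dst.sq_node, dst.sq_port);

	return 0;
}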
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/soc/qcom/smd.h> + +#include "qrtr.h" + +struct qrtr_smd_dev { + struct qrtr_endpoint ep; + struct qcom_smd_channel *channel; +}; + +/* from smd to qrtr */ +static int qcom_smd_qrtr_callback(struct qcom_smd_device *sdev, + const void *data, size_t len) +{ + struct qrtr_smd_dev *qdev = dev_get_drvdata(&sdev->dev); + int rc; + + if (!qdev) + return -EAGAIN; + + rc = qrtr_endpoint_post(&qdev->ep, data, len); + if (rc == -EINVAL) { + dev_err(&sdev->dev, "invalid ipcrouter packet\n"); + /* return 0 to let smd drop the packet */ + rc = 0; + } + + return rc; +} + +/* from qrtr to smd */ +static int qcom_smd_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb) +{ + struct qrtr_smd_dev *qdev = container_of(ep, struct qrtr_smd_dev, ep); + int rc; + + rc = skb_linearize(skb); + if (rc) + goto out; + + rc = qcom_smd_send(qdev->channel, skb->data, skb->len); + +out: + if (rc) + kfree_skb(skb); + else + consume_skb(skb); + return rc; +} + +static int qcom_smd_qrtr_probe(struct qcom_smd_device *sdev) +{ + struct qrtr_smd_dev *qdev; + int rc; + + qdev = devm_kzalloc(&sdev->dev, sizeof(*qdev), GFP_KERNEL); + if (!qdev) + return -ENOMEM; + + qdev->channel = sdev->channel; + qdev->ep.xmit = qcom_smd_qrtr_send; + + rc = qrtr_endpoint_register(&qdev->ep, QRTR_EP_NID_AUTO); + if (rc) + return rc; + + dev_set_drvdata(&sdev->dev, qdev); + + dev_dbg(&sdev->dev, "Qualcomm SMD QRTR driver probed\n"); + + return 0; +} + +static void qcom_smd_qrtr_remove(struct qcom_smd_device *sdev) +{ + struct qrtr_smd_dev *qdev = dev_get_drvdata(&sdev->dev); + + qrtr_endpoint_unregister(&qdev->ep); + + dev_set_drvdata(&sdev->dev, NULL); +} + +static const struct qcom_smd_id qcom_smd_qrtr_smd_match[] = { + { "IPCRTR" }, + {} +}; + +static struct qcom_smd_driver qcom_smd_qrtr_driver = { + .probe = qcom_smd_qrtr_probe, + .remove = qcom_smd_qrtr_remove, + .callback = qcom_smd_qrtr_callback, + .smd_match_table = qcom_smd_qrtr_smd_match, + .driver = { + .name = "qcom_smd_qrtr", + .owner = THIS_MODULE, + }, +}; + +module_qcom_smd_driver(qcom_smd_qrtr_driver); + +MODULE_DESCRIPTION("Qualcomm IPC-Router SMD interface driver"); +MODULE_LICENSE("GPL v2"); diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 61ed2a8764ba..86187dad1440 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -127,7 +127,7 @@ void rds_tcp_restore_callbacks(struct socket *sock, /* * This is the only path that sets tc->t_sock. Send and receive trust that - * it is set. The RDS_CONN_CONNECTED bit protects those paths from being + * it is set. The RDS_CONN_UP bit protects those paths from being * called while it isn't set. 
*/ void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn) @@ -216,6 +216,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) if (!tc) return -ENOMEM; + mutex_init(&tc->t_conn_lock); tc->t_sock = NULL; tc->t_tinc = NULL; tc->t_tinc_hdr_rem = sizeof(struct rds_header); diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 64f873c0c6b6..41c228300525 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -12,6 +12,10 @@ struct rds_tcp_connection { struct list_head t_tcp_node; struct rds_connection *conn; + /* t_conn_lock synchronizes the connection establishment between + * rds_tcp_accept_one and rds_tcp_conn_connect + */ + struct mutex t_conn_lock; struct socket *t_sock; void *t_orig_write_space; void *t_orig_data_ready; diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 5cb16875c460..49a3fcfed360 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -78,7 +78,14 @@ int rds_tcp_conn_connect(struct rds_connection *conn) struct socket *sock = NULL; struct sockaddr_in src, dest; int ret; + struct rds_tcp_connection *tc = conn->c_transport_data; + + mutex_lock(&tc->t_conn_lock); + if (rds_conn_up(conn)) { + mutex_unlock(&tc->t_conn_lock); + return 0; + } ret = sock_create_kern(rds_conn_net(conn), PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); if (ret < 0) @@ -120,6 +127,7 @@ int rds_tcp_conn_connect(struct rds_connection *conn) } out: + mutex_unlock(&tc->t_conn_lock); if (sock) sock_release(sock); return ret; diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 0936a4a32b47..be263cdf268b 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -76,7 +76,9 @@ int rds_tcp_accept_one(struct socket *sock) struct rds_connection *conn; int ret; struct inet_sock *inet; - struct rds_tcp_connection *rs_tcp; + struct rds_tcp_connection *rs_tcp = NULL; + int conn_state; + struct sock *nsk; ret = sock_create_kern(sock_net(sock->sk), sock->sk->sk_family, sock->sk->sk_type, sock->sk->sk_protocol, @@ -115,28 +117,44 @@ int rds_tcp_accept_one(struct socket *sock) * rds_tcp_state_change() will do that cleanup */ rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data; - if (rs_tcp->t_sock && - ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) { - struct sock *nsk = new_sock->sk; - - nsk->sk_user_data = NULL; - nsk->sk_prot->disconnect(nsk, 0); - tcp_done(nsk); - new_sock = NULL; - ret = 0; - goto out; - } else if (rs_tcp->t_sock) { - rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp); - conn->c_outgoing = 0; - } - rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING); + mutex_lock(&rs_tcp->t_conn_lock); + conn_state = rds_conn_state(conn); + if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP) + goto rst_nsk; + if (rs_tcp->t_sock) { + /* Need to resolve a duelling SYN between peers. + * We have an outstanding SYN to this peer, which may + * potentially have transitioned to the RDS_CONN_UP state, + * so we must quiesce any send threads before resetting + * c_transport_data. 
+ */ + wait_event(conn->c_waitq, + !test_bit(RDS_IN_XMIT, &conn->c_flags)); + if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) { + goto rst_nsk; + } else if (rs_tcp->t_sock) { + rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp); + conn->c_outgoing = 0; + } + } rds_tcp_set_callbacks(new_sock, conn); - rds_connect_complete(conn); + rds_connect_complete(conn); /* marks RDS_CONN_UP */ + new_sock = NULL; + ret = 0; + goto out; +rst_nsk: + /* reset the newly returned accept sock and bail */ + nsk = new_sock->sk; + rds_tcp_stats_inc(s_tcp_listen_closed_stale); + nsk->sk_user_data = NULL; + nsk->sk_prot->disconnect(nsk, 0); + tcp_done(nsk); new_sock = NULL; ret = 0; - out: + if (rs_tcp) + mutex_unlock(&rs_tcp->t_conn_lock); if (new_sock) sock_release(new_sock); return ret; diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 27a992154804..d75d8b56a9e3 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -207,22 +207,14 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, } if (left && tc->t_tinc_data_rem) { - clone = skb_clone(skb, arg->gfp); + to_copy = min(tc->t_tinc_data_rem, left); + + clone = pskb_extract(skb, offset, to_copy, arg->gfp); if (!clone) { desc->error = -ENOMEM; goto out; } - to_copy = min(tc->t_tinc_data_rem, left); - if (!pskb_pull(clone, offset) || - pskb_trim(clone, to_copy)) { - pr_warn("rds_tcp_data_recv: pull/trim failed " - "left %zu data_rem %zu skb_len %d\n", - left, tc->t_tinc_data_rem, skb->len); - kfree_skb(clone); - desc->error = -ENOMEM; - goto out; - } skb_queue_tail(&tinc->ti_skb_list, clone); rdsdebug("skb %p data %p len %d off %u to_copy %zu -> " diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c index 01e038146b7c..6ff97412a0bb 100644 --- a/net/rxrpc/ar-input.c +++ b/net/rxrpc/ar-input.c @@ -698,12 +698,12 @@ void rxrpc_data_ready(struct sock *sk) if (skb_checksum_complete(skb)) { rxrpc_free_skb(skb); rxrpc_put_local(local); - UDP_INC_STATS_BH(&init_net, UDP_MIB_INERRORS, 0); + __UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0); _leave(" [CSUM failed]"); return; } - UDP_INC_STATS_BH(&init_net, UDP_MIB_INDATAGRAMS, 0); + __UDP_INC_STATS(&init_net, UDP_MIB_INDATAGRAMS, 0); /* The socket buffer we have is owned by UDP, with UDP's data all over * it, but we really want our own data there. 
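The t_conn_lock changes above close a race between RDS/TCP's active connect path (rds_tcp_conn_connect) and its passive accept path (rds_tcp_accept_one) when both peers SYN each other at the same time: each path takes the mutex and rechecks the connection state before installing a socket. A minimal userspace sketch of that locking pattern, with hypothetical names standing in for the kernel structures and without the address-comparison arbitration of the real patch:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_conn {
	pthread_mutex_t conn_lock;	/* plays the role of t_conn_lock */
	bool up;			/* plays the role of RDS_CONN_UP */
	int sock;			/* plays the role of t_sock */
};

/* Active side: give up early if the passive side already won the race. */
static int fake_connect(struct fake_conn *c, int new_sock)
{
	int installed = 0;

	pthread_mutex_lock(&c->conn_lock);
	if (!c->up) {
		c->sock = new_sock;
		c->up = true;
		installed = 1;
	}
	pthread_mutex_unlock(&c->conn_lock);
	return installed;
}

/* Passive side: only install the accepted socket if the conn is still free. */
static int fake_accept_one(struct fake_conn *c, int new_sock)
{
	int installed = 0;

	pthread_mutex_lock(&c->conn_lock);
	if (!c->up) {		/* we won the dueling-SYN race */
		c->sock = new_sock;
		c->up = true;
		installed = 1;
	}			/* else: caller resets/releases new_sock */
	pthread_mutex_unlock(&c->conn_lock);
	return installed;
}

int main(void)
{
	struct fake_conn c = { .conn_lock = PTHREAD_MUTEX_INITIALIZER };

	printf("connect installed sock: %d\n", fake_connect(&c, 3));
	printf("accept installed sock:  %d\n", fake_accept_one(&c, 4));
	return 0;
}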
diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 96066665e376..336774a535c3 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -657,12 +657,15 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, if (compat_mode) { if (a->type == TCA_OLD_COMPAT) err = gnet_stats_start_copy_compat(skb, 0, - TCA_STATS, TCA_XSTATS, &p->tcfc_lock, &d); + TCA_STATS, + TCA_XSTATS, + &p->tcfc_lock, &d, + TCA_PAD); else return 0; } else err = gnet_stats_start_copy(skb, TCA_ACT_STATS, - &p->tcfc_lock, &d); + &p->tcfc_lock, &d, TCA_ACT_PAD); if (err < 0) goto errout; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 8c9f1f0459ab..c7123e01c2ca 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -53,9 +53,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); + bpf_compute_data_end(skb); filter_res = BPF_PROG_RUN(filter, skb); __skb_pull(skb, skb->mac_len); } else { + bpf_compute_data_end(skb); filter_res = BPF_PROG_RUN(filter, skb); } rcu_read_unlock(); @@ -156,7 +158,8 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, tm.lastuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.lastuse); tm.expires = jiffies_to_clock_t(prog->tcf_tm.expires); - if (nla_put(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm)) + if (nla_put_64bit(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm, + TCA_ACT_BPF_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index c0ed93ce2391..2ba700c765e0 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -163,7 +163,8 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, t.install = jiffies_to_clock_t(jiffies - ci->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(ci->tcf_tm.expires); - if (nla_put(skb, TCA_CONNMARK_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t, + TCA_CONNMARK_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index d22426cdebc0..28e934ed038a 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -549,7 +549,7 @@ static int tcf_csum_dump(struct sk_buff *skb, t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); - if (nla_put(skb, TCA_CSUM_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 887fc1f209ff..1a6e09fbb2a5 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -177,7 +177,7 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(gact->tcf_tm.expires); - if (nla_put(skb, TCA_GACT_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index c589a9ba506a..556f44c9c454 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -550,7 +550,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, t.install = jiffies_to_clock_t(jiffies - ife->tcf_tm.install); t.lastuse = 
jiffies_to_clock_t(jiffies - ife->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(ife->tcf_tm.expires); - if (nla_put(skb, TCA_IFE_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD)) goto nla_put_failure; if (!is_zero_ether_addr(ife->eth_dst)) { diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 350e134cffb3..1464f6a09446 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -275,7 +275,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install); tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse); tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires); - if (nla_put(skb, TCA_IPT_TM, sizeof (tm), &tm)) + if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD)) goto nla_put_failure; kfree(t); return skb->len; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index e8a760cf7775..dea57c1ec90c 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -214,7 +214,7 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(m->tcf_tm.expires); - if (nla_put(skb, TCA_MIRRED_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 0f65cdfbfb1d..c0a879f940de 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -267,7 +267,7 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); - if (nla_put(skb, TCA_NAT_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 429c3ab65142..c6e18f230af6 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -203,7 +203,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); - if (nla_put(skb, TCA_PEDIT_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; kfree(opt); return skb->len; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 75b2be13fbcc..2057fd56d74c 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -155,7 +155,7 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(d->tcf_tm.expires); - if (nla_put(skb, TCA_DEF_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index cfcdbdc00c9b..51b24998904f 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -167,7 +167,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - 
d->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(d->tcf_tm.expires); - if (nla_put(skb, TCA_SKBEDIT_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index bab8ae0cefc0..c1682ab9bc7e 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -175,7 +175,7 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, t.install = jiffies_to_clock_t(jiffies - v->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - v->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(v->tcf_tm.expires); - if (nla_put(skb, TCA_VLAN_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 425fe6a0eda3..7b342c779da7 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -96,9 +96,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); + bpf_compute_data_end(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { + bpf_compute_data_end(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 563cdad76448..e64877a3c084 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -1140,9 +1140,10 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, gpf->kcnts[i] += pf->kcnts[i]; } - if (nla_put(skb, TCA_U32_PCNT, - sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), - gpf)) { + if (nla_put_64bit(skb, TCA_U32_PCNT, + sizeof(struct tc_u32_pcnt) + + n->sel.nkeys * sizeof(u64), + gpf, TCA_U32_PAD)) { kfree(gpf); goto nla_put_failure; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 3b180ff72f79..64f71a2155f3 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1365,7 +1365,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, goto nla_put_failure; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, - qdisc_root_sleeping_lock(q), &d) < 0) + qdisc_root_sleeping_lock(q), &d, + TCA_PAD) < 0) goto nla_put_failure; if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) @@ -1679,7 +1680,8 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, goto nla_put_failure; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, - qdisc_root_sleeping_lock(q), &d) < 0) + qdisc_root_sleeping_lock(q), &d, + TCA_PAD) < 0) goto nla_put_failure; if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0) diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index a5e420b3d4ab..bb8bd9314629 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -59,8 +59,12 @@ struct fq_codel_sched_data { u32 flows_cnt; /* number of flows */ u32 perturbation; /* hash perturbation */ u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ + u32 drop_batch_size; + u32 memory_limit; struct codel_params cparams; struct codel_stats cstats; + u32 memory_usage; + u32 drop_overmemory; u32 drop_overlimit; u32 new_flow_count; @@ -135,17 +139,21 @@ static inline void flow_queue_add(struct fq_codel_flow *flow, skb->next = NULL; } -static unsigned int fq_codel_drop(struct Qdisc *sch) +static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets) { struct fq_codel_sched_data *q = qdisc_priv(sch); 
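/*
 * Illustrative sketch (not part of the patch): why the dump paths above
 * move from nla_put() to nla_put_64bit() with a *_PAD attribute.
 * Netlink attributes are only 4-byte aligned, so a 64-bit payload can
 * land on a 4-byte boundary; emitting a zero-length pad attribute first
 * pushes the payload to an 8-byte boundary.  The constants mirror the
 * uapi values, but the helper name below is made up for this example.
 */
#include <stdio.h>

#define NLA_HDRLEN	4	/* aligned struct nlattr header */
#define NLA_ALIGNTO	4	/* netlink attribute alignment */

/* Would a 64-bit payload written at this tail offset need a pad attr? */
static int needs_pad_for_64bit(unsigned int tail)
{
	return ((tail + NLA_HDRLEN) % 8) != 0;
}

int main(void)
{
	unsigned int tail;

	/* a 4-byte pad attribute shifts the tail so the next payload aligns */
	for (tail = 0; tail <= 20; tail += NLA_ALIGNTO)
		printf("tail %2u -> %s\n", tail,
		       needs_pad_for_64bit(tail) ? "pad needed" : "aligned");
	return 0;
}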
struct sk_buff *skb; unsigned int maxbacklog = 0, idx = 0, i, len; struct fq_codel_flow *flow; + unsigned int threshold; + unsigned int mem = 0; - /* Queue is full! Find the fat flow and drop packet from it. + /* Queue is full! Find the fat flow and drop packet(s) from it. * This might sound expensive, but with 1024 flows, we scan * 4KB of memory, and we dont need to handle a complex tree * in fast path (packet queue/enqueue) with many cache misses. + * In stress mode, we'll try to drop 64 packets from the flow, + * amortizing this linear lookup to one cache line per drop. */ for (i = 0; i < q->flows_cnt; i++) { if (q->backlogs[i] > maxbacklog) { @@ -153,15 +161,26 @@ static unsigned int fq_codel_drop(struct Qdisc *sch) idx = i; } } + + /* Our goal is to drop half of this fat flow backlog */ + threshold = maxbacklog >> 1; + flow = &q->flows[idx]; - skb = dequeue_head(flow); - len = qdisc_pkt_len(skb); + len = 0; + i = 0; + do { + skb = dequeue_head(flow); + len += qdisc_pkt_len(skb); + mem += skb->truesize; + kfree_skb(skb); + } while (++i < max_packets && len < threshold); + + flow->dropped += i; q->backlogs[idx] -= len; - sch->q.qlen--; - qdisc_qstats_drop(sch); - qdisc_qstats_backlog_dec(sch, skb); - kfree_skb(skb); - flow->dropped++; + q->memory_usage -= mem; + sch->qstats.drops += i; + sch->qstats.backlog -= len; + sch->q.qlen -= i; return idx; } @@ -170,16 +189,17 @@ static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch) unsigned int prev_backlog; prev_backlog = sch->qstats.backlog; - fq_codel_drop(sch); + fq_codel_drop(sch, 1U); return prev_backlog - sch->qstats.backlog; } static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); - unsigned int idx, prev_backlog; + unsigned int idx, prev_backlog, prev_qlen; struct fq_codel_flow *flow; int uninitialized_var(ret); + bool memory_limited; idx = fq_codel_classify(skb, sch, &ret); if (idx == 0) { @@ -202,20 +222,29 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) flow->deficit = q->quantum; flow->dropped = 0; } - if (++sch->q.qlen <= sch->limit) + q->memory_usage += skb->truesize; + memory_limited = q->memory_usage > q->memory_limit; + if (++sch->q.qlen <= sch->limit && !memory_limited) return NET_XMIT_SUCCESS; prev_backlog = sch->qstats.backlog; - q->drop_overlimit++; - /* Return Congestion Notification only if we dropped a packet - * from this flow. + prev_qlen = sch->q.qlen; + + /* fq_codel_drop() is quite expensive, as it performs a linear search + * in q->backlogs[] to find a fat flow. + * So instead of dropping a single packet, drop half of its backlog + * with a 64 packets limit to not add a too big cpu spike here. */ - if (fq_codel_drop(sch) == idx) - return NET_XMIT_CN; + ret = fq_codel_drop(sch, q->drop_batch_size); + + q->drop_overlimit += prev_qlen - sch->q.qlen; + if (memory_limited) + q->drop_overmemory += prev_qlen - sch->q.qlen; + /* As we dropped packet(s), better let upper stack know this */ + qdisc_tree_reduce_backlog(sch, prev_qlen - sch->q.qlen, + prev_backlog - sch->qstats.backlog); - /* As we dropped a packet, better let upper stack know this */ - qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog); - return NET_XMIT_SUCCESS; + return ret == idx ? 
NET_XMIT_CN : NET_XMIT_SUCCESS; } /* This is the specific function called from codel_dequeue() @@ -289,6 +318,7 @@ begin: list_del_init(&flow->flowchain); goto begin; } + q->memory_usage -= skb->truesize; qdisc_bstats_update(sch, skb); flow->deficit -= qdisc_pkt_len(skb); /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0, @@ -335,6 +365,8 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 }, [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 }, [TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_DROP_BATCH_SIZE] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_MEMORY_LIMIT] = { .type = NLA_U32 }, }; static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) @@ -386,7 +418,14 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_FQ_CODEL_QUANTUM]) q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); - while (sch->q.qlen > sch->limit) { + if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]) + q->drop_batch_size = min(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])); + + if (tb[TCA_FQ_CODEL_MEMORY_LIMIT]) + q->memory_limit = min(1U << 31, nla_get_u32(tb[TCA_FQ_CODEL_MEMORY_LIMIT])); + + while (sch->q.qlen > sch->limit || + q->memory_usage > q->memory_limit) { struct sk_buff *skb = fq_codel_dequeue(sch); q->cstats.drop_len += qdisc_pkt_len(skb); @@ -431,6 +470,8 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) sch->limit = 10*1024; q->flows_cnt = 1024; + q->memory_limit = 32 << 20; /* 32 MBytes */ + q->drop_batch_size = 64; q->quantum = psched_mtu(qdisc_dev(sch)); q->perturbation = prandom_u32(); INIT_LIST_HEAD(&q->new_flows); @@ -489,6 +530,10 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) q->cparams.ecn) || nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM, q->quantum) || + nla_put_u32(skb, TCA_FQ_CODEL_DROP_BATCH_SIZE, + q->drop_batch_size) || + nla_put_u32(skb, TCA_FQ_CODEL_MEMORY_LIMIT, + q->memory_limit) || nla_put_u32(skb, TCA_FQ_CODEL_FLOWS, q->flows_cnt)) goto nla_put_failure; @@ -517,6 +562,8 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.qdisc_stats.ecn_mark = q->cstats.ecn_mark; st.qdisc_stats.new_flow_count = q->new_flow_count; st.qdisc_stats.ce_mark = q->cstats.ce_mark; + st.qdisc_stats.memory_usage = q->memory_usage; + st.qdisc_stats.drop_overmemory = q->drop_overmemory; list_for_each(pos, &q->new_flows) st.qdisc_stats.new_flows_len++; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 80742edea96f..269dd71b3828 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -108,35 +108,6 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, return skb; } -static inline int handle_dev_cpu_collision(struct sk_buff *skb, - struct netdev_queue *dev_queue, - struct Qdisc *q) -{ - int ret; - - if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) { - /* - * Same CPU holding the lock. It may be a transient - * configuration error, when hard_start_xmit() recurses. We - * detect it by checking xmit owner and drop the packet when - * deadloop is detected. Return OK to try the next skb. - */ - kfree_skb_list(skb); - net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n", - dev_queue->dev->name); - ret = qdisc_qlen(q); - } else { - /* - * Another cpu is holding lock, requeue & delay xmits for - * some time. 
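/*
 * Illustrative userspace sketch (not part of the patch): the batched
 * drop that fq_codel_drop() now performs above.  One linear scan finds
 * the "fat" flow, then up to max_packets packets are dropped from its
 * head until roughly half of that flow's byte backlog is gone.  The
 * data layout is simplified to plain arrays of packet sizes per flow.
 */
#include <stdio.h>

#define FLOWS 4
#define PKTS  8

static unsigned int backlog[FLOWS];		/* bytes queued per flow */
static unsigned int pkt_len[FLOWS][PKTS];	/* head-to-tail packet sizes */
static unsigned int head[FLOWS];		/* index of each flow's head */

static unsigned int fat_flow_drop(unsigned int max_packets)
{
	unsigned int maxbacklog = 0, idx = 0, threshold, len = 0, i = 0, f;

	for (f = 0; f < FLOWS; f++)		/* find the fattest flow */
		if (backlog[f] > maxbacklog) {
			maxbacklog = backlog[f];
			idx = f;
		}

	threshold = maxbacklog >> 1;		/* aim to drop half of it */
	do {
		len += pkt_len[idx][head[idx]++];
	} while (++i < max_packets && len < threshold && head[idx] < PKTS);

	backlog[idx] -= len;
	printf("dropped %u packets (%u bytes) from flow %u\n", i, len, idx);
	return idx;
}

int main(void)
{
	unsigned int f, p;

	for (f = 0; f < FLOWS; f++)
		for (p = 0; p < PKTS; p++) {
			pkt_len[f][p] = 100 * (f + 1);
			backlog[f] += pkt_len[f][p];
		}
	fat_flow_drop(64);			/* same batch limit as the patch */
	return 0;
}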
- */ - __this_cpu_inc(softnet_data.cpu_collision); - ret = dev_requeue_skb(skb, q); - } - - return ret; -} - /* * Transmit possibly several skbs, and handle the return status as * required. Holding the __QDISC___STATE_RUNNING bit guarantees that @@ -174,9 +145,6 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, if (dev_xmit_complete(ret)) { /* Driver sent out skb successfully or skb was consumed */ ret = qdisc_qlen(q); - } else if (ret == NETDEV_TX_LOCKED) { - /* Driver try lock failed */ - ret = handle_dev_cpu_collision(skb, txq, q); } else { /* Driver returned NETDEV_TX_BUSY - requeue skb */ if (unlikely(ret != NETDEV_TX_BUSY)) @@ -259,13 +227,12 @@ unsigned long dev_trans_start(struct net_device *dev) if (is_vlan_dev(dev)) dev = vlan_dev_real_dev(dev); - res = dev->trans_start; - for (i = 0; i < dev->num_tx_queues; i++) { + res = netdev_get_tx_queue(dev, 0)->trans_start; + for (i = 1; i < dev->num_tx_queues; i++) { val = netdev_get_tx_queue(dev, i)->trans_start; if (val && time_after(val, res)) res = val; } - dev->trans_start = res; return res; } @@ -288,10 +255,7 @@ static void dev_watchdog(unsigned long arg) struct netdev_queue *txq; txq = netdev_get_tx_queue(dev, i); - /* - * old device drivers set dev->trans_start - */ - trans_start = txq->trans_start ? : dev->trans_start; + trans_start = txq->trans_start; if (netif_xmit_stopped(txq) && time_after(jiffies, (trans_start + dev->watchdog_timeo))) { @@ -807,7 +771,7 @@ void dev_activate(struct net_device *dev) transition_one_qdisc(dev, dev_ingress_queue(dev), NULL); if (need_watchdog) { - dev->trans_start = jiffies; + netif_trans_update(dev); dev_watchdog_up(dev); } } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 491d6fd6430c..205bed00dd34 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -395,6 +395,25 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) sch->q.qlen++; } +/* netem can't properly corrupt a megapacket (like we get from GSO), so instead + * when we statistically choose to corrupt one, we instead segment it, returning + * the first packet to be corrupted, and re-enqueue the remaining frames + */ +static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch) +{ + struct sk_buff *segs; + netdev_features_t features = netif_skb_features(skb); + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + + if (IS_ERR_OR_NULL(segs)) { + qdisc_reshape_fail(skb, sch); + return NULL; + } + consume_skb(skb); + return segs; +} + /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. @@ -407,7 +426,11 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* We don't fill cb now as skb_unshare() may invalidate it */ struct netem_skb_cb *cb; struct sk_buff *skb2; + struct sk_buff *segs = NULL; + unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb); + int nb = 0; int count = 1; + int rc = NET_XMIT_SUCCESS; /* Random duplication */ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) @@ -453,10 +476,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) * do it now in software before we mangle it. 
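/*
 * Illustrative userspace sketch (not part of the patch): netem's
 * corruption branch can no longer mangle a GSO super-packet directly,
 * so the hunk below first segments it (netem_segment()) and then flips
 * one random bit in the first resulting frame, via
 * data[prandom_u32() % headlen] ^= 1 << (prandom_u32() % 8).
 * Sizes and names here are made up for the example.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define MSS 1500

static void corrupt_one_bit(unsigned char *data, size_t len)
{
	size_t byte = (size_t)rand() % len;
	unsigned int bit = (unsigned int)rand() % 8;

	data[byte] ^= 1u << bit;
}

int main(void)
{
	unsigned char super[3000];	/* stand-in for a GSO super-packet */
	size_t off, seg = 0;

	srand((unsigned int)time(NULL));
	memset(super, 0, sizeof(super));

	/* "segment" the super-packet and corrupt only the first frame */
	for (off = 0; off < sizeof(super); off += MSS, seg++) {
		size_t len = sizeof(super) - off < MSS ? sizeof(super) - off : MSS;

		if (seg == 0)
			corrupt_one_bit(super + off, len);
		printf("segment %zu: %zu bytes%s\n", seg, len,
		       seg == 0 ? " (corrupted)" : "");
	}
	return 0;
}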
*/ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { + if (skb_is_gso(skb)) { + segs = netem_segment(skb, sch); + if (!segs) + return NET_XMIT_DROP; + } else { + segs = skb; + } + + skb = segs; + segs = segs->next; + if (!(skb = skb_unshare(skb, GFP_ATOMIC)) || (skb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_help(skb))) - return qdisc_drop(skb, sch); + skb_checksum_help(skb))) { + rc = qdisc_drop(skb, sch); + goto finish_segs; + } skb->data[prandom_u32() % skb_headlen(skb)] ^= 1<<(prandom_u32() % 8); @@ -516,6 +552,27 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) sch->qstats.requeues++; } +finish_segs: + if (segs) { + while (segs) { + skb2 = segs->next; + segs->next = NULL; + qdisc_skb_cb(segs)->pkt_len = segs->len; + last_len = segs->len; + rc = qdisc_enqueue(segs, sch); + if (rc != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(rc)) + qdisc_qstats_drop(sch); + } else { + nb++; + len += last_len; + } + segs = skb2; + } + sch->q.qlen += nb; + if (nb > 1) + qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len); + } return NET_XMIT_SUCCESS; } diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 958ef5f33f4b..1eb94bf18ef4 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -239,7 +239,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, offset = 0; if ((whole > 1) || (whole && over)) - SCTP_INC_STATS_USER(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS); + SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS); /* Create chunks for all the full sized DATA chunks. */ for (i = 0, len = first_len; i < whole; i++) { diff --git a/net/sctp/input.c b/net/sctp/input.c index 00b8445364e3..a701527a9480 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -84,7 +84,7 @@ static inline int sctp_rcv_checksum(struct net *net, struct sk_buff *skb) if (val != cmp) { /* CRC failure, dump it. */ - SCTP_INC_STATS_BH(net, SCTP_MIB_CHECKSUMERRORS); + __SCTP_INC_STATS(net, SCTP_MIB_CHECKSUMERRORS); return -1; } return 0; @@ -122,7 +122,7 @@ int sctp_rcv(struct sk_buff *skb) if (skb->pkt_type != PACKET_HOST) goto discard_it; - SCTP_INC_STATS_BH(net, SCTP_MIB_INSCTPPACKS); + __SCTP_INC_STATS(net, SCTP_MIB_INSCTPPACKS); if (skb_linearize(skb)) goto discard_it; @@ -208,7 +208,7 @@ int sctp_rcv(struct sk_buff *skb) */ if (!asoc) { if (sctp_rcv_ootb(skb)) { - SCTP_INC_STATS_BH(net, SCTP_MIB_OUTOFBLUES); + __SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES); goto discard_release; } } @@ -264,9 +264,9 @@ int sctp_rcv(struct sk_buff *skb) skb = NULL; /* sctp_chunk_free already freed the skb */ goto discard_release; } - SCTP_INC_STATS_BH(net, SCTP_MIB_IN_PKT_BACKLOG); + __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_BACKLOG); } else { - SCTP_INC_STATS_BH(net, SCTP_MIB_IN_PKT_SOFTIRQ); + __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_SOFTIRQ); sctp_inq_push(&chunk->rcvr->inqueue, chunk); } @@ -281,7 +281,7 @@ int sctp_rcv(struct sk_buff *skb) return 0; discard_it: - SCTP_INC_STATS_BH(net, SCTP_MIB_IN_PKT_DISCARDS); + __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_DISCARDS); kfree_skb(skb); return 0; @@ -532,7 +532,7 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, * servers this needs to be solved differently. 
*/ if (sock_owned_by_user(sk)) - NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); + __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); *app = asoc; *tpp = transport; @@ -589,7 +589,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) skb->network_header = saveip; skb->transport_header = savesctp; if (!sk) { - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; } /* Warning: The sock lock is held. Remember to call diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index b335ffcef0b9..9d87bba0ff1d 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -89,10 +89,12 @@ void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *chunk) * Eventually, we should clean up inqueue to not rely * on the BH related data structures. */ + local_bh_disable(); list_add_tail(&chunk->list, &q->in_chunk_list); if (chunk->asoc) chunk->asoc->stats.ipackets++; q->immediate.func(&q->immediate); + local_bh_enable(); } /* Peek at the next chunk on the inqeue. */ diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index ce46f1c7f133..0657d18a85bf 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -162,7 +162,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, skb->network_header = saveip; skb->transport_header = savesctp; if (!sk) { - ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(net, idev, ICMP6_MIB_INERRORS); goto out; } diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index bb2d8d9608e9..8e3e769dc9ea 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -145,7 +145,11 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, else amt = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_WMEM_ALLOC] = amt; - mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); + if (asoc && asoc->ep->rcvbuf_policy) + amt = atomic_read(&asoc->rmem_alloc); + else + amt = sk_rmem_alloc_get(sk); + mem[SK_MEMINFO_RMEM_ALLOC] = amt; mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; @@ -161,8 +165,9 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, if (ext & (1 << (INET_DIAG_INFO - 1))) { struct nlattr *attr; - attr = nla_reserve(skb, INET_DIAG_INFO, - sizeof(struct sctp_info)); + attr = nla_reserve_64bit(skb, INET_DIAG_INFO, + sizeof(struct sctp_info), + INET_DIAG_PAD); if (!attr) goto errout; diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index e8f0112f9b28..aa3712259368 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1741,10 +1741,9 @@ out: } else if (local_cork) error = sctp_outq_uncork(&asoc->outqueue, gfp); - if (sp->pending_data_ready) { - sk->sk_data_ready(sk); - sp->pending_data_ready = 0; - } + if (sp->data_ready_signalled) + sp->data_ready_signalled = 0; + return error; nomem: error = -ENOMEM; diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index ec12a8920e5f..ec166d2bd2d9 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -194,6 +194,7 @@ static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq) int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sock *sk = ulpq->asoc->base.sk; + struct sctp_sock *sp = sctp_sk(sk); struct sk_buff_head *queue, *skb_list; struct sk_buff *skb = sctp_event2skb(event); int clear_pd = 0; @@ -211,7 +212,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) sk_incoming_cpu_update(sk); } /* Check if the user wishes to receive this event. 
*/ - if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe)) + if (!sctp_ulpevent_is_enabled(event, &sp->subscribe)) goto out_free; /* If we are in partial delivery mode, post to the lobby until @@ -219,7 +220,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) * the association the cause of the partial delivery. */ - if (atomic_read(&sctp_sk(sk)->pd_mode) == 0) { + if (atomic_read(&sp->pd_mode) == 0) { queue = &sk->sk_receive_queue; } else { if (ulpq->pd_mode) { @@ -231,7 +232,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) if ((event->msg_flags & MSG_NOTIFICATION) || (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK))) - queue = &sctp_sk(sk)->pd_lobby; + queue = &sp->pd_lobby; else { clear_pd = event->msg_flags & MSG_EOR; queue = &sk->sk_receive_queue; @@ -242,10 +243,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) * can queue this to the receive queue instead * of the lobby. */ - if (sctp_sk(sk)->frag_interleave) + if (sp->frag_interleave) queue = &sk->sk_receive_queue; else - queue = &sctp_sk(sk)->pd_lobby; + queue = &sp->pd_lobby; } } @@ -264,8 +265,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) if (clear_pd) sctp_ulpq_clear_pd(ulpq); - if (queue == &sk->sk_receive_queue) - sctp_sk(sk)->pending_data_ready = 1; + if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) { + sp->data_ready_signalled = 1; + sk->sk_data_ready(sk); + } return 1; out_free: @@ -1126,11 +1129,13 @@ void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_ulpevent *ev = NULL; struct sock *sk; + struct sctp_sock *sp; if (!ulpq->pd_mode) return; sk = ulpq->asoc->base.sk; + sp = sctp_sk(sk); if (sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT, &sctp_sk(sk)->subscribe)) ev = sctp_ulpevent_make_pdapi(ulpq->asoc, @@ -1140,6 +1145,8 @@ void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); /* If there is data waiting, send it up the socket now. */ - if (sctp_ulpq_clear_pd(ulpq) || ev) - sctp_sk(sk)->pending_data_ready = 1; + if ((sctp_ulpq_clear_pd(ulpq) || ev) && !sp->data_ready_signalled) { + sp->data_ready_signalled = 1; + sk->sk_data_ready(sk); + } } diff --git a/net/socket.c b/net/socket.c index 5dbb0bbe12a7..7789d79609dd 100644 --- a/net/socket.c +++ b/net/socket.c @@ -600,9 +600,6 @@ void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags) if (tsflags & SOF_TIMESTAMPING_TX_SCHED) flags |= SKBTX_SCHED_TSTAMP; - if (tsflags & SOF_TIMESTAMPING_TX_ACK) - flags |= SKBTX_ACK_TSTAMP; - *tx_flags = flags; } EXPORT_SYMBOL(__sock_tx_timestamp); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index d0756ac5c0f2..a6c68dc086af 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1018,11 +1018,11 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, /* Suck it into the iovec, verify checksum if not done by hw. 
*/ if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) { - UDPX_INC_STATS_BH(sk, UDP_MIB_INERRORS); + __UDPX_INC_STATS(sk, UDP_MIB_INERRORS); goto out_unlock; } - UDPX_INC_STATS_BH(sk, UDP_MIB_INDATAGRAMS); + __UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS); xprt_adjust_cwnd(xprt, task, copied); xprt_complete_rqst(task, copied); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 2b9b98f1c2ff..b7e01d88bdc5 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -305,6 +305,8 @@ static void switchdev_port_attr_set_deferred(struct net_device *dev, if (err && err != -EOPNOTSUPP) netdev_err(dev, "failed (err=%d) to set attribute (id=%d)\n", err, attr->id); + if (attr->complete) + attr->complete(dev, err, attr->complete_priv); } static int switchdev_port_attr_set_defer(struct net_device *dev, @@ -434,6 +436,8 @@ static void switchdev_port_obj_add_deferred(struct net_device *dev, if (err && err != -EOPNOTSUPP) netdev_err(dev, "failed (err=%d) to add object (id=%d)\n", err, obj->id); + if (obj->complete) + obj->complete(dev, err, obj->complete_priv); } static int switchdev_port_obj_add_defer(struct net_device *dev, @@ -502,6 +506,8 @@ static void switchdev_port_obj_del_deferred(struct net_device *dev, if (err && err != -EOPNOTSUPP) netdev_err(dev, "failed (err=%d) to del object (id=%d)\n", err, obj->id); + if (obj->complete) + obj->complete(dev, err, obj->complete_priv); } static int switchdev_port_obj_del_defer(struct net_device *dev, diff --git a/net/tipc/core.c b/net/tipc/core.c index e2bdb07a49a2..fe1b062c4f18 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -112,11 +112,9 @@ static int __init tipc_init(void) pr_info("Activated (version " TIPC_MOD_VER ")\n"); - sysctl_tipc_rmem[0] = TIPC_CONN_OVERLOAD_LIMIT >> 4 << - TIPC_LOW_IMPORTANCE; - sysctl_tipc_rmem[1] = TIPC_CONN_OVERLOAD_LIMIT >> 4 << - TIPC_CRITICAL_IMPORTANCE; - sysctl_tipc_rmem[2] = TIPC_CONN_OVERLOAD_LIMIT; + sysctl_tipc_rmem[0] = RCVBUF_MIN; + sysctl_tipc_rmem[1] = RCVBUF_DEF; + sysctl_tipc_rmem[2] = RCVBUF_MAX; err = tipc_netlink_start(); if (err) diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 58bf51541813..024da8af91f0 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -743,16 +743,26 @@ static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n) msg_set_bits(m, 9, 16, 0xffff, n); } -static inline u32 msg_bcast_tag(struct tipc_msg *m) +static inline u32 msg_conn_ack(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } -static inline void msg_set_bcast_tag(struct tipc_msg *m, u32 n) +static inline void msg_set_conn_ack(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 16, 0xffff, n); } +static inline u32 msg_adv_win(struct tipc_msg *m) +{ + return msg_bits(m, 9, 0, 0xffff); +} + +static inline void msg_set_adv_win(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 9, 0, 0xffff, n); +} + static inline u32 msg_max_pkt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff) * 4; diff --git a/net/tipc/node.c b/net/tipc/node.c index 68d9f7b8485c..d903f560e2fd 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1,7 +1,7 @@ /* * net/tipc/node.c: TIPC node management routines * - * Copyright (c) 2000-2006, 2012-2015, Ericsson AB + * Copyright (c) 2000-2006, 2012-2016, Ericsson AB * Copyright (c) 2005-2006, 2010-2014, Wind River Systems * All rights reserved. 
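/*
 * Illustrative sketch (not part of the patch): the msg_conn_ack()/
 * msg_adv_win() accessors added to net/tipc/msg.h above pack two
 * 16-bit values into header word 9 -- the acknowledged count in the
 * upper half and the advertised receive window in the lower half.
 * Byte-order handling is omitted; this only shows the mask/shift
 * pattern of msg_bits()/msg_set_bits().
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t get_bits(uint32_t word, unsigned int pos, uint32_t mask)
{
	return (word >> pos) & mask;
}

static uint32_t set_bits(uint32_t word, unsigned int pos, uint32_t mask,
			 uint32_t val)
{
	word &= ~(mask << pos);
	return word | ((val & mask) << pos);
}

int main(void)
{
	uint32_t word9 = 0;

	word9 = set_bits(word9, 16, 0xffff, 12);	/* conn_ack = 12 */
	word9 = set_bits(word9, 0, 0xffff, 850);	/* adv_win = 850 */

	printf("conn_ack=%u adv_win=%u word9=0x%08x\n",
	       get_bits(word9, 16, 0xffff),
	       get_bits(word9, 0, 0xffff), word9);
	return 0;
}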
* @@ -191,6 +191,20 @@ int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) tipc_node_put(n); return mtu; } + +u16 tipc_node_get_capabilities(struct net *net, u32 addr) +{ + struct tipc_node *n; + u16 caps; + + n = tipc_node_find(net, addr); + if (unlikely(!n)) + return TIPC_NODE_CAPABILITIES; + caps = n->capabilities; + tipc_node_put(n); + return caps; +} + /* * A trivial power-of-two bitmask technique is used for speed, since this * operation is done for every incoming TIPC packet. The number of hash table @@ -304,8 +318,11 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) spin_lock_bh(&tn->node_list_lock); n = tipc_node_find(net, addr); - if (n) + if (n) { + /* Same node may come back with new capabilities */ + n->capabilities = capabilities; goto exit; + } n = kzalloc(sizeof(*n), GFP_ATOMIC); if (!n) { pr_warn("Node creation failed, no memory\n"); @@ -554,6 +571,7 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, *slot1 = bearer_id; tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT); n->action_flags |= TIPC_NOTIFY_NODE_UP; + tipc_link_set_active(nl, true); tipc_bcast_add_peer(n->net, nl, xmitq); return; } @@ -1451,6 +1469,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) int bearer_id = b->identity; struct tipc_link_entry *le; u16 bc_ack = msg_bcast_ack(hdr); + u32 self = tipc_own_addr(net); int rc = 0; __skb_queue_head_init(&xmitq); @@ -1467,6 +1486,10 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) return tipc_node_bc_rcv(net, skb, bearer_id); } + /* Discard unicast link messages destined for another node */ + if (unlikely(!msg_short(hdr) && (msg_destnode(hdr) != self))) + goto discard; + /* Locate neighboring node that sent packet */ n = tipc_node_find(net, msg_prevnode(hdr)); if (unlikely(!n)) diff --git a/net/tipc/node.h b/net/tipc/node.h index f39d9d06e8bb..8264b3d97dc4 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -45,10 +45,11 @@ /* Optional capabilities supported by this code version */ enum { - TIPC_BCAST_SYNCH = (1 << 1) + TIPC_BCAST_SYNCH = (1 << 1), + TIPC_BLOCK_FLOWCTL = (2 << 1) }; -#define TIPC_NODE_CAPABILITIES TIPC_BCAST_SYNCH +#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | TIPC_BLOCK_FLOWCTL) #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); @@ -70,6 +71,7 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel); +u16 tipc_node_get_capabilities(struct net *net, u32 addr); int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3eeb50a27b89..12628890c219 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -96,8 +96,11 @@ struct tipc_sock { uint conn_timeout; atomic_t dupl_rcvcnt; bool link_cong; - uint sent_unacked; - uint rcv_unacked; + u16 snt_unacked; + u16 snd_win; + u16 peer_caps; + u16 rcv_unacked; + u16 rcv_win; struct sockaddr_tipc remote; struct rhash_head node; struct rcu_head rcu; @@ -227,9 +230,29 @@ static struct tipc_sock *tipc_sk(const struct sock *sk) return container_of(sk, struct tipc_sock, sk); } -static int tsk_conn_cong(struct tipc_sock *tsk) +static bool 
tsk_conn_cong(struct tipc_sock *tsk) { - return tsk->sent_unacked >= TIPC_FLOWCTRL_WIN; + return tsk->snt_unacked >= tsk->snd_win; +} + +/* tsk_blocks(): translate a buffer size in bytes to number of + * advertisable blocks, taking into account the ratio truesize(len)/len + * We can trust that this ratio is always < 4 for len >= FLOWCTL_BLK_SZ + */ +static u16 tsk_adv_blocks(int len) +{ + return len / FLOWCTL_BLK_SZ / 4; +} + +/* tsk_inc(): increment counter for sent or received data + * - If block based flow control is not supported by peer we + * fall back to message based ditto, incrementing the counter + */ +static u16 tsk_inc(struct tipc_sock *tsk, int msglen) +{ + if (likely(tsk->peer_caps & TIPC_BLOCK_FLOWCTL)) + return ((msglen / FLOWCTL_BLK_SZ) + 1); + return 1; } /** @@ -377,9 +400,12 @@ static int tipc_sk_create(struct net *net, struct socket *sock, sk->sk_write_space = tipc_write_space; sk->sk_destruct = tipc_sock_destruct; tsk->conn_timeout = CONN_TIMEOUT_DEFAULT; - tsk->sent_unacked = 0; atomic_set(&tsk->dupl_rcvcnt, 0); + /* Start out with safe limits until we receive an advertised window */ + tsk->snd_win = tsk_adv_blocks(RCVBUF_MIN); + tsk->rcv_win = tsk->snd_win; + if (sock->state == SS_READY) { tsk_set_unreturnable(tsk, true); if (sock->type == SOCK_DGRAM) @@ -775,7 +801,7 @@ static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb) struct sock *sk = &tsk->sk; struct tipc_msg *hdr = buf_msg(skb); int mtyp = msg_type(hdr); - int conn_cong; + bool conn_cong; /* Ignore if connection cannot be validated: */ if (!tsk_peer_msg(tsk, hdr)) @@ -789,7 +815,9 @@ static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb) return; } else if (mtyp == CONN_ACK) { conn_cong = tsk_conn_cong(tsk); - tsk->sent_unacked -= msg_msgcnt(hdr); + tsk->snt_unacked -= msg_conn_ack(hdr); + if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) + tsk->snd_win = msg_adv_win(hdr); if (conn_cong) sk->sk_write_space(sk); } else if (mtyp != CONN_PROBE_REPLY) { @@ -1020,12 +1048,14 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) u32 dnode; uint mtu, send, sent = 0; struct iov_iter save; + int hlen = MIN_H_SIZE; /* Handle implied connection establishment */ if (unlikely(dest)) { rc = __tipc_sendmsg(sock, m, dsz); + hlen = msg_hdr_sz(mhdr); if (dsz && (dsz == rc)) - tsk->sent_unacked = 1; + tsk->snt_unacked = tsk_inc(tsk, dsz + hlen); return rc; } if (dsz > (uint)INT_MAX) @@ -1054,7 +1084,7 @@ next: if (likely(!tsk_conn_cong(tsk))) { rc = tipc_node_xmit(net, &pktchain, dnode, portid); if (likely(!rc)) { - tsk->sent_unacked++; + tsk->snt_unacked += tsk_inc(tsk, send + hlen); sent += send; if (sent == dsz) return dsz; @@ -1118,6 +1148,13 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, sk_reset_timer(sk, &sk->sk_timer, jiffies + tsk->probing_intv); tipc_node_add_conn(net, peer_node, tsk->portid, peer_port); tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid); + tsk->peer_caps = tipc_node_get_capabilities(net, peer_node); + if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) + return; + + /* Fall back to message based flow control */ + tsk->rcv_win = FLOWCTL_MSG_WIN; + tsk->snd_win = FLOWCTL_MSG_WIN; } /** @@ -1214,7 +1251,7 @@ static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg, return 0; } -static void tipc_sk_send_ack(struct tipc_sock *tsk, uint ack) +static void tipc_sk_send_ack(struct tipc_sock *tsk) { struct net *net = sock_net(&tsk->sk); struct sk_buff *skb = NULL; @@ -1230,7 +1267,14 @@ static void 
tipc_sk_send_ack(struct tipc_sock *tsk, uint ack) if (!skb) return; msg = buf_msg(skb); - msg_set_msgcnt(msg, ack); + msg_set_conn_ack(msg, tsk->rcv_unacked); + tsk->rcv_unacked = 0; + + /* Adjust to and advertize the correct window limit */ + if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) { + tsk->rcv_win = tsk_adv_blocks(tsk->sk.sk_rcvbuf); + msg_set_adv_win(msg, tsk->rcv_win); + } tipc_node_xmit_skb(net, skb, dnode, msg_link_selector(msg)); } @@ -1288,7 +1332,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buf_len, long timeo; unsigned int sz; u32 err; - int res; + int res, hlen; /* Catch invalid receive requests */ if (unlikely(!buf_len)) @@ -1313,6 +1357,7 @@ restart: buf = skb_peek(&sk->sk_receive_queue); msg = buf_msg(buf); sz = msg_data_sz(msg); + hlen = msg_hdr_sz(msg); err = msg_errcode(msg); /* Discard an empty non-errored message & try again */ @@ -1335,7 +1380,7 @@ restart: sz = buf_len; m->msg_flags |= MSG_TRUNC; } - res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg), m, sz); + res = skb_copy_datagram_msg(buf, hlen, m, sz); if (res) goto exit; res = sz; @@ -1347,15 +1392,15 @@ restart: res = -ECONNRESET; } - /* Consume received message (optional) */ - if (likely(!(flags & MSG_PEEK))) { - if ((sock->state != SS_READY) && - (++tsk->rcv_unacked >= TIPC_CONNACK_INTV)) { - tipc_sk_send_ack(tsk, tsk->rcv_unacked); - tsk->rcv_unacked = 0; - } - tsk_advance_rx_queue(sk); + if (unlikely(flags & MSG_PEEK)) + goto exit; + + if (likely(sock->state != SS_READY)) { + tsk->rcv_unacked += tsk_inc(tsk, hlen + sz); + if (unlikely(tsk->rcv_unacked >= (tsk->rcv_win / 4))) + tipc_sk_send_ack(tsk); } + tsk_advance_rx_queue(sk); exit: release_sock(sk); return res; @@ -1384,7 +1429,7 @@ static int tipc_recv_stream(struct socket *sock, struct msghdr *m, int sz_to_copy, target, needed; int sz_copied = 0; u32 err; - int res = 0; + int res = 0, hlen; /* Catch invalid receive attempts */ if (unlikely(!buf_len)) @@ -1410,6 +1455,7 @@ restart: buf = skb_peek(&sk->sk_receive_queue); msg = buf_msg(buf); sz = msg_data_sz(msg); + hlen = msg_hdr_sz(msg); err = msg_errcode(msg); /* Discard an empty non-errored message & try again */ @@ -1434,8 +1480,7 @@ restart: needed = (buf_len - sz_copied); sz_to_copy = (sz <= needed) ? 
sz : needed; - res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg) + offset, - m, sz_to_copy); + res = skb_copy_datagram_msg(buf, hlen + offset, m, sz_to_copy); if (res) goto exit; @@ -1457,20 +1502,18 @@ restart: res = -ECONNRESET; } - /* Consume received message (optional) */ - if (likely(!(flags & MSG_PEEK))) { - if (unlikely(++tsk->rcv_unacked >= TIPC_CONNACK_INTV)) { - tipc_sk_send_ack(tsk, tsk->rcv_unacked); - tsk->rcv_unacked = 0; - } - tsk_advance_rx_queue(sk); - } + if (unlikely(flags & MSG_PEEK)) + goto exit; + + tsk->rcv_unacked += tsk_inc(tsk, hlen + sz); + if (unlikely(tsk->rcv_unacked >= (tsk->rcv_win / 4))) + tipc_sk_send_ack(tsk); + tsk_advance_rx_queue(sk); /* Loop around if more data is required */ if ((sz_copied < buf_len) && /* didn't get all requested data */ (!skb_queue_empty(&sk->sk_receive_queue) || (sz_copied < target)) && /* and more is ready or required */ - (!(flags & MSG_PEEK)) && /* and aren't just peeking at data */ (!err)) /* and haven't reached a FIN */ goto restart; @@ -1602,30 +1645,33 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) /** * rcvbuf_limit - get proper overload limit of socket receive queue * @sk: socket - * @buf: message + * @skb: message * - * For all connection oriented messages, irrespective of importance, - * the default overload value (i.e. 67MB) is set as limit. + * For connection oriented messages, irrespective of importance, + * default queue limit is 2 MB. * - * For all connectionless messages, by default new queue limits are - * as belows: + * For connectionless messages, queue limits are based on message + * importance as follows: * - * TIPC_LOW_IMPORTANCE (4 MB) - * TIPC_MEDIUM_IMPORTANCE (8 MB) - * TIPC_HIGH_IMPORTANCE (16 MB) - * TIPC_CRITICAL_IMPORTANCE (32 MB) + * TIPC_LOW_IMPORTANCE (2 MB) + * TIPC_MEDIUM_IMPORTANCE (4 MB) + * TIPC_HIGH_IMPORTANCE (8 MB) + * TIPC_CRITICAL_IMPORTANCE (16 MB) * * Returns overload limit according to corresponding message importance */ -static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf) +static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) { - struct tipc_msg *msg = buf_msg(buf); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_msg *hdr = buf_msg(skb); + + if (unlikely(!msg_connected(hdr))) + return sk->sk_rcvbuf << msg_importance(hdr); - if (msg_connected(msg)) - return sysctl_tipc_rmem[2]; + if (likely(tsk->peer_caps & TIPC_BLOCK_FLOWCTL)) + return sk->sk_rcvbuf; - return sk->sk_rcvbuf >> TIPC_CRITICAL_IMPORTANCE << - msg_importance(msg); + return FLOWCTL_MSG_LIM; } /** @@ -1748,7 +1794,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, /* Try backlog, compensating for double-counted bytes */ dcnt = &tipc_sk(sk)->dupl_rcvcnt; - if (sk->sk_backlog.len) + if (!sk->sk_backlog.len) atomic_set(dcnt, 0); lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt); if (likely(!sk_add_backlog(sk, skb, lim))) diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 4241f22069dc..06fb5944cf76 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -1,6 +1,6 @@ /* net/tipc/socket.h: Include file for TIPC socket code * - * Copyright (c) 2014-2015, Ericsson AB + * Copyright (c) 2014-2016, Ericsson AB * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -38,10 +38,17 @@ #include <net/sock.h> #include <net/genetlink.h> -#define TIPC_CONNACK_INTV 256 -#define TIPC_FLOWCTRL_WIN (TIPC_CONNACK_INTV * 2) -#define TIPC_CONN_OVERLOAD_LIMIT ((TIPC_FLOWCTRL_WIN * 2 + 1) * \ - SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE)) +/* Compatibility values for deprecated message based flow control */ +#define FLOWCTL_MSG_WIN 512 +#define FLOWCTL_MSG_LIM ((FLOWCTL_MSG_WIN * 2 + 1) * SKB_TRUESIZE(MAX_MSG_SIZE)) + +#define FLOWCTL_BLK_SZ 1024 + +/* Socket receive buffer sizes */ +#define RCVBUF_MIN (FLOWCTL_BLK_SZ * 512) +#define RCVBUF_DEF (FLOWCTL_BLK_SZ * 1024 * 2) +#define RCVBUF_MAX (FLOWCTL_BLK_SZ * 1024 * 16) + int tipc_socket_init(void); void tipc_socket_stop(void); void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq); diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 79de588c7bd6..0dd02244e21d 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -326,8 +326,7 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid, return tipc_subscrp_cancel(s, subscriber); } - if (s) - tipc_subscrp_subscribe(net, s, subscriber, swap); + tipc_subscrp_subscribe(net, s, subscriber, swap); } /* Handle one request to establish a new subscriber */ diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 3dce53ebea92..b5f1221f48d4 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1808,27 +1808,8 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, else if (sk->sk_shutdown & RCV_SHUTDOWN) err = 0; - if (copied > 0) { - /* We only do these additional bookkeeping/notification steps - * if we actually copied something out of the queue pair - * instead of just peeking ahead. - */ - - if (!(flags & MSG_PEEK)) { - /* If the other side has shutdown for sending and there - * is nothing more to read, then modify the socket - * state. - */ - if (vsk->peer_shutdown & SEND_SHUTDOWN) { - if (vsock_stream_has_data(vsk) <= 0) { - sk->sk_state = SS_UNCONNECTED; - sock_set_flag(sk, SOCK_DONE); - sk->sk_state_change(sk); - } - } - } + if (copied > 0) err = copied; - } out: release_sock(sk); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 56214736fe88..4120b7a538be 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -2051,7 +2051,7 @@ static u32 vmci_transport_get_local_cid(void) return vmci_get_context_id(); } -static struct vsock_transport vmci_transport = { +static const struct vsock_transport vmci_transport = { .init = vmci_transport_socket_init, .destruct = vmci_transport_destruct, .release = vmci_transport_release, diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index ff4a91fcab9f..637387bbaaea 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -99,6 +99,9 @@ static int xfrm_output_one(struct sk_buff *skb, int err) skb_dst_force(skb); + /* Inner headers are invalid now. */ + skb->encapsulation = 0; + err = x->type->output(x, skb); if (err == -EINPROGRESS) goto out; |
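The FLOWCTL_BLK_SZ and RCVBUF_* constants introduced in net/tipc/socket.h above back TIPC's switch from message-count based to byte-block based connection flow control: the receiver advertises a window derived from its receive buffer (divided by four to absorb truesize overhead, per the tsk_adv_blocks() comment), and every message is charged one block per started kilobyte. A small standalone sketch of that arithmetic, not part of the patch, with an assumed 66000-byte stream chunk as the example message:

#include <stdio.h>

#define FLOWCTL_BLK_SZ	1024
#define RCVBUF_MIN	(FLOWCTL_BLK_SZ * 512)
#define RCVBUF_DEF	(FLOWCTL_BLK_SZ * 1024 * 2)
#define RCVBUF_MAX	(FLOWCTL_BLK_SZ * 1024 * 16)

/* Advertisable window in blocks; /4 absorbs the truesize(len)/len ratio */
static unsigned int adv_blocks(unsigned int rcvbuf)
{
	return rcvbuf / FLOWCTL_BLK_SZ / 4;
}

/* Blocks charged for one sent or received message of msglen bytes */
static unsigned int blocks_for_msg(unsigned int msglen)
{
	return msglen / FLOWCTL_BLK_SZ + 1;
}

int main(void)
{
	unsigned int bufs[] = { RCVBUF_MIN, RCVBUF_DEF, RCVBUF_MAX };
	unsigned int snt_unacked = 0;
	unsigned int snd_win = adv_blocks(RCVBUF_MIN);	/* safe initial window */
	unsigned int i;

	for (i = 0; i < 3; i++)
		printf("rcvbuf %8u bytes -> window %5u blocks\n",
		       bufs[i], adv_blocks(bufs[i]));

	/* sender side: charge a 66000-byte message against the window */
	snt_unacked += blocks_for_msg(66000);
	printf("snt_unacked=%u snd_win=%u congested=%s\n",
	       snt_unacked, snd_win, snt_unacked >= snd_win ? "yes" : "no");
	return 0;
}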