From e0f802fbcaa3bffe4728e37a8fa1279b5d554173 Mon Sep 17 00:00:00 2001
From: Octavian Purdila
Date: Tue, 17 Jun 2014 11:25:37 +0300
Subject: tcp: move ir_mark initialization to tcp_openreq_init

ir_mark initialization is done for both TCP v4 and v6, move it in the
common tcp_openreq_init function.

Signed-off-by: Octavian Purdila <octavian.purdila@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 77cccda1ad0c..180336d47df6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1311,14 +1311,13 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		tcp_clear_options(&tmp_opt);
 
 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
-	tcp_openreq_init(req, &tmp_opt, skb);
+	tcp_openreq_init(req, &tmp_opt, skb, sk);
 
 	ireq = inet_rsk(req);
 	ireq->ir_loc_addr = daddr;
 	ireq->ir_rmt_addr = saddr;
 	ireq->no_srccheck = inet_sk(sk)->transparent;
 	ireq->opt = tcp_v4_save_options(skb);
-	ireq->ir_mark = inet_request_mark(sk, skb);
 
 	if (security_inet_conn_request(sk, skb, req))
 		goto drop_and_free;
-- 
cgit v1.2.3


From 7200135bc1e61f1437dc326ae2ef2f310c50b4eb Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Mon, 16 Jun 2014 13:01:52 +0200
Subject: netfilter: kill ulog targets

This has been marked as deprecated for quite some time and the NFLOG
target replacement has been also available since 2006.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter_bridge/Kbuild     |   1 -
 include/uapi/linux/netfilter_bridge/ebt_ulog.h |  38 --
 include/uapi/linux/netfilter_ipv4/Kbuild       |   1 -
 include/uapi/linux/netfilter_ipv4/ipt_ULOG.h   |  49 ---
 net/bridge/netfilter/Kconfig                   |  16 -
 net/bridge/netfilter/ebt_ulog.c                | 393 -------------------
 net/ipv4/netfilter/Kconfig                     |  19 -
 net/ipv4/netfilter/ipt_ULOG.c                  | 498 -------------------------
 8 files changed, 1015 deletions(-)
 delete mode 100644 include/uapi/linux/netfilter_bridge/ebt_ulog.h
 delete mode 100644 include/uapi/linux/netfilter_ipv4/ipt_ULOG.h
 delete mode 100644 net/bridge/netfilter/ebt_ulog.c
 delete mode 100644 net/ipv4/netfilter/ipt_ULOG.c

(limited to 'net/ipv4')

diff --git a/include/uapi/linux/netfilter_bridge/Kbuild b/include/uapi/linux/netfilter_bridge/Kbuild
index 348717c3a22f..0fbad8ef96de 100644
--- a/include/uapi/linux/netfilter_bridge/Kbuild
+++ b/include/uapi/linux/netfilter_bridge/Kbuild
@@ -14,6 +14,5 @@ header-y += ebt_nflog.h
 header-y += ebt_pkttype.h
 header-y += ebt_redirect.h
 header-y += ebt_stp.h
-header-y += ebt_ulog.h
 header-y += ebt_vlan.h
 header-y += ebtables.h
diff --git a/include/uapi/linux/netfilter_bridge/ebt_ulog.h b/include/uapi/linux/netfilter_bridge/ebt_ulog.h
deleted file mode 100644
index 89a6becb5269..000000000000
--- a/include/uapi/linux/netfilter_bridge/ebt_ulog.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef _EBT_ULOG_H
-#define _EBT_ULOG_H
-
-#include <linux/types.h>
-
-#define EBT_ULOG_DEFAULT_NLGROUP 0
-#define EBT_ULOG_DEFAULT_QTHRESHOLD 1
-#define EBT_ULOG_MAXNLGROUPS 32 /* hardcoded netlink max */
-#define EBT_ULOG_PREFIX_LEN 32
-#define EBT_ULOG_MAX_QLEN 50
-#define EBT_ULOG_WATCHER "ulog"
-#define EBT_ULOG_VERSION 1
-
-struct ebt_ulog_info {
-	__u32 nlgroup;
-	unsigned int cprange;
-	unsigned int qthreshold;
-	char prefix[EBT_ULOG_PREFIX_LEN];
-};
-
-typedef struct ebt_ulog_packet_msg {
-	int version;
-	char indev[IFNAMSIZ];
-	char outdev[IFNAMSIZ];
-	char physindev[IFNAMSIZ];
-	char physoutdev[IFNAMSIZ];
-	char prefix[EBT_ULOG_PREFIX_LEN];
-	struct timeval stamp;
-	unsigned long mark;
-	unsigned int hook;
-	size_t data_len;
-	/* The complete packet, including Ethernet header and perhaps
-	 * the VLAN header is appended */
-	unsigned char data[0] __attribute__
-	                      ((aligned (__alignof__(struct ebt_ulog_info))));
-} ebt_ulog_packet_msg_t;
-
-#endif /* _EBT_ULOG_H */
diff --git a/include/uapi/linux/netfilter_ipv4/Kbuild b/include/uapi/linux/netfilter_ipv4/Kbuild
index fb008437dde1..ecb291df390e 100644
--- a/include/uapi/linux/netfilter_ipv4/Kbuild
+++ b/include/uapi/linux/netfilter_ipv4/Kbuild
@@ -5,7 +5,6 @@ header-y += ipt_ECN.h
 header-y += ipt_LOG.h
 header-y += ipt_REJECT.h
 header-y += ipt_TTL.h
-header-y += ipt_ULOG.h
 header-y += ipt_ah.h
 header-y += ipt_ecn.h
 header-y += ipt_ttl.h
diff --git a/include/uapi/linux/netfilter_ipv4/ipt_ULOG.h b/include/uapi/linux/netfilter_ipv4/ipt_ULOG.h
deleted file mode 100644
index 417aad280bcc..000000000000
--- a/include/uapi/linux/netfilter_ipv4/ipt_ULOG.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Header file for IP tables userspace logging, Version 1.8
- *
- * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org>
- * 
- * Distributed under the terms of GNU GPL */
-
-#ifndef _IPT_ULOG_H
-#define _IPT_ULOG_H
-
-#ifndef NETLINK_NFLOG
-#define NETLINK_NFLOG 	5
-#endif
-
-#define ULOG_DEFAULT_NLGROUP	1
-#define ULOG_DEFAULT_QTHRESHOLD	1
-
-#define ULOG_MAC_LEN	80
-#define ULOG_PREFIX_LEN	32
-
-#define ULOG_MAX_QLEN	50
-/* Why 50? Well... there is a limit imposed by the slab cache 131000
- * bytes. So the multipart netlink-message has to be < 131000 bytes.
- * Assuming a standard ethernet-mtu of 1500, we could define this up
- * to 80... but even 50 seems to be big enough. */
-
-/* private data structure for each rule with a ULOG target */
-struct ipt_ulog_info {
-	unsigned int nl_group;
-	size_t copy_range;
-	size_t qthreshold;
-	char prefix[ULOG_PREFIX_LEN];
-};
-
-/* Format of the ULOG packets passed through netlink */
-typedef struct ulog_packet_msg {
-	unsigned long mark;
-	long timestamp_sec;
-	long timestamp_usec;
-	unsigned int hook;
-	char indev_name[IFNAMSIZ];
-	char outdev_name[IFNAMSIZ];
-	size_t data_len;
-	char prefix[ULOG_PREFIX_LEN];
-	unsigned char mac_len;
-	unsigned char mac[ULOG_MAC_LEN];
-	unsigned char payload[0];
-} ulog_packet_msg_t;
-
-#endif /*_IPT_ULOG_H*/
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index 629dc77874a9..3a76ac7b7141 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -202,22 +202,6 @@ config BRIDGE_EBT_LOG
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
-config BRIDGE_EBT_ULOG
-	tristate "ebt: ulog support (OBSOLETE)"
-	help
-	  This option enables the old bridge-specific "ebt_ulog" implementation
-	  which has been obsoleted by the new "nfnetlink_log" code (see
-	  CONFIG_NETFILTER_NETLINK_LOG).
-
-	  This option adds the ulog watcher, that you can use in any rule
-	  in any ebtables table. The packet is passed to a userspace
-	  logging daemon using netlink multicast sockets. This differs
-	  from the log watcher in the sense that the complete packet is
-	  sent to userspace instead of a descriptive text and that
-	  netlink multicast sockets are used instead of the syslog.
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 config BRIDGE_EBT_NFLOG
 	tristate "ebt: nflog support"
 	help
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
deleted file mode 100644
index 7c470c371e14..000000000000
--- a/net/bridge/netfilter/ebt_ulog.c
+++ /dev/null
@@ -1,393 +0,0 @@
-/*
- * netfilter module for userspace bridged Ethernet frames logging daemons
- *
- *	Authors:
- *	Bart De Schuymer <bdschuym@pandora.be>
- *	Harald Welte <laforge@netfilter.org>
- *
- *  November, 2004
- *
- * Based on ipt_ULOG.c, which is
- * (C) 2000-2002 by Harald Welte <laforge@netfilter.org>
- *
- * This module accepts two parameters:
- *
- * nlbufsiz:
- *   The parameter specifies how big the buffer for each netlink multicast
- * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will
- * get accumulated in the kernel until they are sent to userspace. It is
- * NOT possible to allocate more than 128kB, and it is strongly discouraged,
- * because atomically allocating 128kB inside the network rx softirq is not
- * reliable. Please also keep in mind that this buffer size is allocated for
- * each nlgroup you are using, so the total kernel memory usage increases
- * by that factor.
- *
- * flushtimeout:
- *   Specify, after how many hundredths of a second the queue should be
- *   flushed even if it is not full yet.
- *
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/socket.h>
-#include <linux/skbuff.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <net/netlink.h>
-#include <linux/netdevice.h>
-#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_ulog.h>
-#include <net/netfilter/nf_log.h>
-#include <net/netns/generic.h>
-#include <net/sock.h>
-#include "../br_private.h"
-
-static unsigned int nlbufsiz = NLMSG_GOODSIZE;
-module_param(nlbufsiz, uint, 0600);
-MODULE_PARM_DESC(nlbufsiz, "netlink buffer size (number of bytes) "
-			   "(defaults to 4096)");
-
-static unsigned int flushtimeout = 10;
-module_param(flushtimeout, uint, 0600);
-MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths ofa second) "
-			       "(defaults to 10)");
-
-typedef struct {
-	unsigned int qlen;		/* number of nlmsgs' in the skb */
-	struct nlmsghdr *lastnlh;	/* netlink header of last msg in skb */
-	struct sk_buff *skb;		/* the pre-allocated skb */
-	struct timer_list timer;	/* the timer function */
-	spinlock_t lock;		/* the per-queue lock */
-} ebt_ulog_buff_t;
-
-static int ebt_ulog_net_id __read_mostly;
-struct ebt_ulog_net {
-	unsigned int nlgroup[EBT_ULOG_MAXNLGROUPS];
-	ebt_ulog_buff_t ulog_buffers[EBT_ULOG_MAXNLGROUPS];
-	struct sock *ebtulognl;
-};
-
-static struct ebt_ulog_net *ebt_ulog_pernet(struct net *net)
-{
-	return net_generic(net, ebt_ulog_net_id);
-}
-
-/* send one ulog_buff_t to userspace */
-static void ulog_send(struct ebt_ulog_net *ebt, unsigned int nlgroup)
-{
-	ebt_ulog_buff_t *ub = &ebt->ulog_buffers[nlgroup];
-
-	del_timer(&ub->timer);
-
-	if (!ub->skb)
-		return;
-
-	/* last nlmsg needs NLMSG_DONE */
-	if (ub->qlen > 1)
-		ub->lastnlh->nlmsg_type = NLMSG_DONE;
-
-	NETLINK_CB(ub->skb).dst_group = nlgroup + 1;
-	netlink_broadcast(ebt->ebtulognl, ub->skb, 0, nlgroup + 1, GFP_ATOMIC);
-
-	ub->qlen = 0;
-	ub->skb = NULL;
-}
-
-/* timer function to flush queue in flushtimeout time */
-static void ulog_timer(unsigned long data)
-{
-	struct ebt_ulog_net *ebt = container_of((void *)data,
-						struct ebt_ulog_net,
-						nlgroup[*(unsigned int *)data]);
-
-	ebt_ulog_buff_t *ub = &ebt->ulog_buffers[*(unsigned int *)data];
-	spin_lock_bh(&ub->lock);
-	if (ub->skb)
-		ulog_send(ebt, *(unsigned int *)data);
-	spin_unlock_bh(&ub->lock);
-}
-
-static struct sk_buff *ulog_alloc_skb(unsigned int size)
-{
-	struct sk_buff *skb;
-	unsigned int n;
-
-	n = max(size, nlbufsiz);
-	skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN);
-	if (!skb) {
-		if (n > size) {
-			/* try to allocate only as much as we need for
-			 * current packet */
-			skb = alloc_skb(size, GFP_ATOMIC);
-			if (!skb)
-				pr_debug("cannot even allocate buffer of size %ub\n",
-					 size);
-		}
-	}
-
-	return skb;
-}
-
-static void ebt_ulog_packet(struct net *net, unsigned int hooknr,
-			    const struct sk_buff *skb,
-			    const struct net_device *in,
-			    const struct net_device *out,
-			    const struct ebt_ulog_info *uloginfo,
-			    const char *prefix)
-{
-	ebt_ulog_packet_msg_t *pm;
-	size_t size, copy_len;
-	struct nlmsghdr *nlh;
-	struct ebt_ulog_net *ebt = ebt_ulog_pernet(net);
-	unsigned int group = uloginfo->nlgroup;
-	ebt_ulog_buff_t *ub = &ebt->ulog_buffers[group];
-	spinlock_t *lock = &ub->lock;
-	ktime_t kt;
-
-	if ((uloginfo->cprange == 0) ||
-	    (uloginfo->cprange > skb->len + ETH_HLEN))
-		copy_len = skb->len + ETH_HLEN;
-	else
-		copy_len = uloginfo->cprange;
-
-	size = nlmsg_total_size(sizeof(*pm) + copy_len);
-	if (size > nlbufsiz) {
-		pr_debug("Size %Zd needed, but nlbufsiz=%d\n", size, nlbufsiz);
-		return;
-	}
-
-	spin_lock_bh(lock);
-
-	if (!ub->skb) {
-		if (!(ub->skb = ulog_alloc_skb(size)))
-			goto unlock;
-	} else if (size > skb_tailroom(ub->skb)) {
-		ulog_send(ebt, group);
-
-		if (!(ub->skb = ulog_alloc_skb(size)))
-			goto unlock;
-	}
-
-	nlh = nlmsg_put(ub->skb, 0, ub->qlen, 0,
-			size - NLMSG_ALIGN(sizeof(*nlh)), 0);
-	if (!nlh) {
-		kfree_skb(ub->skb);
-		ub->skb = NULL;
-		goto unlock;
-	}
-	ub->qlen++;
-
-	pm = nlmsg_data(nlh);
-	memset(pm, 0, sizeof(*pm));
-
-	/* Fill in the ulog data */
-	pm->version = EBT_ULOG_VERSION;
-	kt = ktime_get_real();
-	pm->stamp = ktime_to_timeval(kt);
-	if (ub->qlen == 1)
-		ub->skb->tstamp = kt;
-	pm->data_len = copy_len;
-	pm->mark = skb->mark;
-	pm->hook = hooknr;
-	if (uloginfo->prefix != NULL)
-		strcpy(pm->prefix, uloginfo->prefix);
-
-	if (in) {
-		strcpy(pm->physindev, in->name);
-		/* If in isn't a bridge, then physindev==indev */
-		if (br_port_exists(in))
-			/* rcu_read_lock()ed by nf_hook_slow */
-			strcpy(pm->indev, br_port_get_rcu(in)->br->dev->name);
-		else
-			strcpy(pm->indev, in->name);
-	}
-
-	if (out) {
-		/* If out exists, then out is a bridge port */
-		strcpy(pm->physoutdev, out->name);
-		/* rcu_read_lock()ed by nf_hook_slow */
-		strcpy(pm->outdev, br_port_get_rcu(out)->br->dev->name);
-	}
-
-	if (skb_copy_bits(skb, -ETH_HLEN, pm->data, copy_len) < 0)
-		BUG();
-
-	if (ub->qlen > 1)
-		ub->lastnlh->nlmsg_flags |= NLM_F_MULTI;
-
-	ub->lastnlh = nlh;
-
-	if (ub->qlen >= uloginfo->qthreshold)
-		ulog_send(ebt, group);
-	else if (!timer_pending(&ub->timer)) {
-		ub->timer.expires = jiffies + flushtimeout * HZ / 100;
-		add_timer(&ub->timer);
-	}
-
-unlock:
-	spin_unlock_bh(lock);
-}
-
-/* this function is registered with the netfilter core */
-static void ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
-   const struct sk_buff *skb, const struct net_device *in,
-   const struct net_device *out, const struct nf_loginfo *li,
-   const char *prefix)
-{
-	struct ebt_ulog_info loginfo;
-
-	if (!li || li->type != NF_LOG_TYPE_ULOG) {
-		loginfo.nlgroup = EBT_ULOG_DEFAULT_NLGROUP;
-		loginfo.cprange = 0;
-		loginfo.qthreshold = EBT_ULOG_DEFAULT_QTHRESHOLD;
-		loginfo.prefix[0] = '\0';
-	} else {
-		loginfo.nlgroup = li->u.ulog.group;
-		loginfo.cprange = li->u.ulog.copy_len;
-		loginfo.qthreshold = li->u.ulog.qthreshold;
-		strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
-	}
-
-	ebt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix);
-}
-
-static unsigned int
-ebt_ulog_tg(struct sk_buff *skb, const struct xt_action_param *par)
-{
-	struct net *net = dev_net(par->in ? par->in : par->out);
-
-	ebt_ulog_packet(net, par->hooknum, skb, par->in, par->out,
-	                par->targinfo, NULL);
-	return EBT_CONTINUE;
-}
-
-static int ebt_ulog_tg_check(const struct xt_tgchk_param *par)
-{
-	struct ebt_ulog_info *uloginfo = par->targinfo;
-
-	if (!par->net->xt.ebt_ulog_warn_deprecated) {
-		pr_info("ebt_ulog is deprecated and it will be removed soon, "
-			"use ebt_nflog instead\n");
-		par->net->xt.ebt_ulog_warn_deprecated = true;
-	}
-
-	if (uloginfo->nlgroup > 31)
-		return -EINVAL;
-
-	uloginfo->prefix[EBT_ULOG_PREFIX_LEN - 1] = '\0';
-
-	if (uloginfo->qthreshold > EBT_ULOG_MAX_QLEN)
-		uloginfo->qthreshold = EBT_ULOG_MAX_QLEN;
-
-	return 0;
-}
-
-static struct xt_target ebt_ulog_tg_reg __read_mostly = {
-	.name		= "ulog",
-	.revision	= 0,
-	.family		= NFPROTO_BRIDGE,
-	.target		= ebt_ulog_tg,
-	.checkentry	= ebt_ulog_tg_check,
-	.targetsize	= sizeof(struct ebt_ulog_info),
-	.me		= THIS_MODULE,
-};
-
-static struct nf_logger ebt_ulog_logger __read_mostly = {
-	.name		= "ebt_ulog",
-	.logfn		= &ebt_log_packet,
-	.me		= THIS_MODULE,
-};
-
-static int __net_init ebt_ulog_net_init(struct net *net)
-{
-	int i;
-	struct ebt_ulog_net *ebt = ebt_ulog_pernet(net);
-
-	struct netlink_kernel_cfg cfg = {
-		.groups	= EBT_ULOG_MAXNLGROUPS,
-	};
-
-	/* initialize ulog_buffers */
-	for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) {
-		ebt->nlgroup[i] = i;
-		setup_timer(&ebt->ulog_buffers[i].timer, ulog_timer,
-			    (unsigned long)&ebt->nlgroup[i]);
-		spin_lock_init(&ebt->ulog_buffers[i].lock);
-	}
-
-	ebt->ebtulognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg);
-	if (!ebt->ebtulognl)
-		return -ENOMEM;
-
-	nf_log_set(net, NFPROTO_BRIDGE, &ebt_ulog_logger);
-	return 0;
-}
-
-static void __net_exit ebt_ulog_net_fini(struct net *net)
-{
-	int i;
-	struct ebt_ulog_net *ebt = ebt_ulog_pernet(net);
-
-	nf_log_unset(net, &ebt_ulog_logger);
-	for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) {
-		ebt_ulog_buff_t *ub = &ebt->ulog_buffers[i];
-		del_timer(&ub->timer);
-
-		if (ub->skb) {
-			kfree_skb(ub->skb);
-			ub->skb = NULL;
-		}
-	}
-	netlink_kernel_release(ebt->ebtulognl);
-}
-
-static struct pernet_operations ebt_ulog_net_ops = {
-	.init = ebt_ulog_net_init,
-	.exit = ebt_ulog_net_fini,
-	.id   = &ebt_ulog_net_id,
-	.size = sizeof(struct ebt_ulog_net),
-};
-
-static int __init ebt_ulog_init(void)
-{
-	int ret;
-
-	if (nlbufsiz >= 128*1024) {
-		pr_warn("Netlink buffer has to be <= 128kB,"
-			"please try a smaller nlbufsiz parameter.\n");
-		return -EINVAL;
-	}
-
-	ret = register_pernet_subsys(&ebt_ulog_net_ops);
-	if (ret)
-		goto out_pernet;
-
-	ret = xt_register_target(&ebt_ulog_tg_reg);
-	if (ret)
-		goto out_target;
-
-	nf_log_register(NFPROTO_BRIDGE, &ebt_ulog_logger);
-
-	return 0;
-
-out_target:
-	unregister_pernet_subsys(&ebt_ulog_net_ops);
-out_pernet:
-	return ret;
-}
-
-static void __exit ebt_ulog_fini(void)
-{
-	nf_log_unregister(&ebt_ulog_logger);
-	xt_unregister_target(&ebt_ulog_tg_reg);
-	unregister_pernet_subsys(&ebt_ulog_net_ops);
-}
-
-module_init(ebt_ulog_init);
-module_exit(ebt_ulog_fini);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
-MODULE_DESCRIPTION("Ebtables: Packet logging to netlink using ULOG");
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index a26ce035e3fa..730faacbc5f8 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -159,25 +159,6 @@ config IP_NF_TARGET_SYNPROXY
 
 	  To compile it as a module, choose M here. If unsure, say N.
 
-config IP_NF_TARGET_ULOG
-	tristate "ULOG target support (obsolete)"
-	default m if NETFILTER_ADVANCED=n
-	---help---
-
-	  This option enables the old IPv4-only "ipt_ULOG" implementation
-	  which has been obsoleted by the new "nfnetlink_log" code (see
-	  CONFIG_NETFILTER_NETLINK_LOG).
-
-	  This option adds a `ULOG' target, which allows you to create rules in
-	  any iptables table. The packet is passed to a userspace logging
-	  daemon using netlink multicast sockets; unlike the LOG target
-	  which can only be viewed through syslog.
-
-	  The appropriate userspace logging daemon (ulogd) may be obtained from
-	  <http://www.netfilter.org/projects/ulogd/index.html>
-
-	  To compile it as a module, choose M here.  If unsure, say N.
-
 # NAT + specific targets: nf_conntrack
 config NF_NAT_IPV4
 	tristate "IPv4 NAT"
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
deleted file mode 100644
index 9cb993cd224b..000000000000
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ /dev/null
@@ -1,498 +0,0 @@
-/*
- * netfilter module for userspace packet logging daemons
- *
- * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
- * (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2005-2007 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This module accepts two parameters:
- *
- * nlbufsiz:
- *   The parameter specifies how big the buffer for each netlink multicast
- * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will
- * get accumulated in the kernel until they are sent to userspace. It is
- * NOT possible to allocate more than 128kB, and it is strongly discouraged,
- * because atomically allocating 128kB inside the network rx softirq is not
- * reliable. Please also keep in mind that this buffer size is allocated for
- * each nlgroup you are using, so the total kernel memory usage increases
- * by that factor.
- *
- * Actually you should use nlbufsiz a bit smaller than PAGE_SIZE, since
- * nlbufsiz is used with alloc_skb, which adds another
- * sizeof(struct skb_shared_info).  Use NLMSG_GOODSIZE instead.
- *
- * flushtimeout:
- *   Specify, after how many hundredths of a second the queue should be
- *   flushed even if it is not full yet.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/socket.h>
-#include <linux/slab.h>
-#include <linux/skbuff.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <net/netlink.h>
-#include <linux/netdevice.h>
-#include <linux/mm.h>
-#include <linux/moduleparam.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter_ipv4/ipt_ULOG.h>
-#include <net/netfilter/nf_log.h>
-#include <net/netns/generic.h>
-#include <net/sock.h>
-#include <linux/bitops.h>
-#include <asm/unaligned.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
-MODULE_DESCRIPTION("Xtables: packet logging to netlink using ULOG");
-MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG);
-
-#define ULOG_NL_EVENT		111		/* Harald's favorite number */
-#define ULOG_MAXNLGROUPS	32		/* numer of nlgroups */
-
-static unsigned int nlbufsiz = NLMSG_GOODSIZE;
-module_param(nlbufsiz, uint, 0400);
-MODULE_PARM_DESC(nlbufsiz, "netlink buffer size");
-
-static unsigned int flushtimeout = 10;
-module_param(flushtimeout, uint, 0600);
-MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)");
-
-static bool nflog = true;
-module_param(nflog, bool, 0400);
-MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
-
-/* global data structures */
-
-typedef struct {
-	unsigned int qlen;		/* number of nlmsgs' in the skb */
-	struct nlmsghdr *lastnlh;	/* netlink header of last msg in skb */
-	struct sk_buff *skb;		/* the pre-allocated skb */
-	struct timer_list timer;	/* the timer function */
-} ulog_buff_t;
-
-static int ulog_net_id __read_mostly;
-struct ulog_net {
-	unsigned int nlgroup[ULOG_MAXNLGROUPS];
-	ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS];
-	struct sock *nflognl;
-	spinlock_t lock;
-};
-
-static struct ulog_net *ulog_pernet(struct net *net)
-{
-	return net_generic(net, ulog_net_id);
-}
-
-/* send one ulog_buff_t to userspace */
-static void ulog_send(struct ulog_net *ulog, unsigned int nlgroupnum)
-{
-	ulog_buff_t *ub = &ulog->ulog_buffers[nlgroupnum];
-
-	pr_debug("ulog_send: timer is deleting\n");
-	del_timer(&ub->timer);
-
-	if (!ub->skb) {
-		pr_debug("ulog_send: nothing to send\n");
-		return;
-	}
-
-	/* last nlmsg needs NLMSG_DONE */
-	if (ub->qlen > 1)
-		ub->lastnlh->nlmsg_type = NLMSG_DONE;
-
-	NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1;
-	pr_debug("throwing %d packets to netlink group %u\n",
-		 ub->qlen, nlgroupnum + 1);
-	netlink_broadcast(ulog->nflognl, ub->skb, 0, nlgroupnum + 1,
-			  GFP_ATOMIC);
-
-	ub->qlen = 0;
-	ub->skb = NULL;
-	ub->lastnlh = NULL;
-}
-
-
-/* timer function to flush queue in flushtimeout time */
-static void ulog_timer(unsigned long data)
-{
-	unsigned int groupnum = *((unsigned int *)data);
-	struct ulog_net *ulog = container_of((void *)data,
-					     struct ulog_net,
-					     nlgroup[groupnum]);
-	pr_debug("timer function called, calling ulog_send\n");
-
-	/* lock to protect against somebody modifying our structure
-	 * from ipt_ulog_target at the same time */
-	spin_lock_bh(&ulog->lock);
-	ulog_send(ulog, groupnum);
-	spin_unlock_bh(&ulog->lock);
-}
-
-static struct sk_buff *ulog_alloc_skb(unsigned int size)
-{
-	struct sk_buff *skb;
-	unsigned int n;
-
-	/* alloc skb which should be big enough for a whole
-	 * multipart message. WARNING: has to be <= 131000
-	 * due to slab allocator restrictions */
-
-	n = max(size, nlbufsiz);
-	skb = alloc_skb(n, GFP_ATOMIC | __GFP_NOWARN);
-	if (!skb) {
-		if (n > size) {
-			/* try to allocate only as much as we need for
-			 * current packet */
-
-			skb = alloc_skb(size, GFP_ATOMIC);
-			if (!skb)
-				pr_debug("cannot even allocate %ub\n", size);
-		}
-	}
-
-	return skb;
-}
-
-static void ipt_ulog_packet(struct net *net,
-			    unsigned int hooknum,
-			    const struct sk_buff *skb,
-			    const struct net_device *in,
-			    const struct net_device *out,
-			    const struct ipt_ulog_info *loginfo,
-			    const char *prefix)
-{
-	ulog_buff_t *ub;
-	ulog_packet_msg_t *pm;
-	size_t size, copy_len;
-	struct nlmsghdr *nlh;
-	struct timeval tv;
-	struct ulog_net *ulog = ulog_pernet(net);
-
-	/* ffs == find first bit set, necessary because userspace
-	 * is already shifting groupnumber, but we need unshifted.
-	 * ffs() returns [1..32], we need [0..31] */
-	unsigned int groupnum = ffs(loginfo->nl_group) - 1;
-
-	/* calculate the size of the skb needed */
-	if (loginfo->copy_range == 0 || loginfo->copy_range > skb->len)
-		copy_len = skb->len;
-	else
-		copy_len = loginfo->copy_range;
-
-	size = nlmsg_total_size(sizeof(*pm) + copy_len);
-
-	ub = &ulog->ulog_buffers[groupnum];
-
-	spin_lock_bh(&ulog->lock);
-
-	if (!ub->skb) {
-		if (!(ub->skb = ulog_alloc_skb(size)))
-			goto alloc_failure;
-	} else if (ub->qlen >= loginfo->qthreshold ||
-		   size > skb_tailroom(ub->skb)) {
-		/* either the queue len is too high or we don't have
-		 * enough room in nlskb left. send it to userspace. */
-
-		ulog_send(ulog, groupnum);
-
-		if (!(ub->skb = ulog_alloc_skb(size)))
-			goto alloc_failure;
-	}
-
-	pr_debug("qlen %d, qthreshold %Zu\n", ub->qlen, loginfo->qthreshold);
-
-	nlh = nlmsg_put(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
-			sizeof(*pm)+copy_len, 0);
-	if (!nlh) {
-		pr_debug("error during nlmsg_put\n");
-		goto out_unlock;
-	}
-	ub->qlen++;
-
-	pm = nlmsg_data(nlh);
-	memset(pm, 0, sizeof(*pm));
-
-	/* We might not have a timestamp, get one */
-	if (skb->tstamp.tv64 == 0)
-		__net_timestamp((struct sk_buff *)skb);
-
-	/* copy hook, prefix, timestamp, payload, etc. */
-	pm->data_len = copy_len;
-	tv = ktime_to_timeval(skb->tstamp);
-	put_unaligned(tv.tv_sec, &pm->timestamp_sec);
-	put_unaligned(tv.tv_usec, &pm->timestamp_usec);
-	put_unaligned(skb->mark, &pm->mark);
-	pm->hook = hooknum;
-	if (prefix != NULL) {
-		strncpy(pm->prefix, prefix, sizeof(pm->prefix) - 1);
-		pm->prefix[sizeof(pm->prefix) - 1] = '\0';
-	}
-	else if (loginfo->prefix[0] != '\0')
-		strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix));
-
-	if (in && in->hard_header_len > 0 &&
-	    skb->mac_header != skb->network_header &&
-	    in->hard_header_len <= ULOG_MAC_LEN) {
-		memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len);
-		pm->mac_len = in->hard_header_len;
-	} else
-		pm->mac_len = 0;
-
-	if (in)
-		strncpy(pm->indev_name, in->name, sizeof(pm->indev_name));
-
-	if (out)
-		strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
-
-	/* copy_len <= skb->len, so can't fail. */
-	if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0)
-		BUG();
-
-	/* check if we are building multi-part messages */
-	if (ub->qlen > 1)
-		ub->lastnlh->nlmsg_flags |= NLM_F_MULTI;
-
-	ub->lastnlh = nlh;
-
-	/* if timer isn't already running, start it */
-	if (!timer_pending(&ub->timer)) {
-		ub->timer.expires = jiffies + flushtimeout * HZ / 100;
-		add_timer(&ub->timer);
-	}
-
-	/* if threshold is reached, send message to userspace */
-	if (ub->qlen >= loginfo->qthreshold) {
-		if (loginfo->qthreshold > 1)
-			nlh->nlmsg_type = NLMSG_DONE;
-		ulog_send(ulog, groupnum);
-	}
-out_unlock:
-	spin_unlock_bh(&ulog->lock);
-
-	return;
-
-alloc_failure:
-	pr_debug("Error building netlink message\n");
-	spin_unlock_bh(&ulog->lock);
-}
-
-static unsigned int
-ulog_tg(struct sk_buff *skb, const struct xt_action_param *par)
-{
-	struct net *net = dev_net(par->in ? par->in : par->out);
-
-	ipt_ulog_packet(net, par->hooknum, skb, par->in, par->out,
-	                par->targinfo, NULL);
-	return XT_CONTINUE;
-}
-
-static void ipt_logfn(struct net *net,
-		      u_int8_t pf,
-		      unsigned int hooknum,
-		      const struct sk_buff *skb,
-		      const struct net_device *in,
-		      const struct net_device *out,
-		      const struct nf_loginfo *li,
-		      const char *prefix)
-{
-	struct ipt_ulog_info loginfo;
-
-	if (!li || li->type != NF_LOG_TYPE_ULOG) {
-		loginfo.nl_group = ULOG_DEFAULT_NLGROUP;
-		loginfo.copy_range = 0;
-		loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD;
-		loginfo.prefix[0] = '\0';
-	} else {
-		loginfo.nl_group = li->u.ulog.group;
-		loginfo.copy_range = li->u.ulog.copy_len;
-		loginfo.qthreshold = li->u.ulog.qthreshold;
-		strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
-	}
-
-	ipt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix);
-}
-
-static int ulog_tg_check(const struct xt_tgchk_param *par)
-{
-	const struct ipt_ulog_info *loginfo = par->targinfo;
-
-	if (!par->net->xt.ulog_warn_deprecated) {
-		pr_info("ULOG is deprecated and it will be removed soon, "
-			"use NFLOG instead\n");
-		par->net->xt.ulog_warn_deprecated = true;
-	}
-
-	if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
-		pr_debug("prefix not null-terminated\n");
-		return -EINVAL;
-	}
-	if (loginfo->qthreshold > ULOG_MAX_QLEN) {
-		pr_debug("queue threshold %Zu > MAX_QLEN\n",
-			 loginfo->qthreshold);
-		return -EINVAL;
-	}
-	return 0;
-}
-
-#ifdef CONFIG_COMPAT
-struct compat_ipt_ulog_info {
-	compat_uint_t	nl_group;
-	compat_size_t	copy_range;
-	compat_size_t	qthreshold;
-	char		prefix[ULOG_PREFIX_LEN];
-};
-
-static void ulog_tg_compat_from_user(void *dst, const void *src)
-{
-	const struct compat_ipt_ulog_info *cl = src;
-	struct ipt_ulog_info l = {
-		.nl_group	= cl->nl_group,
-		.copy_range	= cl->copy_range,
-		.qthreshold	= cl->qthreshold,
-	};
-
-	memcpy(l.prefix, cl->prefix, sizeof(l.prefix));
-	memcpy(dst, &l, sizeof(l));
-}
-
-static int ulog_tg_compat_to_user(void __user *dst, const void *src)
-{
-	const struct ipt_ulog_info *l = src;
-	struct compat_ipt_ulog_info cl = {
-		.nl_group	= l->nl_group,
-		.copy_range	= l->copy_range,
-		.qthreshold	= l->qthreshold,
-	};
-
-	memcpy(cl.prefix, l->prefix, sizeof(cl.prefix));
-	return copy_to_user(dst, &cl, sizeof(cl)) ? -EFAULT : 0;
-}
-#endif /* CONFIG_COMPAT */
-
-static struct xt_target ulog_tg_reg __read_mostly = {
-	.name		= "ULOG",
-	.family		= NFPROTO_IPV4,
-	.target		= ulog_tg,
-	.targetsize	= sizeof(struct ipt_ulog_info),
-	.checkentry	= ulog_tg_check,
-#ifdef CONFIG_COMPAT
-	.compatsize	= sizeof(struct compat_ipt_ulog_info),
-	.compat_from_user = ulog_tg_compat_from_user,
-	.compat_to_user	= ulog_tg_compat_to_user,
-#endif
-	.me		= THIS_MODULE,
-};
-
-static struct nf_logger ipt_ulog_logger __read_mostly = {
-	.name		= "ipt_ULOG",
-	.logfn		= ipt_logfn,
-	.me		= THIS_MODULE,
-};
-
-static int __net_init ulog_tg_net_init(struct net *net)
-{
-	int i;
-	struct ulog_net *ulog = ulog_pernet(net);
-	struct netlink_kernel_cfg cfg = {
-		.groups	= ULOG_MAXNLGROUPS,
-	};
-
-	spin_lock_init(&ulog->lock);
-	/* initialize ulog_buffers */
-	for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
-		ulog->nlgroup[i] = i;
-		setup_timer(&ulog->ulog_buffers[i].timer, ulog_timer,
-			    (unsigned long)&ulog->nlgroup[i]);
-	}
-
-	ulog->nflognl = netlink_kernel_create(net, NETLINK_NFLOG, &cfg);
-	if (!ulog->nflognl)
-		return -ENOMEM;
-
-	if (nflog)
-		nf_log_set(net, NFPROTO_IPV4, &ipt_ulog_logger);
-
-	return 0;
-}
-
-static void __net_exit ulog_tg_net_exit(struct net *net)
-{
-	ulog_buff_t *ub;
-	int i;
-	struct ulog_net *ulog = ulog_pernet(net);
-
-	if (nflog)
-		nf_log_unset(net, &ipt_ulog_logger);
-
-	netlink_kernel_release(ulog->nflognl);
-
-	/* remove pending timers and free allocated skb's */
-	for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
-		ub = &ulog->ulog_buffers[i];
-		pr_debug("timer is deleting\n");
-		del_timer(&ub->timer);
-
-		if (ub->skb) {
-			kfree_skb(ub->skb);
-			ub->skb = NULL;
-		}
-	}
-}
-
-static struct pernet_operations ulog_tg_net_ops = {
-	.init = ulog_tg_net_init,
-	.exit = ulog_tg_net_exit,
-	.id   = &ulog_net_id,
-	.size = sizeof(struct ulog_net),
-};
-
-static int __init ulog_tg_init(void)
-{
-	int ret;
-	pr_debug("init module\n");
-
-	if (nlbufsiz > 128*1024) {
-		pr_warn("Netlink buffer has to be <= 128kB\n");
-		return -EINVAL;
-	}
-
-	ret = register_pernet_subsys(&ulog_tg_net_ops);
-	if (ret)
-		goto out_pernet;
-
-	ret = xt_register_target(&ulog_tg_reg);
-	if (ret < 0)
-		goto out_target;
-
-	if (nflog)
-		nf_log_register(NFPROTO_IPV4, &ipt_ulog_logger);
-
-	return 0;
-
-out_target:
-	unregister_pernet_subsys(&ulog_tg_net_ops);
-out_pernet:
-	return ret;
-}
-
-static void __exit ulog_tg_exit(void)
-{
-	pr_debug("cleanup_module\n");
-	if (nflog)
-		nf_log_unregister(&ipt_ulog_logger);
-	xt_unregister_target(&ulog_tg_reg);
-	unregister_pernet_subsys(&ulog_tg_net_ops);
-}
-
-module_init(ulog_tg_init);
-module_exit(ulog_tg_exit);
-- 
cgit v1.2.3


From 1990e4f883a6383b04ef37f32bdaa02dc0dbae8d Mon Sep 17 00:00:00 2001
From: Mathias Krause
Date: Fri, 9 May 2014 23:43:42 +0200
Subject: vti: Simplify error handling in module init and exit

The error handling in the module init and exit functions can be
shortened to safe us some code.

1/ Remove the code duplications in the init function, jump straight to
the existing cleanup code by adding some labels. Also give the error
message some more value by telling the reason why loading the module has
failed. Furthermore fix the "IPSec" typo -- it should be "IPsec" instead.

2/ Remove the error handling in the exit function as the only legitimate
reason xfrm4_protocol_deregister() might fail is inet_del_protocol()
returning -1. That, in turn, means some other protocol handler had been
registered for this very protocol in the meantime. But that essentially
means we haven't been handling that protocol any more, anyway. What it
definitely means not is that we "can't deregister tunnel". Therefore
just get rid of that bogus warning. It's plain wrong.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/ip_vti.c | 54 +++++++++++++++++++++---------------------------------
 1 file changed, 21 insertions(+), 33 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index b8960f3527f3..e453cb724a95 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -534,40 +534,28 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = {
 
 static int __init vti_init(void)
 {
+	const char *msg;
 	int err;
 
-	pr_info("IPv4 over IPSec tunneling driver\n");
+	pr_info("IPv4 over IPsec tunneling driver\n");
 
+	msg = "tunnel device";
 	err = register_pernet_device(&vti_net_ops);
 	if (err < 0)
-		return err;
-	err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
-	if (err < 0) {
-		unregister_pernet_device(&vti_net_ops);
-		pr_info("vti init: can't register tunnel\n");
-
-		return err;
-	}
+		goto pernet_dev_failed;
 
+	msg = "tunnel protocols";
+	err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
+	if (err < 0)
+		goto xfrm_proto_esp_failed;
 	err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH);
-	if (err < 0) {
-		xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
-		unregister_pernet_device(&vti_net_ops);
-		pr_info("vti init: can't register tunnel\n");
-
-		return err;
-	}
-
+	if (err < 0)
+		goto xfrm_proto_ah_failed;
 	err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP);
-	if (err < 0) {
-		xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
-		xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
-		unregister_pernet_device(&vti_net_ops);
-		pr_info("vti init: can't register tunnel\n");
-
-		return err;
-	}
+	if (err < 0)
+		goto xfrm_proto_comp_failed;
 
+	msg = "netlink interface";
 	err = rtnl_link_register(&vti_link_ops);
 	if (err < 0)
 		goto rtnl_link_failed;
@@ -576,23 +564,23 @@ static int __init vti_init(void)
 
 rtnl_link_failed:
 	xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+xfrm_proto_comp_failed:
 	xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+xfrm_proto_ah_failed:
 	xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
+xfrm_proto_esp_failed:
 	unregister_pernet_device(&vti_net_ops);
+pernet_dev_failed:
+	pr_err("vti init: failed to register %s\n", msg);
 	return err;
 }
 
 static void __exit vti_fini(void)
 {
 	rtnl_link_unregister(&vti_link_ops);
-	if (xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP))
-		pr_info("vti close: can't deregister tunnel\n");
-	if (xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH))
-		pr_info("vti close: can't deregister tunnel\n");
-	if (xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP))
-		pr_info("vti close: can't deregister tunnel\n");
-
-
+	xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
+	xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
+	xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
 	unregister_pernet_device(&vti_net_ops);
 }
 
-- 
cgit v1.2.3


From 83e96d443b372611adf19e4171d41deb1d8760cf Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Thu, 19 Jun 2014 20:47:14 +0200
Subject: netfilter: log: split family specific code to
 nf_log_{ip,ip6,common}.c files

The plain text logging is currently embedded into the xt_LOG target.
In order to be able to use the plain text logging from nft_log, as a
first step, this patch moves the family specific code to the following
files and Kconfig symbols:

1) net/ipv4/netfilter/nf_log_ip.c: CONFIG_NF_LOG_IPV4
2) net/ipv6/netfilter/nf_log_ip6.c: CONFIG_NF_LOG_IPV6
3) net/netfilter/nf_log_common.c: CONFIG_NF_LOG_COMMON

These new modules will be required by xt_LOG and nft_log. This patch
is based on original patch from Arturo Borrero Gonzalez.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_log.h   |  28 ++
 net/ipv4/netfilter/Kconfig       |   5 +
 net/ipv4/netfilter/Makefile      |   3 +
 net/ipv4/netfilter/nf_log_ipv4.c | 385 +++++++++++++++++
 net/ipv6/netfilter/Kconfig       |   5 +
 net/ipv6/netfilter/Makefile      |   3 +
 net/ipv6/netfilter/nf_log_ipv6.c | 417 +++++++++++++++++++
 net/netfilter/Kconfig            |   6 +
 net/netfilter/Makefile           |   3 +
 net/netfilter/nf_log_common.c    | 187 +++++++++
 net/netfilter/xt_LOG.c           | 879 +--------------------------------------
 11 files changed, 1047 insertions(+), 874 deletions(-)
 create mode 100644 net/ipv4/netfilter/nf_log_ipv4.c
 create mode 100644 net/ipv6/netfilter/nf_log_ipv6.c
 create mode 100644 net/netfilter/nf_log_common.c

(limited to 'net/ipv4')

diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h
index aaec845de106..bba354e78f49 100644
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -78,4 +78,32 @@ struct nf_log_buf *nf_log_buf_open(void);
 __printf(2, 3) int nf_log_buf_add(struct nf_log_buf *m, const char *f, ...);
 void nf_log_buf_close(struct nf_log_buf *m);
 
+void nf_log_ip_packet(struct net *net, u_int8_t pf,
+		      unsigned int hooknum, const struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      const struct nf_loginfo *loginfo,
+		      const char *prefix);
+
+void nf_log_ip6_packet(struct net *net, u_int8_t pf,
+		       unsigned int hooknum, const struct sk_buff *skb,
+		       const struct net_device *in,
+		       const struct net_device *out,
+		       const struct nf_loginfo *loginfo,
+		       const char *prefix);
+
+/* common logging functions */
+int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb,
+			   u8 proto, int fragment, unsigned int offset);
+int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb,
+			   u8 proto, int fragment, unsigned int offset,
+			   unsigned int logflags);
+void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk);
+void nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf,
+			       unsigned int hooknum, const struct sk_buff *skb,
+			       const struct net_device *in,
+			       const struct net_device *out,
+			       const struct nf_loginfo *loginfo,
+			       const char *prefix);
+
 #endif /* _NF_LOG_H */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 730faacbc5f8..0c980293d225 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -159,6 +159,11 @@ config IP_NF_TARGET_SYNPROXY
 
 	  To compile it as a module, choose M here. If unsure, say N.
 
+config NF_LOG_IPV4
+	tristate "IPv4 packet logging"
+	default m if NETFILTER_ADVANCED=n
+	select NF_LOG_COMMON
+
 # NAT + specific targets: nf_conntrack
 config NF_NAT_IPV4
 	tristate "IPv4 NAT"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 90b82405331e..730e0c1ead90 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -19,6 +19,9 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
 # defrag
 obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
 
+# logging
+obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o
+
 # NAT helpers (nf_conntrack)
 obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
 obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
new file mode 100644
index 000000000000..7e69a401a29e
--- /dev/null
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -0,0 +1,385 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <net/netfilter/nf_log.h>
+
+static struct nf_loginfo default_loginfo = {
+	.type	= NF_LOG_TYPE_LOG,
+	.u = {
+		.log = {
+			.level	  = 5,
+			.logflags = NF_LOG_MASK,
+		},
+	},
+};
+
+/* One level of recursion won't kill us */
+static void dump_ipv4_packet(struct nf_log_buf *m,
+			     const struct nf_loginfo *info,
+			     const struct sk_buff *skb, unsigned int iphoff)
+{
+	struct iphdr _iph;
+	const struct iphdr *ih;
+	unsigned int logflags;
+
+	if (info->type == NF_LOG_TYPE_LOG)
+		logflags = info->u.log.logflags;
+	else
+		logflags = NF_LOG_MASK;
+
+	ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
+	if (ih == NULL) {
+		nf_log_buf_add(m, "TRUNCATED");
+		return;
+	}
+
+	/* Important fields:
+	 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
+	/* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
+	nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr);
+
+	/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
+	nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+		       ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
+		       ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
+
+	/* Max length: 6 "CE DF MF " */
+	if (ntohs(ih->frag_off) & IP_CE)
+		nf_log_buf_add(m, "CE ");
+	if (ntohs(ih->frag_off) & IP_DF)
+		nf_log_buf_add(m, "DF ");
+	if (ntohs(ih->frag_off) & IP_MF)
+		nf_log_buf_add(m, "MF ");
+
+	/* Max length: 11 "FRAG:65535 " */
+	if (ntohs(ih->frag_off) & IP_OFFSET)
+		nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+
+	if ((logflags & XT_LOG_IPOPT) &&
+	    ih->ihl * 4 > sizeof(struct iphdr)) {
+		const unsigned char *op;
+		unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
+		unsigned int i, optsize;
+
+		optsize = ih->ihl * 4 - sizeof(struct iphdr);
+		op = skb_header_pointer(skb, iphoff+sizeof(_iph),
+					optsize, _opt);
+		if (op == NULL) {
+			nf_log_buf_add(m, "TRUNCATED");
+			return;
+		}
+
+		/* Max length: 127 "OPT (" 15*4*2chars ") " */
+		nf_log_buf_add(m, "OPT (");
+		for (i = 0; i < optsize; i++)
+			nf_log_buf_add(m, "%02X", op[i]);
+		nf_log_buf_add(m, ") ");
+	}
+
+	switch (ih->protocol) {
+	case IPPROTO_TCP:
+		if (nf_log_dump_tcp_header(m, skb, ih->protocol,
+					   ntohs(ih->frag_off) & IP_OFFSET,
+					   iphoff+ih->ihl*4, logflags))
+			return;
+		break;
+	case IPPROTO_UDP:
+	case IPPROTO_UDPLITE:
+		if (nf_log_dump_udp_header(m, skb, ih->protocol,
+					   ntohs(ih->frag_off) & IP_OFFSET,
+					   iphoff+ih->ihl*4))
+			return;
+		break;
+	case IPPROTO_ICMP: {
+		struct icmphdr _icmph;
+		const struct icmphdr *ich;
+		static const size_t required_len[NR_ICMP_TYPES+1]
+			= { [ICMP_ECHOREPLY] = 4,
+			    [ICMP_DEST_UNREACH]
+			    = 8 + sizeof(struct iphdr),
+			    [ICMP_SOURCE_QUENCH]
+			    = 8 + sizeof(struct iphdr),
+			    [ICMP_REDIRECT]
+			    = 8 + sizeof(struct iphdr),
+			    [ICMP_ECHO] = 4,
+			    [ICMP_TIME_EXCEEDED]
+			    = 8 + sizeof(struct iphdr),
+			    [ICMP_PARAMETERPROB]
+			    = 8 + sizeof(struct iphdr),
+			    [ICMP_TIMESTAMP] = 20,
+			    [ICMP_TIMESTAMPREPLY] = 20,
+			    [ICMP_ADDRESS] = 12,
+			    [ICMP_ADDRESSREPLY] = 12 };
+
+		/* Max length: 11 "PROTO=ICMP " */
+		nf_log_buf_add(m, "PROTO=ICMP ");
+
+		if (ntohs(ih->frag_off) & IP_OFFSET)
+			break;
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+					 sizeof(_icmph), &_icmph);
+		if (ich == NULL) {
+			nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+				       skb->len - iphoff - ih->ihl*4);
+			break;
+		}
+
+		/* Max length: 18 "TYPE=255 CODE=255 " */
+		nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		if (ich->type <= NR_ICMP_TYPES &&
+		    required_len[ich->type] &&
+		    skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
+			nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+				       skb->len - iphoff - ih->ihl*4);
+			break;
+		}
+
+		switch (ich->type) {
+		case ICMP_ECHOREPLY:
+		case ICMP_ECHO:
+			/* Max length: 19 "ID=65535 SEQ=65535 " */
+			nf_log_buf_add(m, "ID=%u SEQ=%u ",
+				       ntohs(ich->un.echo.id),
+				       ntohs(ich->un.echo.sequence));
+			break;
+
+		case ICMP_PARAMETERPROB:
+			/* Max length: 14 "PARAMETER=255 " */
+			nf_log_buf_add(m, "PARAMETER=%u ",
+				       ntohl(ich->un.gateway) >> 24);
+			break;
+		case ICMP_REDIRECT:
+			/* Max length: 24 "GATEWAY=255.255.255.255 " */
+			nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
+			/* Fall through */
+		case ICMP_DEST_UNREACH:
+		case ICMP_SOURCE_QUENCH:
+		case ICMP_TIME_EXCEEDED:
+			/* Max length: 3+maxlen */
+			if (!iphoff) { /* Only recurse once. */
+				nf_log_buf_add(m, "[");
+				dump_ipv4_packet(m, info, skb,
+					    iphoff + ih->ihl*4+sizeof(_icmph));
+				nf_log_buf_add(m, "] ");
+			}
+
+			/* Max length: 10 "MTU=65535 " */
+			if (ich->type == ICMP_DEST_UNREACH &&
+			    ich->code == ICMP_FRAG_NEEDED) {
+				nf_log_buf_add(m, "MTU=%u ",
+					       ntohs(ich->un.frag.mtu));
+			}
+		}
+		break;
+	}
+	/* Max Length */
+	case IPPROTO_AH: {
+		struct ip_auth_hdr _ahdr;
+		const struct ip_auth_hdr *ah;
+
+		if (ntohs(ih->frag_off) & IP_OFFSET)
+			break;
+
+		/* Max length: 9 "PROTO=AH " */
+		nf_log_buf_add(m, "PROTO=AH ");
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
+					sizeof(_ahdr), &_ahdr);
+		if (ah == NULL) {
+			nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+				       skb->len - iphoff - ih->ihl*4);
+			break;
+		}
+
+		/* Length: 15 "SPI=0xF1234567 " */
+		nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
+		break;
+	}
+	case IPPROTO_ESP: {
+		struct ip_esp_hdr _esph;
+		const struct ip_esp_hdr *eh;
+
+		/* Max length: 10 "PROTO=ESP " */
+		nf_log_buf_add(m, "PROTO=ESP ");
+
+		if (ntohs(ih->frag_off) & IP_OFFSET)
+			break;
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
+					sizeof(_esph), &_esph);
+		if (eh == NULL) {
+			nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+				       skb->len - iphoff - ih->ihl*4);
+			break;
+		}
+
+		/* Length: 15 "SPI=0xF1234567 " */
+		nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi));
+		break;
+	}
+	/* Max length: 10 "PROTO 255 " */
+	default:
+		nf_log_buf_add(m, "PROTO=%u ", ih->protocol);
+	}
+
+	/* Max length: 15 "UID=4294967295 " */
+	if ((logflags & XT_LOG_UID) && !iphoff)
+		nf_log_dump_sk_uid_gid(m, skb->sk);
+
+	/* Max length: 16 "MARK=0xFFFFFFFF " */
+	if (!iphoff && skb->mark)
+		nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
+
+	/* Proto    Max log string length */
+	/* IP:	    40+46+6+11+127 = 230 */
+	/* TCP:     10+max(25,20+30+13+9+32+11+127) = 252 */
+	/* UDP:     10+max(25,20) = 35 */
+	/* UDPLITE: 14+max(25,20) = 39 */
+	/* ICMP:    11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
+	/* ESP:     10+max(25)+15 = 50 */
+	/* AH:	    9+max(25)+15 = 49 */
+	/* unknown: 10 */
+
+	/* (ICMP allows recursion one level deep) */
+	/* maxlen =  IP + ICMP +  IP + max(TCP,UDP,ICMP,unknown) */
+	/* maxlen = 230+   91  + 230 + 252 = 803 */
+}
+
+static void dump_ipv4_mac_header(struct nf_log_buf *m,
+			    const struct nf_loginfo *info,
+			    const struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	unsigned int logflags = 0;
+
+	if (info->type == NF_LOG_TYPE_LOG)
+		logflags = info->u.log.logflags;
+
+	if (!(logflags & XT_LOG_MACDECODE))
+		goto fallback;
+
+	switch (dev->type) {
+	case ARPHRD_ETHER:
+		nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+			       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+			       ntohs(eth_hdr(skb)->h_proto));
+		return;
+	default:
+		break;
+	}
+
+fallback:
+	nf_log_buf_add(m, "MAC=");
+	if (dev->hard_header_len &&
+	    skb->mac_header != skb->network_header) {
+		const unsigned char *p = skb_mac_header(skb);
+		unsigned int i;
+
+		nf_log_buf_add(m, "%02x", *p++);
+		for (i = 1; i < dev->hard_header_len; i++, p++)
+			nf_log_buf_add(m, ":%02x", *p);
+	}
+	nf_log_buf_add(m, " ");
+}
+
+void nf_log_ip_packet(struct net *net, u_int8_t pf,
+		      unsigned int hooknum, const struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      const struct nf_loginfo *loginfo,
+		      const char *prefix)
+{
+	struct nf_log_buf *m;
+
+	/* FIXME: Disabled from containers until syslog ns is supported */
+	if (!net_eq(net, &init_net))
+		return;
+
+	m = nf_log_buf_open();
+
+	if (!loginfo)
+		loginfo = &default_loginfo;
+
+	nf_log_dump_packet_common(m, pf, hooknum, skb, in,
+				  out, loginfo, prefix);
+
+	if (in != NULL)
+		dump_ipv4_mac_header(m, loginfo, skb);
+
+	dump_ipv4_packet(m, loginfo, skb, 0);
+
+	nf_log_buf_close(m);
+}
+EXPORT_SYMBOL_GPL(nf_log_ip_packet);
+
+static struct nf_logger nf_ip_logger __read_mostly = {
+	.name		= "nf_log_ipv4",
+	.type		= NF_LOG_TYPE_LOG,
+	.logfn		= nf_log_ip_packet,
+	.me		= THIS_MODULE,
+};
+
+static int __net_init nf_log_ipv4_net_init(struct net *net)
+{
+	nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger);
+	return 0;
+}
+
+static void __net_exit nf_log_ipv4_net_exit(struct net *net)
+{
+	nf_log_unset(net, &nf_ip_logger);
+}
+
+static struct pernet_operations nf_log_ipv4_net_ops = {
+	.init = nf_log_ipv4_net_init,
+	.exit = nf_log_ipv4_net_exit,
+};
+
+static int __init nf_log_ipv4_init(void)
+{
+	int ret;
+
+	ret = register_pernet_subsys(&nf_log_ipv4_net_ops);
+	if (ret < 0)
+		return ret;
+
+	nf_log_register(NFPROTO_IPV4, &nf_ip_logger);
+	return 0;
+}
+
+static void __exit nf_log_ipv4_exit(void)
+{
+	unregister_pernet_subsys(&nf_log_ipv4_net_ops);
+	nf_log_unregister(&nf_ip_logger);
+}
+
+module_init(nf_log_ipv4_init);
+module_exit(nf_log_ipv4_exit);
+
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter IPv4 packet logging");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 4bff1f297e39..1537130e9f5b 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -227,6 +227,11 @@ config IP6_NF_SECURITY
 
          If unsure, say N.
 
+config NF_LOG_IPV6
+	tristate "IPv6 packet logging"
+	depends on NETFILTER_ADVANCED
+	select NF_LOG_COMMON
+
 config NF_NAT_IPV6
 	tristate "IPv6 NAT"
 	depends on NF_CONNTRACK_IPV6
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 70d3dd66f2cd..c0b263104ed2 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -23,6 +23,9 @@ obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
 nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
 obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
 
+# logging
+obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o
+
 # nf_tables
 obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o
 obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
new file mode 100644
index 000000000000..804060946d2b
--- /dev/null
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -0,0 +1,417 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <net/netfilter/nf_log.h>
+
+static struct nf_loginfo default_loginfo = {
+	.type	= NF_LOG_TYPE_LOG,
+	.u = {
+		.log = {
+			.level	  = 5,
+			.logflags = NF_LOG_MASK,
+		},
+	},
+};
+
+/* One level of recursion won't kill us */
+static void dump_ipv6_packet(struct nf_log_buf *m,
+			     const struct nf_loginfo *info,
+			     const struct sk_buff *skb, unsigned int ip6hoff,
+			     int recurse)
+{
+	u_int8_t currenthdr;
+	int fragment;
+	struct ipv6hdr _ip6h;
+	const struct ipv6hdr *ih;
+	unsigned int ptr;
+	unsigned int hdrlen = 0;
+	unsigned int logflags;
+
+	if (info->type == NF_LOG_TYPE_LOG)
+		logflags = info->u.log.logflags;
+	else
+		logflags = NF_LOG_MASK;
+
+	ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
+	if (ih == NULL) {
+		nf_log_buf_add(m, "TRUNCATED");
+		return;
+	}
+
+	/* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
+	nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
+
+	/* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
+	nf_log_buf_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
+	       ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
+	       (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
+	       ih->hop_limit,
+	       (ntohl(*(__be32 *)ih) & 0x000fffff));
+
+	fragment = 0;
+	ptr = ip6hoff + sizeof(struct ipv6hdr);
+	currenthdr = ih->nexthdr;
+	while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) {
+		struct ipv6_opt_hdr _hdr;
+		const struct ipv6_opt_hdr *hp;
+
+		hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
+		if (hp == NULL) {
+			nf_log_buf_add(m, "TRUNCATED");
+			return;
+		}
+
+		/* Max length: 48 "OPT (...) " */
+		if (logflags & XT_LOG_IPOPT)
+			nf_log_buf_add(m, "OPT ( ");
+
+		switch (currenthdr) {
+		case IPPROTO_FRAGMENT: {
+			struct frag_hdr _fhdr;
+			const struct frag_hdr *fh;
+
+			nf_log_buf_add(m, "FRAG:");
+			fh = skb_header_pointer(skb, ptr, sizeof(_fhdr),
+						&_fhdr);
+			if (fh == NULL) {
+				nf_log_buf_add(m, "TRUNCATED ");
+				return;
+			}
+
+			/* Max length: 6 "65535 " */
+			nf_log_buf_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8);
+
+			/* Max length: 11 "INCOMPLETE " */
+			if (fh->frag_off & htons(0x0001))
+				nf_log_buf_add(m, "INCOMPLETE ");
+
+			nf_log_buf_add(m, "ID:%08x ",
+				       ntohl(fh->identification));
+
+			if (ntohs(fh->frag_off) & 0xFFF8)
+				fragment = 1;
+
+			hdrlen = 8;
+
+			break;
+		}
+		case IPPROTO_DSTOPTS:
+		case IPPROTO_ROUTING:
+		case IPPROTO_HOPOPTS:
+			if (fragment) {
+				if (logflags & XT_LOG_IPOPT)
+					nf_log_buf_add(m, ")");
+				return;
+			}
+			hdrlen = ipv6_optlen(hp);
+			break;
+		/* Max Length */
+		case IPPROTO_AH:
+			if (logflags & XT_LOG_IPOPT) {
+				struct ip_auth_hdr _ahdr;
+				const struct ip_auth_hdr *ah;
+
+				/* Max length: 3 "AH " */
+				nf_log_buf_add(m, "AH ");
+
+				if (fragment) {
+					nf_log_buf_add(m, ")");
+					return;
+				}
+
+				ah = skb_header_pointer(skb, ptr, sizeof(_ahdr),
+							&_ahdr);
+				if (ah == NULL) {
+					/*
+					 * Max length: 26 "INCOMPLETE [65535
+					 *  bytes] )"
+					 */
+					nf_log_buf_add(m, "INCOMPLETE [%u bytes] )",
+						       skb->len - ptr);
+					return;
+				}
+
+				/* Length: 15 "SPI=0xF1234567 */
+				nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
+
+			}
+
+			hdrlen = (hp->hdrlen+2)<<2;
+			break;
+		case IPPROTO_ESP:
+			if (logflags & XT_LOG_IPOPT) {
+				struct ip_esp_hdr _esph;
+				const struct ip_esp_hdr *eh;
+
+				/* Max length: 4 "ESP " */
+				nf_log_buf_add(m, "ESP ");
+
+				if (fragment) {
+					nf_log_buf_add(m, ")");
+					return;
+				}
+
+				/*
+				 * Max length: 26 "INCOMPLETE [65535 bytes] )"
+				 */
+				eh = skb_header_pointer(skb, ptr, sizeof(_esph),
+							&_esph);
+				if (eh == NULL) {
+					nf_log_buf_add(m, "INCOMPLETE [%u bytes] )",
+						       skb->len - ptr);
+					return;
+				}
+
+				/* Length: 16 "SPI=0xF1234567 )" */
+				nf_log_buf_add(m, "SPI=0x%x )",
+					       ntohl(eh->spi));
+			}
+			return;
+		default:
+			/* Max length: 20 "Unknown Ext Hdr 255" */
+			nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr);
+			return;
+		}
+		if (logflags & XT_LOG_IPOPT)
+			nf_log_buf_add(m, ") ");
+
+		currenthdr = hp->nexthdr;
+		ptr += hdrlen;
+	}
+
+	switch (currenthdr) {
+	case IPPROTO_TCP:
+		if (nf_log_dump_tcp_header(m, skb, currenthdr, fragment,
+					   ptr, logflags))
+			return;
+		break;
+	case IPPROTO_UDP:
+	case IPPROTO_UDPLITE:
+		if (nf_log_dump_udp_header(m, skb, currenthdr, fragment, ptr))
+			return;
+		break;
+	case IPPROTO_ICMPV6: {
+		struct icmp6hdr _icmp6h;
+		const struct icmp6hdr *ic;
+
+		/* Max length: 13 "PROTO=ICMPv6 " */
+		nf_log_buf_add(m, "PROTO=ICMPv6 ");
+
+		if (fragment)
+			break;
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h);
+		if (ic == NULL) {
+			nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
+				       skb->len - ptr);
+			return;
+		}
+
+		/* Max length: 18 "TYPE=255 CODE=255 " */
+		nf_log_buf_add(m, "TYPE=%u CODE=%u ",
+			       ic->icmp6_type, ic->icmp6_code);
+
+		switch (ic->icmp6_type) {
+		case ICMPV6_ECHO_REQUEST:
+		case ICMPV6_ECHO_REPLY:
+			/* Max length: 19 "ID=65535 SEQ=65535 " */
+			nf_log_buf_add(m, "ID=%u SEQ=%u ",
+				ntohs(ic->icmp6_identifier),
+				ntohs(ic->icmp6_sequence));
+			break;
+		case ICMPV6_MGM_QUERY:
+		case ICMPV6_MGM_REPORT:
+		case ICMPV6_MGM_REDUCTION:
+			break;
+
+		case ICMPV6_PARAMPROB:
+			/* Max length: 17 "POINTER=ffffffff " */
+			nf_log_buf_add(m, "POINTER=%08x ",
+				       ntohl(ic->icmp6_pointer));
+			/* Fall through */
+		case ICMPV6_DEST_UNREACH:
+		case ICMPV6_PKT_TOOBIG:
+		case ICMPV6_TIME_EXCEED:
+			/* Max length: 3+maxlen */
+			if (recurse) {
+				nf_log_buf_add(m, "[");
+				dump_ipv6_packet(m, info, skb,
+						 ptr + sizeof(_icmp6h), 0);
+				nf_log_buf_add(m, "] ");
+			}
+
+			/* Max length: 10 "MTU=65535 " */
+			if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) {
+				nf_log_buf_add(m, "MTU=%u ",
+					       ntohl(ic->icmp6_mtu));
+			}
+		}
+		break;
+	}
+	/* Max length: 10 "PROTO=255 " */
+	default:
+		nf_log_buf_add(m, "PROTO=%u ", currenthdr);
+	}
+
+	/* Max length: 15 "UID=4294967295 " */
+	if ((logflags & XT_LOG_UID) && recurse)
+		nf_log_dump_sk_uid_gid(m, skb->sk);
+
+	/* Max length: 16 "MARK=0xFFFFFFFF " */
+	if (recurse && skb->mark)
+		nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
+}
+
+static void dump_ipv6_mac_header(struct nf_log_buf *m,
+				 const struct nf_loginfo *info,
+				 const struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	unsigned int logflags = 0;
+
+	if (info->type == NF_LOG_TYPE_LOG)
+		logflags = info->u.log.logflags;
+
+	if (!(logflags & XT_LOG_MACDECODE))
+		goto fallback;
+
+	switch (dev->type) {
+	case ARPHRD_ETHER:
+		nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+		       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
+		       ntohs(eth_hdr(skb)->h_proto));
+		return;
+	default:
+		break;
+	}
+
+fallback:
+	nf_log_buf_add(m, "MAC=");
+	if (dev->hard_header_len &&
+	    skb->mac_header != skb->network_header) {
+		const unsigned char *p = skb_mac_header(skb);
+		unsigned int len = dev->hard_header_len;
+		unsigned int i;
+
+		if (dev->type == ARPHRD_SIT) {
+			p -= ETH_HLEN;
+
+			if (p < skb->head)
+				p = NULL;
+		}
+
+		if (p != NULL) {
+			nf_log_buf_add(m, "%02x", *p++);
+			for (i = 1; i < len; i++)
+				nf_log_buf_add(m, ":%02x", *p++);
+		}
+		nf_log_buf_add(m, " ");
+
+		if (dev->type == ARPHRD_SIT) {
+			const struct iphdr *iph =
+				(struct iphdr *)skb_mac_header(skb);
+			nf_log_buf_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr,
+				       &iph->daddr);
+		}
+	} else {
+		nf_log_buf_add(m, " ");
+	}
+}
+
+void nf_log_ip6_packet(struct net *net, u_int8_t pf,
+		       unsigned int hooknum, const struct sk_buff *skb,
+		       const struct net_device *in,
+		       const struct net_device *out,
+		       const struct nf_loginfo *loginfo,
+		       const char *prefix)
+{
+	struct nf_log_buf *m;
+
+	/* FIXME: Disabled from containers until syslog ns is supported */
+	if (!net_eq(net, &init_net))
+		return;
+
+	m = nf_log_buf_open();
+
+	if (!loginfo)
+		loginfo = &default_loginfo;
+
+	nf_log_dump_packet_common(m, pf, hooknum, skb, in, out,
+				  loginfo, prefix);
+
+	if (in != NULL)
+		dump_ipv6_mac_header(m, loginfo, skb);
+
+	dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1);
+
+	nf_log_buf_close(m);
+}
+EXPORT_SYMBOL_GPL(nf_log_ip6_packet);
+
+static struct nf_logger nf_ip6_logger __read_mostly = {
+	.name		= "nf_log_ipv6",
+	.type		= NF_LOG_TYPE_LOG,
+	.logfn		= nf_log_ip6_packet,
+	.me		= THIS_MODULE,
+};
+
+static int __net_init nf_log_ipv6_net_init(struct net *net)
+{
+	nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger);
+	return 0;
+}
+
+static void __net_exit nf_log_ipv6_net_exit(struct net *net)
+{
+	nf_log_unset(net, &nf_ip6_logger);
+}
+
+static struct pernet_operations nf_log_ipv6_net_ops = {
+	.init = nf_log_ipv6_net_init,
+	.exit = nf_log_ipv6_net_exit,
+};
+
+static int __init nf_log_ipv6_init(void)
+{
+	int ret;
+
+	ret = register_pernet_subsys(&nf_log_ipv6_net_ops);
+	if (ret < 0)
+		return ret;
+
+	nf_log_register(NFPROTO_IPV6, &nf_ip6_logger);
+	return 0;
+}
+
+static void __exit nf_log_ipv6_exit(void)
+{
+	unregister_pernet_subsys(&nf_log_ipv6_net_ops);
+	nf_log_unregister(&nf_ip6_logger);
+}
+
+module_init(nf_log_ipv6_init);
+module_exit(nf_log_ipv6_exit);
+
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter IPv4 packet logging");
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e9410d17619d..f17b273c9082 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -359,6 +359,9 @@ config NETFILTER_NETLINK_QUEUE_CT
 	  If this option is enabled, NFQUEUE can include Connection Tracking
 	  information together with the packet is the enqueued via NFNETLINK.
 
+config NF_LOG_COMMON
+	tristate
+
 config NF_NAT
 	tristate
 
@@ -744,6 +747,9 @@ config NETFILTER_XT_TARGET_LED
 
 config NETFILTER_XT_TARGET_LOG
 	tristate "LOG target support"
+	select NF_LOG
+	select NF_LOG_IPV4
+	select NF_LOG_IPV6
 	default m if NETFILTER_ADVANCED=n
 	help
 	  This option adds a `LOG' target, which allows you to create rules in
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index bffdad774da7..8308624a406a 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -47,6 +47,9 @@ obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o
 nf_nat-y	:= nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \
 		   nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o
 
+# generic transport layer logging
+obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
+
 obj-$(CONFIG_NF_NAT) += nf_nat.o
 
 # NAT protocols (nf_nat)
diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c
new file mode 100644
index 000000000000..eeb8ef4ff1a3
--- /dev/null
+++ b/net/netfilter/nf_log_common.c
@@ -0,0 +1,187 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <net/netfilter/nf_log.h>
+
+int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb,
+			   u8 proto, int fragment, unsigned int offset)
+{
+	struct udphdr _udph;
+	const struct udphdr *uh;
+
+	if (proto == IPPROTO_UDP)
+		/* Max length: 10 "PROTO=UDP "     */
+		nf_log_buf_add(m, "PROTO=UDP ");
+	else	/* Max length: 14 "PROTO=UDPLITE " */
+		nf_log_buf_add(m, "PROTO=UDPLITE ");
+
+	if (fragment)
+		goto out;
+
+	/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+	uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
+	if (uh == NULL) {
+		nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
+
+		return 1;
+	}
+
+	/* Max length: 20 "SPT=65535 DPT=65535 " */
+	nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ",
+		       ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len));
+
+out:
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_log_dump_udp_header);
+
+int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb,
+			   u8 proto, int fragment, unsigned int offset,
+			   unsigned int logflags)
+{
+	struct tcphdr _tcph;
+	const struct tcphdr *th;
+
+	/* Max length: 10 "PROTO=TCP " */
+	nf_log_buf_add(m, "PROTO=TCP ");
+
+	if (fragment)
+		return 0;
+
+	/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+	th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
+	if (th == NULL) {
+		nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
+		return 1;
+	}
+
+	/* Max length: 20 "SPT=65535 DPT=65535 " */
+	nf_log_buf_add(m, "SPT=%u DPT=%u ",
+		       ntohs(th->source), ntohs(th->dest));
+	/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
+	if (logflags & XT_LOG_TCPSEQ) {
+		nf_log_buf_add(m, "SEQ=%u ACK=%u ",
+			       ntohl(th->seq), ntohl(th->ack_seq));
+	}
+
+	/* Max length: 13 "WINDOW=65535 " */
+	nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window));
+	/* Max length: 9 "RES=0x3C " */
+	nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) &
+					    TCP_RESERVED_BITS) >> 22));
+	/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
+	if (th->cwr)
+		nf_log_buf_add(m, "CWR ");
+	if (th->ece)
+		nf_log_buf_add(m, "ECE ");
+	if (th->urg)
+		nf_log_buf_add(m, "URG ");
+	if (th->ack)
+		nf_log_buf_add(m, "ACK ");
+	if (th->psh)
+		nf_log_buf_add(m, "PSH ");
+	if (th->rst)
+		nf_log_buf_add(m, "RST ");
+	if (th->syn)
+		nf_log_buf_add(m, "SYN ");
+	if (th->fin)
+		nf_log_buf_add(m, "FIN ");
+	/* Max length: 11 "URGP=65535 " */
+	nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr));
+
+	if ((logflags & XT_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) {
+		u_int8_t _opt[60 - sizeof(struct tcphdr)];
+		const u_int8_t *op;
+		unsigned int i;
+		unsigned int optsize = th->doff*4 - sizeof(struct tcphdr);
+
+		op = skb_header_pointer(skb, offset + sizeof(struct tcphdr),
+					optsize, _opt);
+		if (op == NULL) {
+			nf_log_buf_add(m, "OPT (TRUNCATED)");
+			return 1;
+		}
+
+		/* Max length: 127 "OPT (" 15*4*2chars ") " */
+		nf_log_buf_add(m, "OPT (");
+		for (i = 0; i < optsize; i++)
+			nf_log_buf_add(m, "%02X", op[i]);
+
+		nf_log_buf_add(m, ") ");
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header);
+
+void nf_log_dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk)
+{
+	if (!sk || sk->sk_state == TCP_TIME_WAIT)
+		return;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	if (sk->sk_socket && sk->sk_socket->file) {
+		const struct cred *cred = sk->sk_socket->file->f_cred;
+		nf_log_buf_add(m, "UID=%u GID=%u ",
+			from_kuid_munged(&init_user_ns, cred->fsuid),
+			from_kgid_munged(&init_user_ns, cred->fsgid));
+	}
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+EXPORT_SYMBOL_GPL(nf_log_dump_sk_uid_gid);
+
+void
+nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf,
+			  unsigned int hooknum, const struct sk_buff *skb,
+			  const struct net_device *in,
+			  const struct net_device *out,
+			  const struct nf_loginfo *loginfo, const char *prefix)
+{
+	nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ",
+	       '0' + loginfo->u.log.level, prefix,
+	       in ? in->name : "",
+	       out ? out->name : "");
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if (skb->nf_bridge) {
+		const struct net_device *physindev;
+		const struct net_device *physoutdev;
+
+		physindev = skb->nf_bridge->physindev;
+		if (physindev && in != physindev)
+			nf_log_buf_add(m, "PHYSIN=%s ", physindev->name);
+		physoutdev = skb->nf_bridge->physoutdev;
+		if (physoutdev && out != physoutdev)
+			nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name);
+	}
+#endif
+}
+EXPORT_SYMBOL_GPL(nf_log_dump_packet_common);
+
+static int __init nf_log_common_init(void)
+{
+	return 0;
+}
+
+static void __exit nf_log_common_exit(void) {}
+
+module_init(nf_log_common_init);
+module_exit(nf_log_common_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
index 649b85fc5463..5a6bd60e20d6 100644
--- a/net/netfilter/xt_LOG.c
+++ b/net/netfilter/xt_LOG.c
@@ -28,813 +28,6 @@
 #include <linux/netfilter_ipv6/ip6_tables.h>
 #include <net/netfilter/nf_log.h>
 
-static struct nf_loginfo default_loginfo = {
-	.type	= NF_LOG_TYPE_LOG,
-	.u = {
-		.log = {
-			.level    = 5,
-			.logflags = NF_LOG_MASK,
-		},
-	},
-};
-
-static int dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb,
-			   u8 proto, int fragment, unsigned int offset)
-{
-	struct udphdr _udph;
-	const struct udphdr *uh;
-
-	if (proto == IPPROTO_UDP)
-		/* Max length: 10 "PROTO=UDP "     */
-		nf_log_buf_add(m, "PROTO=UDP ");
-	else	/* Max length: 14 "PROTO=UDPLITE " */
-		nf_log_buf_add(m, "PROTO=UDPLITE ");
-
-	if (fragment)
-		goto out;
-
-	/* Max length: 25 "INCOMPLETE [65535 bytes] " */
-	uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
-	if (uh == NULL) {
-		nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
-
-		return 1;
-	}
-
-	/* Max length: 20 "SPT=65535 DPT=65535 " */
-	nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ",
-		       ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len));
-
-out:
-	return 0;
-}
-
-static int dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb,
-			   u8 proto, int fragment, unsigned int offset,
-			   unsigned int logflags)
-{
-	struct tcphdr _tcph;
-	const struct tcphdr *th;
-
-	/* Max length: 10 "PROTO=TCP " */
-	nf_log_buf_add(m, "PROTO=TCP ");
-
-	if (fragment)
-		return 0;
-
-	/* Max length: 25 "INCOMPLETE [65535 bytes] " */
-	th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
-	if (th == NULL) {
-		nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset);
-		return 1;
-	}
-
-	/* Max length: 20 "SPT=65535 DPT=65535 " */
-	nf_log_buf_add(m, "SPT=%u DPT=%u ",
-		       ntohs(th->source), ntohs(th->dest));
-	/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
-	if (logflags & XT_LOG_TCPSEQ) {
-		nf_log_buf_add(m, "SEQ=%u ACK=%u ",
-			       ntohl(th->seq), ntohl(th->ack_seq));
-	}
-
-	/* Max length: 13 "WINDOW=65535 " */
-	nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window));
-	/* Max length: 9 "RES=0x3C " */
-	nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) &
-					    TCP_RESERVED_BITS) >> 22));
-	/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
-	if (th->cwr)
-		nf_log_buf_add(m, "CWR ");
-	if (th->ece)
-		nf_log_buf_add(m, "ECE ");
-	if (th->urg)
-		nf_log_buf_add(m, "URG ");
-	if (th->ack)
-		nf_log_buf_add(m, "ACK ");
-	if (th->psh)
-		nf_log_buf_add(m, "PSH ");
-	if (th->rst)
-		nf_log_buf_add(m, "RST ");
-	if (th->syn)
-		nf_log_buf_add(m, "SYN ");
-	if (th->fin)
-		nf_log_buf_add(m, "FIN ");
-	/* Max length: 11 "URGP=65535 " */
-	nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr));
-
-	if ((logflags & XT_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) {
-		u_int8_t _opt[60 - sizeof(struct tcphdr)];
-		const u_int8_t *op;
-		unsigned int i;
-		unsigned int optsize = th->doff*4 - sizeof(struct tcphdr);
-
-		op = skb_header_pointer(skb, offset + sizeof(struct tcphdr),
-					optsize, _opt);
-		if (op == NULL) {
-			nf_log_buf_add(m, "OPT (TRUNCATED)");
-			return 1;
-		}
-
-		/* Max length: 127 "OPT (" 15*4*2chars ") " */
-		nf_log_buf_add(m, "OPT (");
-		for (i = 0; i < optsize; i++)
-			nf_log_buf_add(m, "%02X", op[i]);
-
-		nf_log_buf_add(m, ") ");
-	}
-
-	return 0;
-}
-
-static void dump_sk_uid_gid(struct nf_log_buf *m, struct sock *sk)
-{
-	if (!sk || sk->sk_state == TCP_TIME_WAIT)
-		return;
-
-	read_lock_bh(&sk->sk_callback_lock);
-	if (sk->sk_socket && sk->sk_socket->file) {
-		const struct cred *cred = sk->sk_socket->file->f_cred;
-		nf_log_buf_add(m, "UID=%u GID=%u ",
-			from_kuid_munged(&init_user_ns, cred->fsuid),
-			from_kgid_munged(&init_user_ns, cred->fsgid));
-	}
-	read_unlock_bh(&sk->sk_callback_lock);
-}
-
-/* One level of recursion won't kill us */
-static void dump_ipv4_packet(struct nf_log_buf *m,
-			     const struct nf_loginfo *info,
-			     const struct sk_buff *skb, unsigned int iphoff)
-{
-	struct iphdr _iph;
-	const struct iphdr *ih;
-	unsigned int logflags;
-
-	if (info->type == NF_LOG_TYPE_LOG)
-		logflags = info->u.log.logflags;
-	else
-		logflags = NF_LOG_MASK;
-
-	ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
-	if (ih == NULL) {
-		nf_log_buf_add(m, "TRUNCATED");
-		return;
-	}
-
-	/* Important fields:
-	 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
-	/* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
-	nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ",
-	       &ih->saddr, &ih->daddr);
-
-	/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
-	nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
-	       ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
-	       ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
-
-	/* Max length: 6 "CE DF MF " */
-	if (ntohs(ih->frag_off) & IP_CE)
-		nf_log_buf_add(m, "CE ");
-	if (ntohs(ih->frag_off) & IP_DF)
-		nf_log_buf_add(m, "DF ");
-	if (ntohs(ih->frag_off) & IP_MF)
-		nf_log_buf_add(m, "MF ");
-
-	/* Max length: 11 "FRAG:65535 " */
-	if (ntohs(ih->frag_off) & IP_OFFSET)
-		nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
-
-	if ((logflags & XT_LOG_IPOPT) &&
-	    ih->ihl * 4 > sizeof(struct iphdr)) {
-		const unsigned char *op;
-		unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
-		unsigned int i, optsize;
-
-		optsize = ih->ihl * 4 - sizeof(struct iphdr);
-		op = skb_header_pointer(skb, iphoff+sizeof(_iph),
-					optsize, _opt);
-		if (op == NULL) {
-			nf_log_buf_add(m, "TRUNCATED");
-			return;
-		}
-
-		/* Max length: 127 "OPT (" 15*4*2chars ") " */
-		nf_log_buf_add(m, "OPT (");
-		for (i = 0; i < optsize; i++)
-			nf_log_buf_add(m, "%02X", op[i]);
-		nf_log_buf_add(m, ") ");
-	}
-
-	switch (ih->protocol) {
-	case IPPROTO_TCP:
-		if (dump_tcp_header(m, skb, ih->protocol,
-				    ntohs(ih->frag_off) & IP_OFFSET,
-				    iphoff+ih->ihl*4, logflags))
-			return;
-		break;
-	case IPPROTO_UDP:
-	case IPPROTO_UDPLITE:
-		if (dump_udp_header(m, skb, ih->protocol,
-				    ntohs(ih->frag_off) & IP_OFFSET,
-				    iphoff+ih->ihl*4))
-			return;
-		break;
-	case IPPROTO_ICMP: {
-		struct icmphdr _icmph;
-		const struct icmphdr *ich;
-		static const size_t required_len[NR_ICMP_TYPES+1]
-			= { [ICMP_ECHOREPLY] = 4,
-			    [ICMP_DEST_UNREACH]
-			    = 8 + sizeof(struct iphdr),
-			    [ICMP_SOURCE_QUENCH]
-			    = 8 + sizeof(struct iphdr),
-			    [ICMP_REDIRECT]
-			    = 8 + sizeof(struct iphdr),
-			    [ICMP_ECHO] = 4,
-			    [ICMP_TIME_EXCEEDED]
-			    = 8 + sizeof(struct iphdr),
-			    [ICMP_PARAMETERPROB]
-			    = 8 + sizeof(struct iphdr),
-			    [ICMP_TIMESTAMP] = 20,
-			    [ICMP_TIMESTAMPREPLY] = 20,
-			    [ICMP_ADDRESS] = 12,
-			    [ICMP_ADDRESSREPLY] = 12 };
-
-		/* Max length: 11 "PROTO=ICMP " */
-		nf_log_buf_add(m, "PROTO=ICMP ");
-
-		if (ntohs(ih->frag_off) & IP_OFFSET)
-			break;
-
-		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
-		ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
-					 sizeof(_icmph), &_icmph);
-		if (ich == NULL) {
-			nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
-				       skb->len - iphoff - ih->ihl*4);
-			break;
-		}
-
-		/* Max length: 18 "TYPE=255 CODE=255 " */
-		nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
-
-		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
-		if (ich->type <= NR_ICMP_TYPES &&
-		    required_len[ich->type] &&
-		    skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
-			nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
-			       skb->len - iphoff - ih->ihl*4);
-			break;
-		}
-
-		switch (ich->type) {
-		case ICMP_ECHOREPLY:
-		case ICMP_ECHO:
-			/* Max length: 19 "ID=65535 SEQ=65535 " */
-			nf_log_buf_add(m, "ID=%u SEQ=%u ",
-			       ntohs(ich->un.echo.id),
-			       ntohs(ich->un.echo.sequence));
-			break;
-
-		case ICMP_PARAMETERPROB:
-			/* Max length: 14 "PARAMETER=255 " */
-			nf_log_buf_add(m, "PARAMETER=%u ",
-			       ntohl(ich->un.gateway) >> 24);
-			break;
-		case ICMP_REDIRECT:
-			/* Max length: 24 "GATEWAY=255.255.255.255 " */
-			nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
-			/* Fall through */
-		case ICMP_DEST_UNREACH:
-		case ICMP_SOURCE_QUENCH:
-		case ICMP_TIME_EXCEEDED:
-			/* Max length: 3+maxlen */
-			if (!iphoff) { /* Only recurse once. */
-				nf_log_buf_add(m, "[");
-				dump_ipv4_packet(m, info, skb,
-					    iphoff + ih->ihl*4+sizeof(_icmph));
-				nf_log_buf_add(m, "] ");
-			}
-
-			/* Max length: 10 "MTU=65535 " */
-			if (ich->type == ICMP_DEST_UNREACH &&
-			    ich->code == ICMP_FRAG_NEEDED) {
-				nf_log_buf_add(m, "MTU=%u ",
-					       ntohs(ich->un.frag.mtu));
-			}
-		}
-		break;
-	}
-	/* Max Length */
-	case IPPROTO_AH: {
-		struct ip_auth_hdr _ahdr;
-		const struct ip_auth_hdr *ah;
-
-		if (ntohs(ih->frag_off) & IP_OFFSET)
-			break;
-
-		/* Max length: 9 "PROTO=AH " */
-		nf_log_buf_add(m, "PROTO=AH ");
-
-		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
-		ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
-					sizeof(_ahdr), &_ahdr);
-		if (ah == NULL) {
-			nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
-				       skb->len - iphoff - ih->ihl*4);
-			break;
-		}
-
-		/* Length: 15 "SPI=0xF1234567 " */
-		nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
-		break;
-	}
-	case IPPROTO_ESP: {
-		struct ip_esp_hdr _esph;
-		const struct ip_esp_hdr *eh;
-
-		/* Max length: 10 "PROTO=ESP " */
-		nf_log_buf_add(m, "PROTO=ESP ");
-
-		if (ntohs(ih->frag_off) & IP_OFFSET)
-			break;
-
-		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
-		eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
-					sizeof(_esph), &_esph);
-		if (eh == NULL) {
-			nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
-				       skb->len - iphoff - ih->ihl*4);
-			break;
-		}
-
-		/* Length: 15 "SPI=0xF1234567 " */
-		nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi));
-		break;
-	}
-	/* Max length: 10 "PROTO 255 " */
-	default:
-		nf_log_buf_add(m, "PROTO=%u ", ih->protocol);
-	}
-
-	/* Max length: 15 "UID=4294967295 " */
-	if ((logflags & XT_LOG_UID) && !iphoff)
-		dump_sk_uid_gid(m, skb->sk);
-
-	/* Max length: 16 "MARK=0xFFFFFFFF " */
-	if (!iphoff && skb->mark)
-		nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
-
-	/* Proto    Max log string length */
-	/* IP:      40+46+6+11+127 = 230 */
-	/* TCP:     10+max(25,20+30+13+9+32+11+127) = 252 */
-	/* UDP:     10+max(25,20) = 35 */
-	/* UDPLITE: 14+max(25,20) = 39 */
-	/* ICMP:    11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
-	/* ESP:     10+max(25)+15 = 50 */
-	/* AH:      9+max(25)+15 = 49 */
-	/* unknown: 10 */
-
-	/* (ICMP allows recursion one level deep) */
-	/* maxlen =  IP + ICMP +  IP + max(TCP,UDP,ICMP,unknown) */
-	/* maxlen = 230+   91  + 230 + 252 = 803 */
-}
-
-static void dump_ipv4_mac_header(struct nf_log_buf *m,
-				 const struct nf_loginfo *info,
-				 const struct sk_buff *skb)
-{
-	struct net_device *dev = skb->dev;
-	unsigned int logflags = 0;
-
-	if (info->type == NF_LOG_TYPE_LOG)
-		logflags = info->u.log.logflags;
-
-	if (!(logflags & XT_LOG_MACDECODE))
-		goto fallback;
-
-	switch (dev->type) {
-	case ARPHRD_ETHER:
-		nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
-			       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
-			       ntohs(eth_hdr(skb)->h_proto));
-		return;
-	default:
-		break;
-	}
-
-fallback:
-	nf_log_buf_add(m, "MAC=");
-	if (dev->hard_header_len &&
-	    skb->mac_header != skb->network_header) {
-		const unsigned char *p = skb_mac_header(skb);
-		unsigned int i;
-
-		nf_log_buf_add(m, "%02x", *p++);
-		for (i = 1; i < dev->hard_header_len; i++, p++)
-			nf_log_buf_add(m, ":%02x", *p);
-	}
-	nf_log_buf_add(m, " ");
-}
-
-static void
-log_packet_common(struct nf_log_buf *m,
-		  u_int8_t pf,
-		  unsigned int hooknum,
-		  const struct sk_buff *skb,
-		  const struct net_device *in,
-		  const struct net_device *out,
-		  const struct nf_loginfo *loginfo,
-		  const char *prefix)
-{
-	nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ",
-		       '0' + loginfo->u.log.level, prefix,
-		       in ? in->name : "",
-		       out ? out->name : "");
-#ifdef CONFIG_BRIDGE_NETFILTER
-	if (skb->nf_bridge) {
-		const struct net_device *physindev;
-		const struct net_device *physoutdev;
-
-		physindev = skb->nf_bridge->physindev;
-		if (physindev && in != physindev)
-			nf_log_buf_add(m, "PHYSIN=%s ", physindev->name);
-		physoutdev = skb->nf_bridge->physoutdev;
-		if (physoutdev && out != physoutdev)
-			nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name);
-	}
-#endif
-}
-
-
-static void
-ipt_log_packet(struct net *net,
-	       u_int8_t pf,
-	       unsigned int hooknum,
-	       const struct sk_buff *skb,
-	       const struct net_device *in,
-	       const struct net_device *out,
-	       const struct nf_loginfo *loginfo,
-	       const char *prefix)
-{
-	struct nf_log_buf *m;
-
-	/* FIXME: Disabled from containers until syslog ns is supported */
-	if (!net_eq(net, &init_net))
-		return;
-
-	m = nf_log_buf_open();
-
-	if (!loginfo)
-		loginfo = &default_loginfo;
-
-	log_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix);
-
-	if (in != NULL)
-		dump_ipv4_mac_header(m, loginfo, skb);
-
-	dump_ipv4_packet(m, loginfo, skb, 0);
-
-	nf_log_buf_close(m);
-}
-
-#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
-/* One level of recursion won't kill us */
-static void dump_ipv6_packet(struct nf_log_buf *m,
-			     const struct nf_loginfo *info,
-			     const struct sk_buff *skb, unsigned int ip6hoff,
-			     int recurse)
-{
-	u_int8_t currenthdr;
-	int fragment;
-	struct ipv6hdr _ip6h;
-	const struct ipv6hdr *ih;
-	unsigned int ptr;
-	unsigned int hdrlen = 0;
-	unsigned int logflags;
-
-	if (info->type == NF_LOG_TYPE_LOG)
-		logflags = info->u.log.logflags;
-	else
-		logflags = NF_LOG_MASK;
-
-	ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
-	if (ih == NULL) {
-		nf_log_buf_add(m, "TRUNCATED");
-		return;
-	}
-
-	/* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
-	nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
-
-	/* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
-	nf_log_buf_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
-		       ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
-		       (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
-		       ih->hop_limit, (ntohl(*(__be32 *)ih) & 0x000fffff));
-
-	fragment = 0;
-	ptr = ip6hoff + sizeof(struct ipv6hdr);
-	currenthdr = ih->nexthdr;
-	while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) {
-		struct ipv6_opt_hdr _hdr;
-		const struct ipv6_opt_hdr *hp;
-
-		hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
-		if (hp == NULL) {
-			nf_log_buf_add(m, "TRUNCATED");
-			return;
-		}
-
-		/* Max length: 48 "OPT (...) " */
-		if (logflags & XT_LOG_IPOPT)
-			nf_log_buf_add(m, "OPT ( ");
-
-		switch (currenthdr) {
-		case IPPROTO_FRAGMENT: {
-			struct frag_hdr _fhdr;
-			const struct frag_hdr *fh;
-
-			nf_log_buf_add(m, "FRAG:");
-			fh = skb_header_pointer(skb, ptr, sizeof(_fhdr),
-						&_fhdr);
-			if (fh == NULL) {
-				nf_log_buf_add(m, "TRUNCATED ");
-				return;
-			}
-
-			/* Max length: 6 "65535 " */
-			nf_log_buf_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8);
-
-			/* Max length: 11 "INCOMPLETE " */
-			if (fh->frag_off & htons(0x0001))
-				nf_log_buf_add(m, "INCOMPLETE ");
-
-			nf_log_buf_add(m, "ID:%08x ", ntohl(fh->identification));
-
-			if (ntohs(fh->frag_off) & 0xFFF8)
-				fragment = 1;
-
-			hdrlen = 8;
-
-			break;
-		}
-		case IPPROTO_DSTOPTS:
-		case IPPROTO_ROUTING:
-		case IPPROTO_HOPOPTS:
-			if (fragment) {
-				if (logflags & XT_LOG_IPOPT)
-					nf_log_buf_add(m, ")");
-				return;
-			}
-			hdrlen = ipv6_optlen(hp);
-			break;
-		/* Max Length */
-		case IPPROTO_AH:
-			if (logflags & XT_LOG_IPOPT) {
-				struct ip_auth_hdr _ahdr;
-				const struct ip_auth_hdr *ah;
-
-				/* Max length: 3 "AH " */
-				nf_log_buf_add(m, "AH ");
-
-				if (fragment) {
-					nf_log_buf_add(m, ")");
-					return;
-				}
-
-				ah = skb_header_pointer(skb, ptr, sizeof(_ahdr),
-							&_ahdr);
-				if (ah == NULL) {
-					/*
-					 * Max length: 26 "INCOMPLETE [65535
-					 *  bytes] )"
-					 */
-					nf_log_buf_add(m, "INCOMPLETE [%u bytes] )",
-						       skb->len - ptr);
-					return;
-				}
-
-				/* Length: 15 "SPI=0xF1234567 */
-				nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
-
-			}
-
-			hdrlen = (hp->hdrlen+2)<<2;
-			break;
-		case IPPROTO_ESP:
-			if (logflags & XT_LOG_IPOPT) {
-				struct ip_esp_hdr _esph;
-				const struct ip_esp_hdr *eh;
-
-				/* Max length: 4 "ESP " */
-				nf_log_buf_add(m, "ESP ");
-
-				if (fragment) {
-					nf_log_buf_add(m, ")");
-					return;
-				}
-
-				/*
-				 * Max length: 26 "INCOMPLETE [65535 bytes] )"
-				 */
-				eh = skb_header_pointer(skb, ptr, sizeof(_esph),
-							&_esph);
-				if (eh == NULL) {
-					nf_log_buf_add(m, "INCOMPLETE [%u bytes] )",
-						       skb->len - ptr);
-					return;
-				}
-
-				/* Length: 16 "SPI=0xF1234567 )" */
-				nf_log_buf_add(m, "SPI=0x%x )", ntohl(eh->spi));
-
-			}
-			return;
-		default:
-			/* Max length: 20 "Unknown Ext Hdr 255" */
-			nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr);
-			return;
-		}
-		if (logflags & XT_LOG_IPOPT)
-			nf_log_buf_add(m, ") ");
-
-		currenthdr = hp->nexthdr;
-		ptr += hdrlen;
-	}
-
-	switch (currenthdr) {
-	case IPPROTO_TCP:
-		if (dump_tcp_header(m, skb, currenthdr, fragment, ptr,
-		    logflags))
-			return;
-		break;
-	case IPPROTO_UDP:
-	case IPPROTO_UDPLITE:
-		if (dump_udp_header(m, skb, currenthdr, fragment, ptr))
-			return;
-		break;
-	case IPPROTO_ICMPV6: {
-		struct icmp6hdr _icmp6h;
-		const struct icmp6hdr *ic;
-
-		/* Max length: 13 "PROTO=ICMPv6 " */
-		nf_log_buf_add(m, "PROTO=ICMPv6 ");
-
-		if (fragment)
-			break;
-
-		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
-		ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h);
-		if (ic == NULL) {
-			nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
-				       skb->len - ptr);
-			return;
-		}
-
-		/* Max length: 18 "TYPE=255 CODE=255 " */
-		nf_log_buf_add(m, "TYPE=%u CODE=%u ",
-			       ic->icmp6_type, ic->icmp6_code);
-
-		switch (ic->icmp6_type) {
-		case ICMPV6_ECHO_REQUEST:
-		case ICMPV6_ECHO_REPLY:
-			/* Max length: 19 "ID=65535 SEQ=65535 " */
-			nf_log_buf_add(m, "ID=%u SEQ=%u ",
-				       ntohs(ic->icmp6_identifier),
-				       ntohs(ic->icmp6_sequence));
-			break;
-		case ICMPV6_MGM_QUERY:
-		case ICMPV6_MGM_REPORT:
-		case ICMPV6_MGM_REDUCTION:
-			break;
-
-		case ICMPV6_PARAMPROB:
-			/* Max length: 17 "POINTER=ffffffff " */
-			nf_log_buf_add(m, "POINTER=%08x ",
-				       ntohl(ic->icmp6_pointer));
-			/* Fall through */
-		case ICMPV6_DEST_UNREACH:
-		case ICMPV6_PKT_TOOBIG:
-		case ICMPV6_TIME_EXCEED:
-			/* Max length: 3+maxlen */
-			if (recurse) {
-				nf_log_buf_add(m, "[");
-				dump_ipv6_packet(m, info, skb,
-					    ptr + sizeof(_icmp6h), 0);
-				nf_log_buf_add(m, "] ");
-			}
-
-			/* Max length: 10 "MTU=65535 " */
-			if (ic->icmp6_type == ICMPV6_PKT_TOOBIG)
-				nf_log_buf_add(m, "MTU=%u ",
-					       ntohl(ic->icmp6_mtu));
-		}
-		break;
-	}
-	/* Max length: 10 "PROTO=255 " */
-	default:
-		nf_log_buf_add(m, "PROTO=%u ", currenthdr);
-	}
-
-	/* Max length: 15 "UID=4294967295 " */
-	if ((logflags & XT_LOG_UID) && recurse)
-		dump_sk_uid_gid(m, skb->sk);
-
-	/* Max length: 16 "MARK=0xFFFFFFFF " */
-	if (recurse && skb->mark)
-		nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
-}
-
-static void dump_ipv6_mac_header(struct nf_log_buf *m,
-				 const struct nf_loginfo *info,
-				 const struct sk_buff *skb)
-{
-	struct net_device *dev = skb->dev;
-	unsigned int logflags = 0;
-
-	if (info->type == NF_LOG_TYPE_LOG)
-		logflags = info->u.log.logflags;
-
-	if (!(logflags & XT_LOG_MACDECODE))
-		goto fallback;
-
-	switch (dev->type) {
-	case ARPHRD_ETHER:
-		nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
-			       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
-			       ntohs(eth_hdr(skb)->h_proto));
-		return;
-	default:
-		break;
-	}
-
-fallback:
-	nf_log_buf_add(m, "MAC=");
-	if (dev->hard_header_len &&
-	    skb->mac_header != skb->network_header) {
-		const unsigned char *p = skb_mac_header(skb);
-		unsigned int len = dev->hard_header_len;
-		unsigned int i;
-
-		if (dev->type == ARPHRD_SIT) {
-			p -= ETH_HLEN;
-
-			if (p < skb->head)
-				p = NULL;
-		}
-
-		if (p != NULL) {
-			nf_log_buf_add(m, "%02x", *p++);
-			for (i = 1; i < len; i++)
-				nf_log_buf_add(m, ":%02x", *p++);
-		}
-		nf_log_buf_add(m, " ");
-
-		if (dev->type == ARPHRD_SIT) {
-			const struct iphdr *iph =
-				(struct iphdr *)skb_mac_header(skb);
-			nf_log_buf_add(m, "TUNNEL=%pI4->%pI4 ",
-				       &iph->saddr, &iph->daddr);
-		}
-	} else {
-		nf_log_buf_add(m, " ");
-	}
-}
-
-static void
-ip6t_log_packet(struct net *net,
-		u_int8_t pf,
-		unsigned int hooknum,
-		const struct sk_buff *skb,
-		const struct net_device *in,
-		const struct net_device *out,
-		const struct nf_loginfo *loginfo,
-		const char *prefix)
-{
-	struct nf_log_buf *m;
-
-	/* FIXME: Disabled from containers until syslog ns is supported */
-	if (!net_eq(net, &init_net))
-		return;
-
-	m = nf_log_buf_open();
-
-	if (!loginfo)
-		loginfo = &default_loginfo;
-
-	log_packet_common(m, pf, hooknum, skb, in, out, loginfo, prefix);
-
-	if (in != NULL)
-		dump_ipv6_mac_header(m, loginfo, skb);
-
-	dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1);
-
-	nf_log_buf_close(m);
-}
-#endif
-
 static unsigned int
 log_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
@@ -847,12 +40,12 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	li.u.log.logflags = loginfo->logflags;
 
 	if (par->family == NFPROTO_IPV4)
-		ipt_log_packet(net, NFPROTO_IPV4, par->hooknum, skb, par->in,
-			       par->out, &li, loginfo->prefix);
+		nf_log_ip_packet(net, NFPROTO_IPV4, par->hooknum, skb, par->in,
+			         par->out, &li, loginfo->prefix);
 #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 	else if (par->family == NFPROTO_IPV6)
-		ip6t_log_packet(net, NFPROTO_IPV6, par->hooknum, skb, par->in,
-				par->out, &li, loginfo->prefix);
+		nf_log_ip6_packet(net, NFPROTO_IPV6, par->hooknum, skb, par->in,
+				  par->out, &li, loginfo->prefix);
 #endif
 	else
 		WARN_ON_ONCE(1);
@@ -901,75 +94,13 @@ static struct xt_target log_tg_regs[] __read_mostly = {
 #endif
 };
 
-static struct nf_logger ipt_log_logger __read_mostly = {
-	.name		= "ipt_LOG",
-	.type		= NF_LOG_TYPE_LOG,
-	.logfn		= &ipt_log_packet,
-	.me		= THIS_MODULE,
-};
-
-#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
-static struct nf_logger ip6t_log_logger __read_mostly = {
-	.name		= "ip6t_LOG",
-	.type		= NF_LOG_TYPE_LOG,
-	.logfn		= &ip6t_log_packet,
-	.me		= THIS_MODULE,
-};
-#endif
-
-static int __net_init log_net_init(struct net *net)
-{
-	nf_log_set(net, NFPROTO_IPV4, &ipt_log_logger);
-#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
-	nf_log_set(net, NFPROTO_IPV6, &ip6t_log_logger);
-#endif
-	return 0;
-}
-
-static void __net_exit log_net_exit(struct net *net)
-{
-	nf_log_unset(net, &ipt_log_logger);
-#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
-	nf_log_unset(net, &ip6t_log_logger);
-#endif
-}
-
-static struct pernet_operations log_net_ops = {
-	.init = log_net_init,
-	.exit = log_net_exit,
-};
-
 static int __init log_tg_init(void)
 {
-	int ret;
-
-	ret = register_pernet_subsys(&log_net_ops);
-	if (ret < 0)
-		goto err_pernet;
-
-	ret = xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs));
-	if (ret < 0)
-		goto err_target;
-
-	nf_log_register(NFPROTO_IPV4, &ipt_log_logger);
-#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
-	nf_log_register(NFPROTO_IPV6, &ip6t_log_logger);
-#endif
-	return 0;
-
-err_target:
-	unregister_pernet_subsys(&log_net_ops);
-err_pernet:
-	return ret;
+	return xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs));
 }
 
 static void __exit log_tg_exit(void)
 {
-	unregister_pernet_subsys(&log_net_ops);
-	nf_log_unregister(&ipt_log_logger);
-#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
-	nf_log_unregister(&ip6t_log_logger);
-#endif
 	xt_unregister_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs));
 }
 
-- 
cgit v1.2.3


From fab4085f4e248b8a80bb1dadbbacb2bacd8017c3 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Wed, 18 Jun 2014 19:38:25 +0200
Subject: netfilter: log: nf_log_packet() as real unified interface

Before this patch, the nf_loginfo parameter specified the logging
configuration in case the specified default logger was loaded. This
patch updates the semantics of the nf_loginfo parameter in
nf_log_packet() which now indicates the logger that you explicitly
want to use.

Thus, nf_log_packet() is exposed as an unified interface which
internally routes the log message to the corresponding logger type
by family.

The module dependencies are expressed by the new nf_logger_find_get()
and nf_logger_put() functions which bump the logger module refcount.
Thus, you can not remove logger modules that are used by rules anymore.

Another important effect of this change is that the family specific
module is only loaded when required. Therefore, xt_LOG and nft_log
will just trigger the autoload of the nf_log_{ip,ip6} modules
according to the family.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_log.h   | 20 ++++++--------------
 net/ipv4/netfilter/nf_log_ipv4.c | 14 +++++++-------
 net/ipv6/netfilter/nf_log_ipv6.c | 14 +++++++-------
 net/netfilter/nf_log.c           | 41 +++++++++++++++++++++++++++++++++++++++-
 net/netfilter/nfnetlink_log.c    |  3 +++
 net/netfilter/xt_LOG.c           | 22 ++++++++++-----------
 6 files changed, 73 insertions(+), 41 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h
index bba354e78f49..b82dd19b8f26 100644
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -61,6 +61,12 @@ int nf_log_bind_pf(struct net *net, u_int8_t pf,
 		   const struct nf_logger *logger);
 void nf_log_unbind_pf(struct net *net, u_int8_t pf);
 
+int nf_logger_find_get(int pf, enum nf_log_type type);
+void nf_logger_put(int pf, enum nf_log_type type);
+
+#define MODULE_ALIAS_NF_LOGGER(family, type) \
+	MODULE_ALIAS("nf-logger-" __stringify(family) "-" __stringify(type))
+
 /* Calls the registered backend logging function */
 __printf(8, 9)
 void nf_log_packet(struct net *net,
@@ -78,20 +84,6 @@ struct nf_log_buf *nf_log_buf_open(void);
 __printf(2, 3) int nf_log_buf_add(struct nf_log_buf *m, const char *f, ...);
 void nf_log_buf_close(struct nf_log_buf *m);
 
-void nf_log_ip_packet(struct net *net, u_int8_t pf,
-		      unsigned int hooknum, const struct sk_buff *skb,
-		      const struct net_device *in,
-		      const struct net_device *out,
-		      const struct nf_loginfo *loginfo,
-		      const char *prefix);
-
-void nf_log_ip6_packet(struct net *net, u_int8_t pf,
-		       unsigned int hooknum, const struct sk_buff *skb,
-		       const struct net_device *in,
-		       const struct net_device *out,
-		       const struct nf_loginfo *loginfo,
-		       const char *prefix);
-
 /* common logging functions */
 int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb,
 			   u8 proto, int fragment, unsigned int offset);
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 7e69a401a29e..078bdca1b607 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -306,12 +306,12 @@ fallback:
 	nf_log_buf_add(m, " ");
 }
 
-void nf_log_ip_packet(struct net *net, u_int8_t pf,
-		      unsigned int hooknum, const struct sk_buff *skb,
-		      const struct net_device *in,
-		      const struct net_device *out,
-		      const struct nf_loginfo *loginfo,
-		      const char *prefix)
+static void nf_log_ip_packet(struct net *net, u_int8_t pf,
+			     unsigned int hooknum, const struct sk_buff *skb,
+			     const struct net_device *in,
+			     const struct net_device *out,
+			     const struct nf_loginfo *loginfo,
+			     const char *prefix)
 {
 	struct nf_log_buf *m;
 
@@ -334,7 +334,6 @@ void nf_log_ip_packet(struct net *net, u_int8_t pf,
 
 	nf_log_buf_close(m);
 }
-EXPORT_SYMBOL_GPL(nf_log_ip_packet);
 
 static struct nf_logger nf_ip_logger __read_mostly = {
 	.name		= "nf_log_ipv4",
@@ -383,3 +382,4 @@ module_exit(nf_log_ipv4_exit);
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("Netfilter IPv4 packet logging");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS_NF_LOGGER(AF_INET, 0);
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index 804060946d2b..7b17a0be93e7 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -338,12 +338,12 @@ fallback:
 	}
 }
 
-void nf_log_ip6_packet(struct net *net, u_int8_t pf,
-		       unsigned int hooknum, const struct sk_buff *skb,
-		       const struct net_device *in,
-		       const struct net_device *out,
-		       const struct nf_loginfo *loginfo,
-		       const char *prefix)
+static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
+			      unsigned int hooknum, const struct sk_buff *skb,
+			      const struct net_device *in,
+			      const struct net_device *out,
+			      const struct nf_loginfo *loginfo,
+			      const char *prefix)
 {
 	struct nf_log_buf *m;
 
@@ -366,7 +366,6 @@ void nf_log_ip6_packet(struct net *net, u_int8_t pf,
 
 	nf_log_buf_close(m);
 }
-EXPORT_SYMBOL_GPL(nf_log_ip6_packet);
 
 static struct nf_logger nf_ip6_logger __read_mostly = {
 	.name		= "nf_log_ipv6",
@@ -415,3 +414,4 @@ module_exit(nf_log_ipv6_exit);
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("Netfilter IPv4 packet logging");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS_NF_LOGGER(AF_INET6, 0);
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 0b6b2c874199..0b2161c689e0 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -132,6 +132,41 @@ void nf_log_unbind_pf(struct net *net, u_int8_t pf)
 }
 EXPORT_SYMBOL(nf_log_unbind_pf);
 
+int nf_logger_find_get(int pf, enum nf_log_type type)
+{
+	struct nf_logger *logger;
+	int ret = -ENOENT;
+
+	logger = loggers[pf][type];
+	if (logger == NULL)
+		request_module("nf-logger-%u-%u", pf, type);
+
+	rcu_read_lock();
+	logger = rcu_dereference(loggers[pf][type]);
+	if (logger == NULL)
+		goto out;
+
+	if (logger && try_module_get(logger->me))
+		ret = 0;
+out:
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nf_logger_find_get);
+
+void nf_logger_put(int pf, enum nf_log_type type)
+{
+	struct nf_logger *logger;
+
+	BUG_ON(loggers[pf][type] == NULL);
+
+	rcu_read_lock();
+	logger = rcu_dereference(loggers[pf][type]);
+	module_put(logger->me);
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_logger_put);
+
 void nf_log_packet(struct net *net,
 		   u_int8_t pf,
 		   unsigned int hooknum,
@@ -146,7 +181,11 @@ void nf_log_packet(struct net *net,
 	const struct nf_logger *logger;
 
 	rcu_read_lock();
-	logger = rcu_dereference(net->nf.nf_loggers[pf]);
+	if (loginfo != NULL)
+		logger = rcu_dereference(loggers[pf][loginfo->type]);
+	else
+		logger = rcu_dereference(net->nf.nf_loggers[pf]);
+
 	if (logger) {
 		va_start(args, fmt);
 		vsnprintf(prefix, sizeof(prefix), fmt, args);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 160bb8ea9923..a11c5ff2f720 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -1106,6 +1106,9 @@ MODULE_DESCRIPTION("netfilter userspace logging");
 MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ULOG);
+MODULE_ALIAS_NF_LOGGER(AF_INET, 1);
+MODULE_ALIAS_NF_LOGGER(AF_INET6, 1);
+MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 1);
 
 module_init(nfnetlink_log_init);
 module_exit(nfnetlink_log_fini);
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
index 5a6bd60e20d6..00eb49196e75 100644
--- a/net/netfilter/xt_LOG.c
+++ b/net/netfilter/xt_LOG.c
@@ -39,17 +39,8 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	li.u.log.level = loginfo->level;
 	li.u.log.logflags = loginfo->logflags;
 
-	if (par->family == NFPROTO_IPV4)
-		nf_log_ip_packet(net, NFPROTO_IPV4, par->hooknum, skb, par->in,
-			         par->out, &li, loginfo->prefix);
-#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
-	else if (par->family == NFPROTO_IPV6)
-		nf_log_ip6_packet(net, NFPROTO_IPV6, par->hooknum, skb, par->in,
-				  par->out, &li, loginfo->prefix);
-#endif
-	else
-		WARN_ON_ONCE(1);
-
+	nf_log_packet(net, par->family, par->hooknum, skb, par->in, par->out,
+		      &li, loginfo->prefix);
 	return XT_CONTINUE;
 }
 
@@ -70,7 +61,12 @@ static int log_tg_check(const struct xt_tgchk_param *par)
 		return -EINVAL;
 	}
 
-	return 0;
+	return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
+}
+
+static void log_tg_destroy(const struct xt_tgdtor_param *par)
+{
+	nf_logger_put(par->family, NF_LOG_TYPE_LOG);
 }
 
 static struct xt_target log_tg_regs[] __read_mostly = {
@@ -80,6 +76,7 @@ static struct xt_target log_tg_regs[] __read_mostly = {
 		.target		= log_tg,
 		.targetsize	= sizeof(struct xt_log_info),
 		.checkentry	= log_tg_check,
+		.destroy	= log_tg_destroy,
 		.me		= THIS_MODULE,
 	},
 #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
@@ -89,6 +86,7 @@ static struct xt_target log_tg_regs[] __read_mostly = {
 		.target		= log_tg,
 		.targetsize	= sizeof(struct xt_log_info),
 		.checkentry	= log_tg_check,
+		.destroy	= log_tg_destroy,
 		.me		= THIS_MODULE,
 	},
 #endif
-- 
cgit v1.2.3


From 35b9395104d51f4b85847fa72a1bf4136d36c56e Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Fri, 20 Jun 2014 20:36:23 +0200
Subject: netfilter: add generic ARP packet logger

This adds the generic plain text packet loggger for ARP packets. It is
based on the ebt_log code. Nevertheless, the output has been modified
to make it consistent with the original xt_LOG output.

This is an example output:

IN=wlan0 OUT= ARP HTYPE=1 PTYPE=0x0800 OPCODE=2 MACSRC=00:ab:12:34:55:63 IPSRC=192.168.10.1 MACDST=80:09:12:70:4f:50 IPDST=192.168.10.150

This patch enables packet logging from ARP chains, eg.

  nft add rule arp filter input log prefix "input: "

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/Kconfig      |   5 ++
 net/ipv4/netfilter/Makefile     |   1 +
 net/ipv4/netfilter/nf_log_arp.c | 149 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 155 insertions(+)
 create mode 100644 net/ipv4/netfilter/nf_log_arp.c

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 0c980293d225..12e6057daea1 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -159,6 +159,11 @@ config IP_NF_TARGET_SYNPROXY
 
 	  To compile it as a module, choose M here. If unsure, say N.
 
+config NF_LOG_ARP
+	tristate "ARP packet logging"
+	default m if NETFILTER_ADVANCED=n
+	select NF_LOG_INET
+
 config NF_LOG_IPV4
 	tristate "IPv4 packet logging"
 	default m if NETFILTER_ADVANCED=n
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 730e0c1ead90..245db9df3337 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
 obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
 
 # logging
+obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
 obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o
 
 # NAT helpers (nf_conntrack)
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
new file mode 100644
index 000000000000..ccfc78db12ee
--- /dev/null
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -0,0 +1,149 @@
+/*
+ * (C) 2014 by Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * Based on code from ebt_log from:
+ *
+ * Bart De Schuymer <bdschuym@pandora.be>
+ * Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/ip.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter/xt_LOG.h>
+#include <net/netfilter/nf_log.h>
+
+static struct nf_loginfo default_loginfo = {
+	.type	= NF_LOG_TYPE_LOG,
+	.u = {
+		.log = {
+			.level	  = 5,
+			.logflags = NF_LOG_MASK,
+		},
+	},
+};
+
+struct arppayload {
+	unsigned char mac_src[ETH_ALEN];
+	unsigned char ip_src[4];
+	unsigned char mac_dst[ETH_ALEN];
+	unsigned char ip_dst[4];
+};
+
+static void dump_arp_packet(struct nf_log_buf *m,
+			    const struct nf_loginfo *info,
+			    const struct sk_buff *skb, unsigned int nhoff)
+{
+	const struct arphdr *ah;
+	struct arphdr _arph;
+	const struct arppayload *ap;
+	struct arppayload _arpp;
+
+	ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
+	if (ah == NULL) {
+		nf_log_buf_add(m, "TRUNCATED");
+		return;
+	}
+	nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d",
+		       ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op));
+
+	/* If it's for Ethernet and the lengths are OK, then log the ARP
+	 * payload.
+	 */
+	if (ah->ar_hrd != htons(1) ||
+	    ah->ar_hln != ETH_ALEN ||
+	    ah->ar_pln != sizeof(__be32))
+		return;
+
+	ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp);
+	if (ap == NULL) {
+		nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]",
+			       skb->len - sizeof(_arph));
+		return;
+	}
+	nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4",
+		       ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst);
+}
+
+void nf_log_arp_packet(struct net *net, u_int8_t pf,
+		      unsigned int hooknum, const struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      const struct nf_loginfo *loginfo,
+		      const char *prefix)
+{
+	struct nf_log_buf *m;
+
+	/* FIXME: Disabled from containers until syslog ns is supported */
+	if (!net_eq(net, &init_net))
+		return;
+
+	m = nf_log_buf_open();
+
+	if (!loginfo)
+		loginfo = &default_loginfo;
+
+	nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
+				  prefix);
+	dump_arp_packet(m, loginfo, skb, 0);
+
+	nf_log_buf_close(m);
+}
+
+static struct nf_logger nf_arp_logger __read_mostly = {
+	.name		= "nf_log_arp",
+	.type		= NF_LOG_TYPE_LOG,
+	.logfn		= nf_log_arp_packet,
+	.me		= THIS_MODULE,
+};
+
+static int __net_init nf_log_arp_net_init(struct net *net)
+{
+	nf_log_set(net, NFPROTO_ARP, &nf_arp_logger);
+	return 0;
+}
+
+static void __net_exit nf_log_arp_net_exit(struct net *net)
+{
+	nf_log_unset(net, &nf_arp_logger);
+}
+
+static struct pernet_operations nf_log_arp_net_ops = {
+	.init = nf_log_arp_net_init,
+	.exit = nf_log_arp_net_exit,
+};
+
+static int __init nf_log_arp_init(void)
+{
+	int ret;
+
+	ret = register_pernet_subsys(&nf_log_arp_net_ops);
+	if (ret < 0)
+		return ret;
+
+	nf_log_register(NFPROTO_ARP, &nf_arp_logger);
+	return 0;
+}
+
+static void __exit nf_log_arp_exit(void)
+{
+	unregister_pernet_subsys(&nf_log_arp_net_ops);
+	nf_log_unregister(&nf_arp_logger);
+}
+
+module_init(nf_log_arp_init);
+module_exit(nf_log_arp_exit);
+
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_DESCRIPTION("Netfilter ARP packet logging");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NF_LOGGER(3, 0);
-- 
cgit v1.2.3


From 57b47553f65e12e2e4f1608168374b0e651de843 Mon Sep 17 00:00:00 2001
From: Octavian Purdila
Date: Wed, 25 Jun 2014 17:09:50 +0300
Subject: tcp: cookie_v4_init_sequence: skb should be const

Signed-off-by: Octavian Purdila <octavian.purdila@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     | 5 +++--
 net/ipv4/syncookies.c | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 06a3023080c0..39e47c4e4f19 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -493,10 +493,11 @@ static inline u32 tcp_cookie_time(void)
 
 u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
 			      u16 *mssp);
-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mss);
+__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
+			      __u16 *mss);
 #else
 static inline __u32 cookie_v4_init_sequence(struct sock *sk,
-					    struct sk_buff *skb,
+					    const struct sk_buff *skb,
 					    __u16 *mss)
 {
 	return 0;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index c86624b36a62..c0c75688896e 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -170,7 +170,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
 }
 EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
 
-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
+__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
+			      __u16 *mssp)
 {
 	const struct iphdr *iph = ip_hdr(skb);
 	const struct tcphdr *th = tcp_hdr(skb);
-- 
cgit v1.2.3


From aa27fc501850030fb5d1ee705feb836ee6a21f2a Mon Sep 17 00:00:00 2001
From: Octavian Purdila
Date: Wed, 25 Jun 2014 17:09:51 +0300
Subject: tcp: tcp_v[46]_conn_request: fix snt_synack initialization

Commit 016818d07 (tcp: TCP Fast Open Server - take SYNACK RTT after
completing 3WHS) changes the code to only take a snt_synack timestamp
when a SYNACK transmit or retransmit succeeds. This behaviour is later
broken by commit 843f4a55e (tcp: use tcp_v4_send_synack on first
SYN-ACK), as snt_synack is now updated even if tcp_v4_send_synack
fails.

Also, commit 3a19ce0ee (tcp: IPv6 support for fastopen server) misses
the required IPv6 updates for 016818d07.

This patch makes sure that snt_synack is updated only when the SYNACK
trasnmit/retransmit succeeds, for both IPv4 and IPv6.

Cc: Cardwell <ncardwell@google.com>
Cc: Daniel Lee <longinus00@gmail.com>
Cc: Yuchung Cheng <ycheng@google.com>

Signed-off-by: Octavian Purdila <octavian.purdila@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 2 --
 net/ipv6/tcp_ipv6.c | 3 ++-
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 180336d47df6..145f6402c560 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1370,7 +1370,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		goto drop_and_free;
 
 	tcp_rsk(req)->snt_isn = isn;
-	tcp_rsk(req)->snt_synack = tcp_time_stamp;
 	tcp_openreq_init_rwin(req, sk, dst);
 	fastopen = !want_cookie &&
 		   tcp_try_fastopen(sk, skb, req, &foc, dst);
@@ -1380,7 +1379,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		if (err || want_cookie)
 			goto drop_and_free;
 
-		tcp_rsk(req)->snt_synack = tcp_time_stamp;
 		tcp_rsk(req)->listener = NULL;
 		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
 	}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 08ae3da0db4a..a962455471ba 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -497,6 +497,8 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
 		skb_set_queue_mapping(skb, queue_mapping);
 		err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass);
 		err = net_xmit_eval(err);
+		if (!tcp_rsk(req)->snt_synack && !err)
+			tcp_rsk(req)->snt_synack = tcp_time_stamp;
 	}
 
 done:
@@ -1100,7 +1102,6 @@ have_isn:
 		goto drop_and_free;
 
 	tcp_rsk(req)->snt_isn = isn;
-	tcp_rsk(req)->snt_synack = tcp_time_stamp;
 	tcp_openreq_init_rwin(req, sk, dst);
 	fastopen = !want_cookie &&
 		   tcp_try_fastopen(sk, skb, req, &foc, dst);
-- 
cgit v1.2.3


From 16bea70aa7302b6f3bf3502d5a0efb4ea2ce4712 Mon Sep 17 00:00:00 2001
From: Octavian Purdila
Date: Wed, 25 Jun 2014 17:09:53 +0300
Subject: tcp: add init_req method to tcp_request_sock_ops

Move the specific IPv4/IPv6 intializations to a new method in
tcp_request_sock_ops in preparation for unifying tcp_v4_conn_request
and tcp_v6_conn_request.

Signed-off-by: Octavian Purdila <octavian.purdila@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h |  3 ---
 include/net/tcp.h   |  2 ++
 net/ipv4/tcp_ipv4.c | 29 ++++++++++++++++------------
 net/ipv6/tcp_ipv6.c | 55 +++++++++++++++++++++++++++++++----------------------
 4 files changed, 51 insertions(+), 38 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index a0513210798f..fa5258f322e7 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -111,10 +111,7 @@ struct tcp_request_sock_ops;
 
 struct tcp_request_sock {
 	struct inet_request_sock 	req;
-#ifdef CONFIG_TCP_MD5SIG
-	/* Only used by TCP MD5 Signature so far. */
 	const struct tcp_request_sock_ops *af_specific;
-#endif
 	struct sock			*listener; /* needed for TFO */
 	u32				rcv_isn;
 	u32				snt_isn;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 39e47c4e4f19..7ad8ce296c3b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1613,6 +1613,8 @@ struct tcp_request_sock_ops {
 						  const struct request_sock *req,
 						  const struct sk_buff *skb);
 #endif
+	void (*init_req)(struct request_sock *req, struct sock *sk,
+			 struct sk_buff *skb);
 };
 
 int tcpv4_offload_init(void);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 145f6402c560..f86a86b30d20 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1237,6 +1237,17 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
 
 #endif
 
+static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
+			    struct sk_buff *skb)
+{
+	struct inet_request_sock *ireq = inet_rsk(req);
+
+	ireq->ir_loc_addr = ip_hdr(skb)->daddr;
+	ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
+	ireq->no_srccheck = inet_sk(sk)->transparent;
+	ireq->opt = tcp_v4_save_options(skb);
+}
+
 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
 	.family		=	PF_INET,
 	.obj_size	=	sizeof(struct tcp_request_sock),
@@ -1247,26 +1258,26 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = {
 	.syn_ack_timeout = 	tcp_syn_ack_timeout,
 };
 
-#ifdef CONFIG_TCP_MD5SIG
 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+#ifdef CONFIG_TCP_MD5SIG
 	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
-};
 #endif
+	.init_req	=	tcp_v4_init_req,
+};
 
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_options_received tmp_opt;
 	struct request_sock *req;
-	struct inet_request_sock *ireq;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct dst_entry *dst = NULL;
 	__be32 saddr = ip_hdr(skb)->saddr;
-	__be32 daddr = ip_hdr(skb)->daddr;
 	__u32 isn = TCP_SKB_CB(skb)->when;
 	bool want_cookie = false, fastopen;
 	struct flowi4 fl4;
 	struct tcp_fastopen_cookie foc = { .len = -1 };
+	const struct tcp_request_sock_ops *af_ops;
 	int err;
 
 	/* Never answer to SYNs send to broadcast or multicast */
@@ -1298,9 +1309,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (!req)
 		goto drop;
 
-#ifdef CONFIG_TCP_MD5SIG
-	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
-#endif
+	af_ops = tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
 
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
@@ -1313,11 +1322,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
 	tcp_openreq_init(req, &tmp_opt, skb, sk);
 
-	ireq = inet_rsk(req);
-	ireq->ir_loc_addr = daddr;
-	ireq->ir_rmt_addr = saddr;
-	ireq->no_srccheck = inet_sk(sk)->transparent;
-	ireq->opt = tcp_v4_save_options(skb);
+	af_ops->init_req(req, sk, skb);
 
 	if (security_inet_conn_request(sk, skb, req))
 		goto drop_and_free;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5e2d7e655c0f..87a360c3eba9 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -720,6 +720,31 @@ static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
 }
 #endif
 
+static void tcp_v6_init_req(struct request_sock *req, struct sock *sk,
+			    struct sk_buff *skb)
+{
+	struct inet_request_sock *ireq = inet_rsk(req);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+
+	ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
+	ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
+
+	ireq->ir_iif = sk->sk_bound_dev_if;
+
+	/* So that link locals have meaning */
+	if (!sk->sk_bound_dev_if &&
+	    ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
+		ireq->ir_iif = inet6_iif(skb);
+
+	if (!TCP_SKB_CB(skb)->when &&
+	    (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo ||
+	     np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim ||
+	     np->rxopt.bits.rxohlim || np->repflow)) {
+		atomic_inc(&skb->users);
+		ireq->pktopts = skb;
+	}
+}
+
 struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
 	.family		=	AF_INET6,
 	.obj_size	=	sizeof(struct tcp6_request_sock),
@@ -730,12 +755,13 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
 	.syn_ack_timeout =	tcp_syn_ack_timeout,
 };
 
-#ifdef CONFIG_TCP_MD5SIG
 static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
+#ifdef CONFIG_TCP_MD5SIG
 	.md5_lookup	=	tcp_v6_reqsk_md5_lookup,
 	.calc_md5_hash	=	tcp_v6_md5_hash_skb,
-};
 #endif
+	.init_req	=	tcp_v6_init_req,
+};
 
 static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
 				 u32 tsval, u32 tsecr, int oif,
@@ -983,13 +1009,13 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	struct tcp_options_received tmp_opt;
 	struct request_sock *req;
 	struct inet_request_sock *ireq;
-	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u32 isn = TCP_SKB_CB(skb)->when;
 	struct dst_entry *dst = NULL;
 	struct tcp_fastopen_cookie foc = { .len = -1 };
 	bool want_cookie = false, fastopen;
 	struct flowi6 fl6;
+	const struct tcp_request_sock_ops *af_ops;
 	int err;
 
 	if (skb->protocol == htons(ETH_P_IP))
@@ -1014,9 +1040,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	if (req == NULL)
 		goto drop;
 
-#ifdef CONFIG_TCP_MD5SIG
-	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
-#endif
+	af_ops = tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
 
 	tcp_clear_options(&tmp_opt);
 	tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
@@ -1030,27 +1054,12 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_openreq_init(req, &tmp_opt, skb, sk);
 
 	ireq = inet_rsk(req);
-	ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
-	ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
+	af_ops->init_req(req, sk, skb);
+
 	if (!want_cookie || tmp_opt.tstamp_ok)
 		TCP_ECN_create_request(req, skb, sock_net(sk));
 
-	ireq->ir_iif = sk->sk_bound_dev_if;
-
-	/* So that link locals have meaning */
-	if (!sk->sk_bound_dev_if &&
-	    ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
-		ireq->ir_iif = inet6_iif(skb);
-
 	if (!isn) {
-		if (ipv6_opt_accepted(sk, skb) ||
-		    np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
-		    np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim ||
-		    np->repflow) {
-			atomic_inc(&skb->users);
-			ireq->pktopts = skb;
-		}
-
 		if (want_cookie) {
 			isn = cookie_v6_init_sequence(sk, skb, &req->mss);
 			req->cookie_ts = tmp_opt.tstamp_ok;
-- 
cgit v1.2.3


From fb7b37a7f3d6f7b7ba05ee526fee96810d5b92a8 Mon Sep 17 00:00:00 2001
From: Octavian Purdila
Date: Wed, 25 Jun 2014 17:09:54 +0300
Subject: tcp: add init_cookie_seq method to tcp_request_sock_ops

Move the specific IPv4/IPv6 cookie sequence initialization to a new
method in tcp_request_sock_ops in preparation for unifying
tcp_v4_conn_request and tcp_v6_conn_request.

Signed-off-by: Octavian Purdila <octavian.purdila@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h   | 34 ++++++++++++++++++++--------------
 net/ipv4/tcp_ipv4.c |  5 ++++-
 net/ipv6/tcp_ipv6.c |  5 ++++-
 3 files changed, 28 insertions(+), 16 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 7ad8ce296c3b..086d00ec6d8b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -495,13 +495,6 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
 			      u16 *mssp);
 __u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
 			      __u16 *mss);
-#else
-static inline __u32 cookie_v4_init_sequence(struct sock *sk,
-					    const struct sk_buff *skb,
-					    __u16 *mss)
-{
-	return 0;
-}
 #endif
 
 __u32 cookie_init_timestamp(struct request_sock *req);
@@ -517,13 +510,6 @@ u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
 			      const struct tcphdr *th, u16 *mssp);
 __u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb,
 			      __u16 *mss);
-#else
-static inline __u32 cookie_v6_init_sequence(struct sock *sk,
-					    struct sk_buff *skb,
-					    __u16 *mss)
-{
-	return 0;
-}
 #endif
 /* tcp_output.c */
 
@@ -1615,8 +1601,28 @@ struct tcp_request_sock_ops {
 #endif
 	void (*init_req)(struct request_sock *req, struct sock *sk,
 			 struct sk_buff *skb);
+#ifdef CONFIG_SYN_COOKIES
+	__u32 (*cookie_init_seq)(struct sock *sk, const struct sk_buff *skb,
+				 __u16 *mss);
+#endif
 };
 
+#ifdef CONFIG_SYN_COOKIES
+static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
+					 struct sock *sk, struct sk_buff *skb,
+					 __u16 *mss)
+{
+	return ops->cookie_init_seq(sk, skb, mss);
+}
+#else
+static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
+					 struct sock *sk, struct sk_buff *skb,
+					 __u16 *mss)
+{
+	return 0;
+}
+#endif
+
 int tcpv4_offload_init(void);
 
 void tcp_v4_init(void);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f86a86b30d20..8c69e44c287b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1264,6 +1264,9 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
 #endif
 	.init_req	=	tcp_v4_init_req,
+#ifdef CONFIG_SYN_COOKIES
+	.cookie_init_seq =	cookie_v4_init_sequence,
+#endif
 };
 
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
@@ -1331,7 +1334,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		TCP_ECN_create_request(req, skb, sock_net(sk));
 
 	if (want_cookie) {
-		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
+		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
 		req->cookie_ts = tmp_opt.tstamp_ok;
 	} else if (!isn) {
 		/* VJ's idea. We save last timestamp seen
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 87a360c3eba9..17710cffddaa 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -761,6 +761,9 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
 	.calc_md5_hash	=	tcp_v6_md5_hash_skb,
 #endif
 	.init_req	=	tcp_v6_init_req,
+#ifdef CONFIG_SYN_COOKIES
+	.cookie_init_seq =	cookie_v6_init_sequence,
+#endif
 };
 
 static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
@@ -1061,7 +1064,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 
 	if (!isn) {
 		if (want_cookie) {
-			isn = cookie_v6_init_sequence(sk, skb, &req->mss);
+			isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
 			req->cookie_ts = tmp_opt.tstamp_ok;
 			goto have_isn;
 		}
-- 
cgit v1.2.3


From d94e0417ad8d96d7d96b69335338ad942eaeecf1 Mon Sep 17 00:00:00 2001
From: Octavian Purdila
Date: Wed, 25 Jun 2014 17:09:55 +0300
Subject: tcp: add route_req method to tcp_request_sock_ops

Create wrappers with same signature for the IPv4/IPv6 request routing
calls and use these wrappers (via route_req method from
tcp_request_sock_ops) in tcp_v4_conn_request and tcp_v6_conn_request
with the purpose of unifying the two functions in a later patch.

We can later drop the wrapper functions and modify inet_csk_route_req
and inet6_cks_route_req to use the same signature.

Signed-off-by: Octavian Purdila <octavian.purdila@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h   |  3 +++
 net/ipv4/tcp_ipv4.c | 36 +++++++++++++++++++++++++++++-------
 net/ipv6/tcp_ipv6.c | 26 ++++++++++++++++++++------
 3 files changed, 52 insertions(+), 13 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 086d00ec6d8b..59fcc5934c79 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1605,6 +1605,9 @@ struct tcp_request_sock_ops {
 	__u32 (*cookie_init_seq)(struct sock *sk, const struct sk_buff *skb,
 				 __u16 *mss);
 #endif
+	struct dst_entry *(*route_req)(struct sock *sk, struct flowi *fl,
+				       const struct request_sock *req,
+				       bool *strict);
 };
 
 #ifdef CONFIG_SYN_COOKIES
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 8c69e44c287b..54fbbd8b4fcd 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1248,6 +1248,22 @@ static void tcp_v4_init_req(struct request_sock *req, struct sock *sk,
 	ireq->opt = tcp_v4_save_options(skb);
 }
 
+static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
+					  const struct request_sock *req,
+					  bool *strict)
+{
+	struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
+
+	if (strict) {
+		if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
+			*strict = true;
+		else
+			*strict = false;
+	}
+
+	return dst;
+}
+
 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
 	.family		=	PF_INET,
 	.obj_size	=	sizeof(struct tcp_request_sock),
@@ -1267,6 +1283,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 #ifdef CONFIG_SYN_COOKIES
 	.cookie_init_seq =	cookie_v4_init_sequence,
 #endif
+	.route_req	=	tcp_v4_route_req,
 };
 
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
@@ -1346,11 +1363,13 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 		 * timewait bucket, so that all the necessary checks
 		 * are made in the function processing timewait state.
 		 */
-		if (tmp_opt.saw_tstamp &&
-		    tcp_death_row.sysctl_tw_recycle &&
-		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
-		    fl4.daddr == saddr) {
-			if (!tcp_peer_is_proven(req, dst, true)) {
+		if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) {
+			bool strict;
+
+			dst = af_ops->route_req(sk, (struct flowi *)&fl4, req,
+						&strict);
+			if (dst && strict &&
+			    !tcp_peer_is_proven(req, dst, true)) {
 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
 				goto drop_and_release;
 			}
@@ -1374,8 +1393,11 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 
 		isn = tcp_v4_init_sequence(skb);
 	}
-	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
-		goto drop_and_free;
+	if (!dst) {
+		dst = af_ops->route_req(sk, (struct flowi *)&fl4, req, NULL);
+		if (!dst)
+			goto drop_and_free;
+	}
 
 	tcp_rsk(req)->snt_isn = isn;
 	tcp_openreq_init_rwin(req, sk, dst);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 17710cffddaa..d780d8808566 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -745,6 +745,16 @@ static void tcp_v6_init_req(struct request_sock *req, struct sock *sk,
 	}
 }
 
+static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl,
+					  const struct request_sock *req,
+					  bool *strict)
+{
+	if (strict)
+		*strict = true;
+	return inet6_csk_route_req(sk, &fl->u.ip6, req);
+}
+
+
 struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
 	.family		=	AF_INET6,
 	.obj_size	=	sizeof(struct tcp6_request_sock),
@@ -764,6 +774,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
 #ifdef CONFIG_SYN_COOKIES
 	.cookie_init_seq =	cookie_v6_init_sequence,
 #endif
+	.route_req	=	tcp_v6_route_req,
 };
 
 static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
@@ -1078,10 +1089,10 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 		 * timewait bucket, so that all the necessary checks
 		 * are made in the function processing timewait state.
 		 */
-		if (tmp_opt.saw_tstamp &&
-		    tcp_death_row.sysctl_tw_recycle &&
-		    (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) {
-			if (!tcp_peer_is_proven(req, dst, true)) {
+		if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) {
+			dst = af_ops->route_req(sk, (struct flowi *)&fl6, req,
+						NULL);
+			if (dst && !tcp_peer_is_proven(req, dst, true)) {
 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
 				goto drop_and_release;
 			}
@@ -1110,8 +1121,11 @@ have_isn:
 	if (security_inet_conn_request(sk, skb, req))
 		goto drop_and_release;
 
-	if (!dst && (dst = inet6_csk_route_req(sk, &fl6, req)) == NULL)
-		goto drop_and_free;
+	if (!dst) {
+		dst = af_ops->route_req(sk, (struct flowi *)&fl6, req, NULL);
+		if (!dst)
+			goto drop_and_free;
+	}
 
 	tcp_rsk(req)->snt_isn = isn;
 	tcp_openreq_init_rwin(req, sk, dst);
-- 
cgit v1.2.3


From 936b8bdb53f90840e658904530f9db8d02ac804b Mon Sep 17 00:00:00 2001
From: Octavian Purdila
Date: Wed, 25 Jun 2014 17:09:57 +0300
Subject: tcp: add init_seq method to tcp_request_sock_ops

More work in preparation of unifying tcp_v4_conn_request and
tcp_v6_conn_request: indirect the init sequence calls via the
tcp_request_sock_ops.

Signed-off-by: Octavian Purdila <octavian.purdila@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h   | 1 +
 net/ipv4/tcp_ipv4.c | 5 +++--
 net/ipv6/tcp_ipv6.c | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 59fcc5934c79..8cacf0d6ed4d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1608,6 +1608,7 @@ struct tcp_request_sock_ops {
 	struct dst_entry *(*route_req)(struct sock *sk, struct flowi *fl,
 				       const struct request_sock *req,
 				       bool *strict);
+	__u32 (*init_seq)(const struct sk_buff *skb);
 };
 
 #ifdef CONFIG_SYN_COOKIES
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 54fbbd8b4fcd..43478265006a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -99,7 +99,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
 struct inet_hashinfo tcp_hashinfo;
 EXPORT_SYMBOL(tcp_hashinfo);
 
-static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
+static  __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
 {
 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 					  ip_hdr(skb)->saddr,
@@ -1284,6 +1284,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 	.cookie_init_seq =	cookie_v4_init_sequence,
 #endif
 	.route_req	=	tcp_v4_route_req,
+	.init_seq	=	tcp_v4_init_sequence,
 };
 
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
@@ -1391,7 +1392,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 			goto drop_and_release;
 		}
 
-		isn = tcp_v4_init_sequence(skb);
+		isn = af_ops->init_seq(skb);
 	}
 	if (!dst) {
 		dst = af_ops->route_req(sk, (struct flowi *)&fl4, req, NULL);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 91b8a2e699f3..2fd886fe8340 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -775,6 +775,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
 	.cookie_init_seq =	cookie_v6_init_sequence,
 #endif
 	.route_req	=	tcp_v6_route_req,
+	.init_seq	=	tcp_v6_init_sequence,
 };
 
 static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
@@ -1114,7 +1115,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 			goto drop_and_release;
 		}
 
-		isn = tcp_v6_init_sequence(skb);
+		isn = af_ops->init_seq(skb);
 	}
 
 	if (!dst) {
-- 
cgit v1.2.3


From d6274bd8d6ea84b7b54cc1c3fde6bcb6143b104f Mon Sep 17 00:00:00 2001
From: Octavian Purdila
Date: Wed, 25 Jun 2014 17:09:58 +0300
Subject: tcp: add send_synack method to tcp_request_sock_ops

Create a new tcp_request_sock_ops method to unify the IPv4/IPv6
signature for tcp_v[46]_send_synack. This allows us to later unify
tcp_v4_rtx_synack with tcp_v6_rtx_synack and tcp_v4_conn_request with
tcp_v4_conn_request.

Signed-off-by: Octavian Purdila <octavian.purdila@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h   |  3 +++
 net/ipv4/tcp_ipv4.c |  9 ++++++---
 net/ipv6/tcp_ipv6.c | 14 ++++++++------
 3 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 8cacf0d6ed4d..8c05c25018d5 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1609,6 +1609,9 @@ struct tcp_request_sock_ops {
 				       const struct request_sock *req,
 				       bool *strict);
 	__u32 (*init_seq)(const struct sk_buff *skb);
+	int (*send_synack)(struct sock *sk, struct dst_entry *dst,
+			   struct flowi *fl, struct request_sock *req,
+			   u16 queue_mapping, struct tcp_fastopen_cookie *foc);
 };
 
 #ifdef CONFIG_SYN_COOKIES
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 43478265006a..b5945ac50876 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -814,6 +814,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
  *	socket.
  */
 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+			      struct flowi *fl,
 			      struct request_sock *req,
 			      u16 queue_mapping,
 			      struct tcp_fastopen_cookie *foc)
@@ -846,7 +847,8 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 
 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
 {
-	int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL);
+	const struct  tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
+	int res = af_ops->send_synack(sk, NULL, NULL, req, 0, NULL);
 
 	if (!res) {
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
@@ -1285,6 +1287,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 #endif
 	.route_req	=	tcp_v4_route_req,
 	.init_seq	=	tcp_v4_init_sequence,
+	.send_synack	=	tcp_v4_send_synack,
 };
 
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
@@ -1404,8 +1407,8 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_openreq_init_rwin(req, sk, dst);
 	fastopen = !want_cookie &&
 		   tcp_try_fastopen(sk, skb, req, &foc, dst);
-	err = tcp_v4_send_synack(sk, dst, req,
-				 skb_get_queue_mapping(skb), &foc);
+	err = af_ops->send_synack(sk, dst, NULL, req,
+				  skb_get_queue_mapping(skb), &foc);
 	if (!fastopen) {
 		if (err || want_cookie)
 			goto drop_and_free;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2fd886fe8340..210b6105afed 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -470,13 +470,14 @@ out:
 
 
 static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
-			      struct flowi6 *fl6,
+			      struct flowi *fl,
 			      struct request_sock *req,
 			      u16 queue_mapping,
 			      struct tcp_fastopen_cookie *foc)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct flowi6 *fl6 = &fl->u.ip6;
 	struct sk_buff *skb;
 	int err = -ENOMEM;
 
@@ -507,10 +508,11 @@ done:
 
 static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
 {
-	struct flowi6 fl6;
+	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
+	struct flowi fl;
 	int res;
 
-	res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0, NULL);
+	res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
 	if (!res) {
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
@@ -754,7 +756,6 @@ static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl,
 	return inet6_csk_route_req(sk, &fl->u.ip6, req);
 }
 
-
 struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
 	.family		=	AF_INET6,
 	.obj_size	=	sizeof(struct tcp6_request_sock),
@@ -776,6 +777,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
 #endif
 	.route_req	=	tcp_v6_route_req,
 	.init_seq	=	tcp_v6_init_sequence,
+	.send_synack	=	tcp_v6_send_synack,
 };
 
 static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
@@ -1128,8 +1130,8 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	tcp_openreq_init_rwin(req, sk, dst);
 	fastopen = !want_cookie &&
 		   tcp_try_fastopen(sk, skb, req, &foc, dst);
-	err = tcp_v6_send_synack(sk, dst, &fl6, req,
-				 skb_get_queue_mapping(skb), &foc);
+	err = af_ops->send_synack(sk, dst, (struct flowi *)&fl6, req,
+				  skb_get_queue_mapping(skb), &foc);
 	if (!fastopen) {
 		if (err || want_cookie)
 			goto drop_and_free;
-- 
cgit v1.2.3


From 5db92c994982ed826cf38f38d58bd09bc326aef6 Mon Sep 17 00:00:00 2001
From: Octavian Purdila
Date: Wed, 25 Jun 2014 17:09:59 +0300
Subject: tcp: unify tcp_v4_rtx_synack and tcp_v6_rtx_synack

Signed-off-by: Octavian Purdila <octavian.purdila@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  2 ++
 net/ipv4/tcp_ipv4.c   | 14 +-------------
 net/ipv4/tcp_output.c | 15 +++++++++++++++
 net/ipv6/tcp_ipv6.c   | 15 +--------------
 4 files changed, 19 insertions(+), 27 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 8c05c25018d5..8e9c28dccb80 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1573,6 +1573,8 @@ int tcp4_proc_init(void);
 void tcp4_proc_exit(void);
 #endif
 
+int tcp_rtx_synack(struct sock *sk, struct request_sock *req);
+
 /* TCP af-specific functions */
 struct tcp_sock_af_ops {
 #ifdef CONFIG_TCP_MD5SIG
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b5945ac50876..597dd9d75210 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -845,18 +845,6 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 	return err;
 }
 
-static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
-{
-	const struct  tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
-	int res = af_ops->send_synack(sk, NULL, NULL, req, 0, NULL);
-
-	if (!res) {
-		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
-	}
-	return res;
-}
-
 /*
  *	IPv4 request_sock destructor.
  */
@@ -1269,7 +1257,7 @@ static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
 	.family		=	PF_INET,
 	.obj_size	=	sizeof(struct tcp_request_sock),
-	.rtx_syn_ack	=	tcp_v4_rtx_synack,
+	.rtx_syn_ack	=	tcp_rtx_synack,
 	.send_ack	=	tcp_v4_reqsk_send_ack,
 	.destructor	=	tcp_v4_reqsk_destructor,
 	.send_reset	=	tcp_v4_send_reset,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d92bce0ea24e..f8f2a944a1ce 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3299,3 +3299,18 @@ void tcp_send_probe0(struct sock *sk)
 					  TCP_RTO_MAX);
 	}
 }
+
+int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
+{
+	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
+	struct flowi fl;
+	int res;
+
+	res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
+	if (!res) {
+		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+	}
+	return res;
+}
+EXPORT_SYMBOL(tcp_rtx_synack);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 210b6105afed..41389bbb08c0 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -506,19 +506,6 @@ done:
 	return err;
 }
 
-static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
-{
-	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
-	struct flowi fl;
-	int res;
-
-	res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
-	if (!res) {
-		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
-	}
-	return res;
-}
 
 static void tcp_v6_reqsk_destructor(struct request_sock *req)
 {
@@ -759,7 +746,7 @@ static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl,
 struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
 	.family		=	AF_INET6,
 	.obj_size	=	sizeof(struct tcp6_request_sock),
-	.rtx_syn_ack	=	tcp_v6_rtx_synack,
+	.rtx_syn_ack	=	tcp_rtx_synack,
 	.send_ack	=	tcp_v6_reqsk_send_ack,
 	.destructor	=	tcp_v6_reqsk_destructor,
 	.send_reset	=	tcp_v6_send_reset,
-- 
cgit v1.2.3


From 2aec4a297b21f3690486bbf8f7d5d29281ba6a48 Mon Sep 17 00:00:00 2001
From: Octavian Purdila
Date: Wed, 25 Jun 2014 17:10:00 +0300
Subject: tcp: add mss_clamp to tcp_request_sock_ops

Add mss_clamp member to tcp_request_sock_ops so that we can later
unify tcp_v4_conn_request and tcp_v6_conn_request.

Signed-off-by: Octavian Purdila <octavian.purdila@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h   | 1 +
 net/ipv4/tcp_ipv4.c | 3 ++-
 net/ipv6/tcp_ipv6.c | 4 +++-
 3 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 8e9c28dccb80..30fe98bc8957 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1592,6 +1592,7 @@ struct tcp_sock_af_ops {
 };
 
 struct tcp_request_sock_ops {
+	u16 mss_clamp;
 #ifdef CONFIG_TCP_MD5SIG
 	struct tcp_md5sig_key	*(*md5_lookup) (struct sock *sk,
 						struct request_sock *req);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 597dd9d75210..499d440539ad 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1265,6 +1265,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = {
 };
 
 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+	.mss_clamp	=	TCP_MSS_DEFAULT,
 #ifdef CONFIG_TCP_MD5SIG
 	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
@@ -1324,7 +1325,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 	af_ops = tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
 
 	tcp_clear_options(&tmp_opt);
-	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
+	tmp_opt.mss_clamp = af_ops->mss_clamp;
 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
 	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 41389bbb08c0..ad658332cf7d 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -754,6 +754,8 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
 };
 
 static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
+	.mss_clamp	=	IPV6_MIN_MTU - sizeof(struct tcphdr) -
+				sizeof(struct ipv6hdr),
 #ifdef CONFIG_TCP_MD5SIG
 	.md5_lookup	=	tcp_v6_reqsk_md5_lookup,
 	.calc_md5_hash	=	tcp_v6_md5_hash_skb,
@@ -1047,7 +1049,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 	af_ops = tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
 
 	tcp_clear_options(&tmp_opt);
-	tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
+	tmp_opt.mss_clamp = af_ops->mss_clamp;
 	tmp_opt.user_mss = tp->rx_opt.user_mss;
 	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
 
-- 
cgit v1.2.3


From 695da14eb0af21129187ed3810e329b21262e45f Mon Sep 17 00:00:00 2001
From: Octavian Purdila
Date: Wed, 25 Jun 2014 17:10:01 +0300
Subject: tcp: add queue_add_hash to tcp_request_sock_ops

Add queue_add_hash member to tcp_request_sock_ops so that we can later
unify tcp_v4_conn_request and tcp_v6_conn_request.

Signed-off-by: Octavian Purdila <octavian.purdila@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h   | 2 ++
 net/ipv4/tcp_ipv4.c | 3 ++-
 net/ipv6/tcp_ipv6.c | 3 ++-
 3 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 30fe98bc8957..cec6e2cf0610 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1615,6 +1615,8 @@ struct tcp_request_sock_ops {
 	int (*send_synack)(struct sock *sk, struct dst_entry *dst,
 			   struct flowi *fl, struct request_sock *req,
 			   u16 queue_mapping, struct tcp_fastopen_cookie *foc);
+	void (*queue_hash_add)(struct sock *sk, struct request_sock *req,
+			       const unsigned long timeout);
 };
 
 #ifdef CONFIG_SYN_COOKIES
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 499d440539ad..845c39de97ab 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1277,6 +1277,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 	.route_req	=	tcp_v4_route_req,
 	.init_seq	=	tcp_v4_init_sequence,
 	.send_synack	=	tcp_v4_send_synack,
+	.queue_hash_add =	inet_csk_reqsk_queue_hash_add,
 };
 
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
@@ -1403,7 +1404,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 			goto drop_and_free;
 
 		tcp_rsk(req)->listener = NULL;
-		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+		af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
 	}
 
 	return 0;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ad658332cf7d..8232bc7423c6 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -767,6 +767,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
 	.route_req	=	tcp_v6_route_req,
 	.init_seq	=	tcp_v6_init_sequence,
 	.send_synack	=	tcp_v6_send_synack,
+	.queue_hash_add =	inet6_csk_reqsk_queue_hash_add,
 };
 
 static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
@@ -1126,7 +1127,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 			goto drop_and_free;
 
 		tcp_rsk(req)->listener = NULL;
-		inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+		af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
 	}
 	return 0;
 
-- 
cgit v1.2.3


From 1fb6f159fd21c640a28eb65fbd62ce8c9f6a777e Mon Sep 17 00:00:00 2001
From: Octavian Purdila
Date: Wed, 25 Jun 2014 17:10:02 +0300
Subject: tcp: add tcp_conn_request

Create tcp_conn_request and remove most of the code from
tcp_v4_conn_request and tcp_v6_conn_request.

Signed-off-by: Octavian Purdila <octavian.purdila@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    |   3 ++
 net/ipv4/tcp_input.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c  | 128 +-------------------------------------------
 net/ipv6/tcp_ipv6.c  | 120 +----------------------------------------
 4 files changed, 155 insertions(+), 244 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index cec6e2cf0610..0d5389aecf18 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1574,6 +1574,9 @@ void tcp4_proc_exit(void);
 #endif
 
 int tcp_rtx_synack(struct sock *sk, struct request_sock *req);
+int tcp_conn_request(struct request_sock_ops *rsk_ops,
+		     const struct tcp_request_sock_ops *af_ops,
+		     struct sock *sk, struct sk_buff *skb);
 
 /* TCP af-specific functions */
 struct tcp_sock_af_ops {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b5c23756965a..97e48d60c4e8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5877,3 +5877,151 @@ discard:
 	return 0;
 }
 EXPORT_SYMBOL(tcp_rcv_state_process);
+
+static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
+{
+	struct inet_request_sock *ireq = inet_rsk(req);
+
+	if (family == AF_INET)
+		LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
+			       &ireq->ir_rmt_addr, port);
+	else
+		LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
+			       &ireq->ir_v6_rmt_addr, port);
+}
+
+int tcp_conn_request(struct request_sock_ops *rsk_ops,
+		     const struct tcp_request_sock_ops *af_ops,
+		     struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_options_received tmp_opt;
+	struct request_sock *req;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = NULL;
+	__u32 isn = TCP_SKB_CB(skb)->when;
+	bool want_cookie = false, fastopen;
+	struct flowi fl;
+	struct tcp_fastopen_cookie foc = { .len = -1 };
+	int err;
+
+
+	/* TW buckets are converted to open requests without
+	 * limitations, they conserve resources and peer is
+	 * evidently real one.
+	 */
+	if ((sysctl_tcp_syncookies == 2 ||
+	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
+		want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
+		if (!want_cookie)
+			goto drop;
+	}
+
+
+	/* Accept backlog is full. If we have already queued enough
+	 * of warm entries in syn queue, drop request. It is better than
+	 * clogging syn queue with openreqs with exponentially increasing
+	 * timeout.
+	 */
+	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+		goto drop;
+	}
+
+	req = inet_reqsk_alloc(rsk_ops);
+	if (!req)
+		goto drop;
+
+	tcp_rsk(req)->af_specific = af_ops;
+
+	tcp_clear_options(&tmp_opt);
+	tmp_opt.mss_clamp = af_ops->mss_clamp;
+	tmp_opt.user_mss  = tp->rx_opt.user_mss;
+	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
+
+	if (want_cookie && !tmp_opt.saw_tstamp)
+		tcp_clear_options(&tmp_opt);
+
+	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
+	tcp_openreq_init(req, &tmp_opt, skb, sk);
+
+	af_ops->init_req(req, sk, skb);
+
+	if (security_inet_conn_request(sk, skb, req))
+		goto drop_and_free;
+
+	if (!want_cookie || tmp_opt.tstamp_ok)
+		TCP_ECN_create_request(req, skb, sock_net(sk));
+
+	if (want_cookie) {
+		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
+		req->cookie_ts = tmp_opt.tstamp_ok;
+	} else if (!isn) {
+		/* VJ's idea. We save last timestamp seen
+		 * from the destination in peer table, when entering
+		 * state TIME-WAIT, and check against it before
+		 * accepting new connection request.
+		 *
+		 * If "isn" is not zero, this request hit alive
+		 * timewait bucket, so that all the necessary checks
+		 * are made in the function processing timewait state.
+		 */
+		if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) {
+			bool strict;
+
+			dst = af_ops->route_req(sk, &fl, req, &strict);
+			if (dst && strict &&
+			    !tcp_peer_is_proven(req, dst, true)) {
+				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
+				goto drop_and_release;
+			}
+		}
+		/* Kill the following clause, if you dislike this way. */
+		else if (!sysctl_tcp_syncookies &&
+			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+			  (sysctl_max_syn_backlog >> 2)) &&
+			 !tcp_peer_is_proven(req, dst, false)) {
+			/* Without syncookies last quarter of
+			 * backlog is filled with destinations,
+			 * proven to be alive.
+			 * It means that we continue to communicate
+			 * to destinations, already remembered
+			 * to the moment of synflood.
+			 */
+			pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
+				    rsk_ops->family);
+			goto drop_and_release;
+		}
+
+		isn = af_ops->init_seq(skb);
+	}
+	if (!dst) {
+		dst = af_ops->route_req(sk, &fl, req, NULL);
+		if (!dst)
+			goto drop_and_free;
+	}
+
+	tcp_rsk(req)->snt_isn = isn;
+	tcp_openreq_init_rwin(req, sk, dst);
+	fastopen = !want_cookie &&
+		   tcp_try_fastopen(sk, skb, req, &foc, dst);
+	err = af_ops->send_synack(sk, dst, &fl, req,
+				  skb_get_queue_mapping(skb), &foc);
+	if (!fastopen) {
+		if (err || want_cookie)
+			goto drop_and_free;
+
+		tcp_rsk(req)->listener = NULL;
+		af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+	}
+
+	return 0;
+
+drop_and_release:
+	dst_release(dst);
+drop_and_free:
+	reqsk_free(req);
+drop:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+	return 0;
+}
+EXPORT_SYMBOL(tcp_conn_request);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 845c39de97ab..5dfebd2f2e38 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1282,137 +1282,13 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
 
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 {
-	struct tcp_options_received tmp_opt;
-	struct request_sock *req;
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct dst_entry *dst = NULL;
-	__be32 saddr = ip_hdr(skb)->saddr;
-	__u32 isn = TCP_SKB_CB(skb)->when;
-	bool want_cookie = false, fastopen;
-	struct flowi4 fl4;
-	struct tcp_fastopen_cookie foc = { .len = -1 };
-	const struct tcp_request_sock_ops *af_ops;
-	int err;
-
 	/* Never answer to SYNs send to broadcast or multicast */
 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
 		goto drop;
 
-	/* TW buckets are converted to open requests without
-	 * limitations, they conserve resources and peer is
-	 * evidently real one.
-	 */
-	if ((sysctl_tcp_syncookies == 2 ||
-	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
-		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
-		if (!want_cookie)
-			goto drop;
-	}
-
-	/* Accept backlog is full. If we have already queued enough
-	 * of warm entries in syn queue, drop request. It is better than
-	 * clogging syn queue with openreqs with exponentially increasing
-	 * timeout.
-	 */
-	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
-		goto drop;
-	}
-
-	req = inet_reqsk_alloc(&tcp_request_sock_ops);
-	if (!req)
-		goto drop;
-
-	af_ops = tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
-
-	tcp_clear_options(&tmp_opt);
-	tmp_opt.mss_clamp = af_ops->mss_clamp;
-	tmp_opt.user_mss  = tp->rx_opt.user_mss;
-	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
-
-	if (want_cookie && !tmp_opt.saw_tstamp)
-		tcp_clear_options(&tmp_opt);
-
-	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
-	tcp_openreq_init(req, &tmp_opt, skb, sk);
-
-	af_ops->init_req(req, sk, skb);
-
-	if (security_inet_conn_request(sk, skb, req))
-		goto drop_and_free;
+	return tcp_conn_request(&tcp_request_sock_ops,
+				&tcp_request_sock_ipv4_ops, sk, skb);
 
-	if (!want_cookie || tmp_opt.tstamp_ok)
-		TCP_ECN_create_request(req, skb, sock_net(sk));
-
-	if (want_cookie) {
-		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
-		req->cookie_ts = tmp_opt.tstamp_ok;
-	} else if (!isn) {
-		/* VJ's idea. We save last timestamp seen
-		 * from the destination in peer table, when entering
-		 * state TIME-WAIT, and check against it before
-		 * accepting new connection request.
-		 *
-		 * If "isn" is not zero, this request hit alive
-		 * timewait bucket, so that all the necessary checks
-		 * are made in the function processing timewait state.
-		 */
-		if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) {
-			bool strict;
-
-			dst = af_ops->route_req(sk, (struct flowi *)&fl4, req,
-						&strict);
-			if (dst && strict &&
-			    !tcp_peer_is_proven(req, dst, true)) {
-				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
-				goto drop_and_release;
-			}
-		}
-		/* Kill the following clause, if you dislike this way. */
-		else if (!sysctl_tcp_syncookies &&
-			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
-			  (sysctl_max_syn_backlog >> 2)) &&
-			 !tcp_peer_is_proven(req, dst, false)) {
-			/* Without syncookies last quarter of
-			 * backlog is filled with destinations,
-			 * proven to be alive.
-			 * It means that we continue to communicate
-			 * to destinations, already remembered
-			 * to the moment of synflood.
-			 */
-			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
-				       &saddr, ntohs(tcp_hdr(skb)->source));
-			goto drop_and_release;
-		}
-
-		isn = af_ops->init_seq(skb);
-	}
-	if (!dst) {
-		dst = af_ops->route_req(sk, (struct flowi *)&fl4, req, NULL);
-		if (!dst)
-			goto drop_and_free;
-	}
-
-	tcp_rsk(req)->snt_isn = isn;
-	tcp_openreq_init_rwin(req, sk, dst);
-	fastopen = !want_cookie &&
-		   tcp_try_fastopen(sk, skb, req, &foc, dst);
-	err = af_ops->send_synack(sk, dst, NULL, req,
-				  skb_get_queue_mapping(skb), &foc);
-	if (!fastopen) {
-		if (err || want_cookie)
-			goto drop_and_free;
-
-		tcp_rsk(req)->listener = NULL;
-		af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
-	}
-
-	return 0;
-
-drop_and_release:
-	dst_release(dst);
-drop_and_free:
-	reqsk_free(req);
 drop:
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
 	return 0;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 8232bc7423c6..bc24ee21339a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1008,133 +1008,17 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
 	return sk;
 }
 
-/* FIXME: this is substantially similar to the ipv4 code.
- * Can some kind of merge be done? -- erics
- */
 static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
 {
-	struct tcp_options_received tmp_opt;
-	struct request_sock *req;
-	struct inet_request_sock *ireq;
-	struct tcp_sock *tp = tcp_sk(sk);
-	__u32 isn = TCP_SKB_CB(skb)->when;
-	struct dst_entry *dst = NULL;
-	struct tcp_fastopen_cookie foc = { .len = -1 };
-	bool want_cookie = false, fastopen;
-	struct flowi6 fl6;
-	const struct tcp_request_sock_ops *af_ops;
-	int err;
-
 	if (skb->protocol == htons(ETH_P_IP))
 		return tcp_v4_conn_request(sk, skb);
 
 	if (!ipv6_unicast_destination(skb))
 		goto drop;
 
-	if ((sysctl_tcp_syncookies == 2 ||
-	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
-		want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6");
-		if (!want_cookie)
-			goto drop;
-	}
+	return tcp_conn_request(&tcp6_request_sock_ops,
+				&tcp_request_sock_ipv6_ops, sk, skb);
 
-	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
-		goto drop;
-	}
-
-	req = inet_reqsk_alloc(&tcp6_request_sock_ops);
-	if (req == NULL)
-		goto drop;
-
-	af_ops = tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
-
-	tcp_clear_options(&tmp_opt);
-	tmp_opt.mss_clamp = af_ops->mss_clamp;
-	tmp_opt.user_mss = tp->rx_opt.user_mss;
-	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
-
-	if (want_cookie && !tmp_opt.saw_tstamp)
-		tcp_clear_options(&tmp_opt);
-
-	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
-	tcp_openreq_init(req, &tmp_opt, skb, sk);
-
-	ireq = inet_rsk(req);
-	af_ops->init_req(req, sk, skb);
-
-	if (security_inet_conn_request(sk, skb, req))
-		goto drop_and_release;
-
-	if (!want_cookie || tmp_opt.tstamp_ok)
-		TCP_ECN_create_request(req, skb, sock_net(sk));
-
-	if (want_cookie) {
-		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
-		req->cookie_ts = tmp_opt.tstamp_ok;
-	} else if (!isn) {
-		/* VJ's idea. We save last timestamp seen
-		 * from the destination in peer table, when entering
-		 * state TIME-WAIT, and check against it before
-		 * accepting new connection request.
-		 *
-		 * If "isn" is not zero, this request hit alive
-		 * timewait bucket, so that all the necessary checks
-		 * are made in the function processing timewait state.
-		 */
-		if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) {
-			dst = af_ops->route_req(sk, (struct flowi *)&fl6, req,
-						NULL);
-			if (dst && !tcp_peer_is_proven(req, dst, true)) {
-				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
-				goto drop_and_release;
-			}
-		}
-		/* Kill the following clause, if you dislike this way. */
-		else if (!sysctl_tcp_syncookies &&
-			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
-			  (sysctl_max_syn_backlog >> 2)) &&
-			 !tcp_peer_is_proven(req, dst, false)) {
-			/* Without syncookies last quarter of
-			 * backlog is filled with destinations,
-			 * proven to be alive.
-			 * It means that we continue to communicate
-			 * to destinations, already remembered
-			 * to the moment of synflood.
-			 */
-			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n",
-				       &ireq->ir_v6_rmt_addr, ntohs(tcp_hdr(skb)->source));
-			goto drop_and_release;
-		}
-
-		isn = af_ops->init_seq(skb);
-	}
-
-	if (!dst) {
-		dst = af_ops->route_req(sk, (struct flowi *)&fl6, req, NULL);
-		if (!dst)
-			goto drop_and_free;
-	}
-
-	tcp_rsk(req)->snt_isn = isn;
-	tcp_openreq_init_rwin(req, sk, dst);
-	fastopen = !want_cookie &&
-		   tcp_try_fastopen(sk, skb, req, &foc, dst);
-	err = af_ops->send_synack(sk, dst, (struct flowi *)&fl6, req,
-				  skb_get_queue_mapping(skb), &foc);
-	if (!fastopen) {
-		if (err || want_cookie)
-			goto drop_and_free;
-
-		tcp_rsk(req)->listener = NULL;
-		af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
-	}
-	return 0;
-
-drop_and_release:
-	dst_release(dst);
-drop_and_free:
-	reqsk_free(req);
 drop:
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
 	return 0; /* don't send reset */
-- 
cgit v1.2.3


From c1878869c0c8e0def3df5397155f369442ce4e06 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Sat, 28 Jun 2014 18:39:01 +0200
Subject: netfilter: fix several Kconfig problems in NF_LOG_*

warning: (NETFILTER_XT_TARGET_LOG) selects NF_LOG_IPV6 which has unmet direct dependencies (NET && INET && IPV6 && NETFILTER && IP6_NF_IPTABLES && NETFILTER_ADVANCED)
warning: (NF_LOG_IPV4 && NF_LOG_IPV6) selects NF_LOG_COMMON which has unmet direct dependencies (NET && INET && NETFILTER && NF_CONNTRACK)

Fixes: 83e96d4 ("netfilter: log: split family specific code to nf_log_{ip,ip6,common}.c files")
Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/Kconfig | 20 ++++++++++----------
 net/ipv6/netfilter/Kconfig | 10 +++++-----
 net/netfilter/Kconfig      | 10 ++++------
 3 files changed, 19 insertions(+), 21 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 12e6057daea1..fb173126f03d 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -36,6 +36,16 @@ config NF_CONNTRACK_PROC_COMPAT
 
 	  If unsure, say Y.
 
+config NF_LOG_ARP
+	tristate "ARP packet logging"
+	default m if NETFILTER_ADVANCED=n
+	select NF_LOG_COMMON
+
+config NF_LOG_IPV4
+	tristate "IPv4 packet logging"
+	default m if NETFILTER_ADVANCED=n
+	select NF_LOG_COMMON
+
 config NF_TABLES_IPV4
 	depends on NF_TABLES
 	tristate "IPv4 nf_tables support"
@@ -159,16 +169,6 @@ config IP_NF_TARGET_SYNPROXY
 
 	  To compile it as a module, choose M here. If unsure, say N.
 
-config NF_LOG_ARP
-	tristate "ARP packet logging"
-	default m if NETFILTER_ADVANCED=n
-	select NF_LOG_INET
-
-config NF_LOG_IPV4
-	tristate "IPv4 packet logging"
-	default m if NETFILTER_ADVANCED=n
-	select NF_LOG_COMMON
-
 # NAT + specific targets: nf_conntrack
 config NF_NAT_IPV4
 	tristate "IPv4 NAT"
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 1537130e9f5b..ac93df16f5af 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -55,6 +55,11 @@ config NFT_REJECT_IPV6
 	default NFT_REJECT
 	tristate
 
+config NF_LOG_IPV6
+	tristate "IPv6 packet logging"
+	depends on NETFILTER_ADVANCED
+	select NF_LOG_COMMON
+
 config IP6_NF_IPTABLES
 	tristate "IP6 tables support (required for filtering)"
 	depends on INET && IPV6
@@ -227,11 +232,6 @@ config IP6_NF_SECURITY
 
          If unsure, say N.
 
-config NF_LOG_IPV6
-	tristate "IPv6 packet logging"
-	depends on NETFILTER_ADVANCED
-	select NF_LOG_COMMON
-
 config NF_NAT_IPV6
 	tristate "IPv6 NAT"
 	depends on NF_CONNTRACK_IPV6
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index f17b273c9082..ad751fe2e82b 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -46,6 +46,9 @@ config NF_CONNTRACK
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NF_LOG_COMMON
+	tristate
+
 if NF_CONNTRACK
 
 config NF_CONNTRACK_MARK
@@ -359,9 +362,6 @@ config NETFILTER_NETLINK_QUEUE_CT
 	  If this option is enabled, NFQUEUE can include Connection Tracking
 	  information together with the packet is the enqueued via NFNETLINK.
 
-config NF_LOG_COMMON
-	tristate
-
 config NF_NAT
 	tristate
 
@@ -747,9 +747,7 @@ config NETFILTER_XT_TARGET_LED
 
 config NETFILTER_XT_TARGET_LOG
 	tristate "LOG target support"
-	select NF_LOG
-	select NF_LOG_IPV4
-	select NF_LOG_IPV6
+	depends on NF_LOG_IPV4 && NF_LOG_IPV6
 	default m if NETFILTER_ADVANCED=n
 	help
 	  This option adds a `LOG' target, which allows you to create rules in
-- 
cgit v1.2.3


From 1759389e8af46d724220785bf710b7bdbebdfa48 Mon Sep 17 00:00:00 2001
From: Christoph Paasch
Date: Sat, 28 Jun 2014 14:12:44 +0200
Subject: xfrm4: Remove duplicate semicolon

3328715e6c1fc (xfrm4: Add IPsec protocol multiplexer) adds a
duplicate semicolon after the return-statement.

Although it has no negative impact, the second semicolon should be
removed.

Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Christoph Paasch <christoph.paasch@uclouvain.be>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_protocol.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index a2ce0101eaac..dccefa9d84cf 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -124,7 +124,7 @@ static int xfrm4_ah_rcv(struct sk_buff *skb)
 
 	for_each_protocol_rcu(ah4_handlers, handler)
 		if ((ret = handler->handler(skb)) != -EINVAL)
-			return ret;;
+			return ret;
 
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 
-- 
cgit v1.2.3


From 4135ab8208048a1586385fae1298726d7dacfc26 Mon Sep 17 00:00:00 2001
From: Octavian Purdila
Date: Sat, 28 Jun 2014 21:20:54 +0300
Subject: tcp: tcp_conn_request: fix build error when IPv6 is disabled

Fixes build error introduced by commit 1fb6f159fd21c64 (tcp: add
tcp_conn_request):

net/ipv4/tcp_input.c: In function 'pr_drop_req':
net/ipv4/tcp_input.c:5889:130: error: 'struct sock_common' has no member named 'skc_v6_daddr'

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Octavian Purdila <octavian.purdila@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 97e48d60c4e8..bb684967f7a7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5885,9 +5885,11 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
 	if (family == AF_INET)
 		LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
 			       &ireq->ir_rmt_addr, port);
-	else
+#if IS_ENABLED(CONFIG_IPV6)
+	else if (family == AF_INET6)
 		LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
 			       &ireq->ir_v6_rmt_addr, port);
+#endif
 }
 
 int tcp_conn_request(struct request_sock_ops *rsk_ops,
-- 
cgit v1.2.3


From 24de3d377539e384621c5b8f8f8d8d01852dddc8 Mon Sep 17 00:00:00 2001
From: Duan Jiong
Date: Mon, 30 Jun 2014 09:19:32 +0800
Subject: netfilter: use IS_ENABLED() macro

replace:
 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
with
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)

replace:
 #if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE)
with
 #if !IS_ENABLED(CONFIG_NF_NAT)

replace:
 #if !defined(CONFIG_NF_CONNTRACK) && !defined(CONFIG_NF_CONNTRACK_MODULE)
with
 #if !IS_ENABLED(CONFIG_NF_CONNTRACK)

And add missing:
 IS_ENABLED(CONFIG_NF_CT_NETLINK)

in net/ipv{4,6}/netfilter/nf_nat_l3proto_ipv{4,6}.c

Signed-off-by: Duan Jiong <duanj.fnst@cn.fujitsu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 4 ++--
 net/ipv4/netfilter/nf_conntrack_proto_icmp.c   | 4 ++--
 net/ipv4/netfilter/nf_defrag_ipv4.c            | 8 ++++----
 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c       | 4 ++++
 net/ipv4/netfilter/nf_nat_proto_gre.c          | 2 +-
 net/ipv4/netfilter/nf_nat_proto_icmp.c         | 2 +-
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c       | 4 ++++
 net/netfilter/nf_nat_core.c                    | 2 +-
 net/netfilter/nf_nat_proto_common.c            | 2 +-
 net/netfilter/nf_nat_proto_dccp.c              | 2 +-
 net/netfilter/nf_nat_proto_sctp.c              | 2 +-
 net/netfilter/nf_nat_proto_tcp.c               | 2 +-
 net/netfilter/nf_nat_proto_udp.c               | 2 +-
 net/netfilter/nf_nat_proto_udplite.c           | 2 +-
 14 files changed, 25 insertions(+), 17 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 8127dc802865..4ce44c4bc57b 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -314,7 +314,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
 	return -ENOENT;
 }
 
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_conntrack.h>
@@ -388,7 +388,7 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
 	.invert_tuple	 = ipv4_invert_tuple,
 	.print_tuple	 = ipv4_print_tuple,
 	.get_l4proto	 = ipv4_get_l4proto,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.tuple_to_nlattr = ipv4_tuple_to_nlattr,
 	.nlattr_tuple_size = ipv4_nlattr_tuple_size,
 	.nlattr_to_tuple = ipv4_nlattr_to_tuple,
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index a338dad41b7d..b91b2641adda 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -226,7 +226,7 @@ icmp_error(struct net *net, struct nf_conn *tmpl,
 	return icmp_error_message(net, tmpl, skb, ctinfo, hooknum);
 }
 
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_conntrack.h>
@@ -408,7 +408,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
 	.error			= icmp_error,
 	.destroy		= NULL,
 	.me			= NULL,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.tuple_to_nlattr	= icmp_tuple_to_nlattr,
 	.nlattr_tuple_size	= icmp_nlattr_tuple_size,
 	.nlattr_to_tuple	= icmp_nlattr_to_tuple,
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index b8f6381c7d0b..76bd1aef257f 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -17,7 +17,7 @@
 #include <linux/netfilter_bridge.h>
 #include <linux/netfilter_ipv4.h>
 #include <net/netfilter/ipv4/nf_defrag_ipv4.h>
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
 #include <net/netfilter/nf_conntrack.h>
 #endif
 #include <net/netfilter/nf_conntrack_zones.h>
@@ -45,7 +45,7 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
 {
 	u16 zone = NF_CT_DEFAULT_ZONE;
 
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
 	if (skb->nfct)
 		zone = nf_ct_zone((struct nf_conn *)skb->nfct);
 #endif
@@ -74,8 +74,8 @@ static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
 	    inet->nodefrag)
 		return NF_ACCEPT;
 
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
-#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#if !IS_ENABLED(CONFIG_NF_NAT)
 	/* Previously seen (loopback)?  Ignore.  Do this before
 	   fragment check. */
 	if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index d8b2e14efddc..14f5ccd06337 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -154,6 +154,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
 					 htons(oldlen), htons(datalen), 1);
 }
 
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
 				       struct nf_nat_range *range)
 {
@@ -169,6 +170,7 @@ static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
 
 	return 0;
 }
+#endif
 
 static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
 	.l3proto		= NFPROTO_IPV4,
@@ -177,7 +179,9 @@ static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
 	.manip_pkt		= nf_nat_ipv4_manip_pkt,
 	.csum_update		= nf_nat_ipv4_csum_update,
 	.csum_recalc		= nf_nat_ipv4_csum_recalc,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.nlattr_to_range	= nf_nat_ipv4_nlattr_to_range,
+#endif
 #ifdef CONFIG_XFRM
 	.decode_session		= nf_nat_ipv4_decode_session,
 #endif
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 690d890111bb..9414923f1e15 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -124,7 +124,7 @@ static const struct nf_nat_l4proto gre = {
 	.manip_pkt		= gre_manip_pkt,
 	.in_range		= nf_nat_l4proto_in_range,
 	.unique_tuple		= gre_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,
 #endif
 };
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index eb303471bcf6..4557b4ab8342 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -77,7 +77,7 @@ const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
 	.manip_pkt		= icmp_manip_pkt,
 	.in_range		= icmp_in_range,
 	.unique_tuple		= icmp_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,
 #endif
 };
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index abfe75a2e316..fc8e49b2ff3e 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -158,6 +158,7 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
 					 htons(oldlen), htons(datalen), 1);
 }
 
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[],
 				       struct nf_nat_range *range)
 {
@@ -175,6 +176,7 @@ static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[],
 
 	return 0;
 }
+#endif
 
 static const struct nf_nat_l3proto nf_nat_l3proto_ipv6 = {
 	.l3proto		= NFPROTO_IPV6,
@@ -183,7 +185,9 @@ static const struct nf_nat_l3proto nf_nat_l3proto_ipv6 = {
 	.manip_pkt		= nf_nat_ipv6_manip_pkt,
 	.csum_update		= nf_nat_ipv6_csum_update,
 	.csum_recalc		= nf_nat_ipv6_csum_recalc,
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.nlattr_to_range	= nf_nat_ipv6_nlattr_to_range,
+#endif
 #ifdef CONFIG_XFRM
 	.decode_session	= nf_nat_ipv6_decode_session,
 #endif
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 09096a670c45..31c5015394e9 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -677,7 +677,7 @@ static struct nf_ct_ext_type nat_extend __read_mostly = {
 	.flags		= NF_CT_EXT_F_PREALLOC,
 };
 
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_conntrack.h>
diff --git a/net/netfilter/nf_nat_proto_common.c b/net/netfilter/nf_nat_proto_common.c
index 83a72a235cae..fbce552a796e 100644
--- a/net/netfilter/nf_nat_proto_common.c
+++ b/net/netfilter/nf_nat_proto_common.c
@@ -95,7 +95,7 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
 }
 EXPORT_SYMBOL_GPL(nf_nat_l4proto_unique_tuple);
 
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[],
 				   struct nf_nat_range *range)
 {
diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c
index c8be2cdac0bf..b8067b53ff3a 100644
--- a/net/netfilter/nf_nat_proto_dccp.c
+++ b/net/netfilter/nf_nat_proto_dccp.c
@@ -78,7 +78,7 @@ static const struct nf_nat_l4proto nf_nat_l4proto_dccp = {
 	.manip_pkt		= dccp_manip_pkt,
 	.in_range		= nf_nat_l4proto_in_range,
 	.unique_tuple		= dccp_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,
 #endif
 };
diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
index 754536f2c674..cbc7ade1487b 100644
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -59,7 +59,7 @@ static const struct nf_nat_l4proto nf_nat_l4proto_sctp = {
 	.manip_pkt		= sctp_manip_pkt,
 	.in_range		= nf_nat_l4proto_in_range,
 	.unique_tuple		= sctp_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,
 #endif
 };
diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c
index 83ec8a6e4c36..37f5505f4529 100644
--- a/net/netfilter/nf_nat_proto_tcp.c
+++ b/net/netfilter/nf_nat_proto_tcp.c
@@ -79,7 +79,7 @@ const struct nf_nat_l4proto nf_nat_l4proto_tcp = {
 	.manip_pkt		= tcp_manip_pkt,
 	.in_range		= nf_nat_l4proto_in_range,
 	.unique_tuple		= tcp_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,
 #endif
 };
diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c
index 7df613fb34a2..b0ede2f0d8bc 100644
--- a/net/netfilter/nf_nat_proto_udp.c
+++ b/net/netfilter/nf_nat_proto_udp.c
@@ -70,7 +70,7 @@ const struct nf_nat_l4proto nf_nat_l4proto_udp = {
 	.manip_pkt		= udp_manip_pkt,
 	.in_range		= nf_nat_l4proto_in_range,
 	.unique_tuple		= udp_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,
 #endif
 };
diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c
index 776a0d1317b1..368f14e01e75 100644
--- a/net/netfilter/nf_nat_proto_udplite.c
+++ b/net/netfilter/nf_nat_proto_udplite.c
@@ -69,7 +69,7 @@ static const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
 	.manip_pkt		= udplite_manip_pkt,
 	.in_range		= nf_nat_l4proto_in_range,
 	.unique_tuple		= udplite_unique_tuple,
-#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,
 #endif
 };
-- 
cgit v1.2.3


From 9fe516ba3fb29b6f6a752ffd93342fdee500ec01 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 27 Jun 2014 08:36:16 -0700
Subject: inet: move ipv6only in sock_common

When an UDP application switches from AF_INET to AF_INET6 sockets, we
have a small performance degradation for IPv4 communications because of
extra cache line misses to access ipv6only information.

This can also be noticed for TCP listeners, as ipv6_only_sock() is also
used from __inet_lookup_listener()->compute_score()

This is magnified when SO_REUSEPORT is used.

Move ipv6only into struct sock_common so that it is available at
no extra cost in lookups.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h             | 10 +++++-----
 include/net/inet_timewait_sock.h |  3 ++-
 include/net/sock.h               |  4 +++-
 net/dccp/minisocks.c             |  4 +---
 net/ipv4/tcp_minisocks.c         |  2 +-
 net/ipv6/af_inet6.c              |  6 +++---
 net/ipv6/ipv6_sockglue.c         |  4 ++--
 net/ipv6/udp.c                   |  3 +--
 8 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index b0f2452f1d58..5dc68c3ebcbd 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -194,7 +194,7 @@ struct ipv6_pinfo {
 	                        sndflow:1,
 				repflow:1,
 				pmtudisc:3,
-				ipv6only:1,
+				padding:1,	/* 1 bit hole */
 				srcprefs:3,	/* 001: prefer temporary address
 						 * 010: prefer public address
 						 * 100: prefer care-of address
@@ -273,8 +273,8 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to,
 	__inet_sk_copy_descendant(sk_to, sk_from, ancestor_size);
 }
 
-#define __ipv6_only_sock(sk)	(inet6_sk(sk)->ipv6only)
-#define ipv6_only_sock(sk)	((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk))
+#define __ipv6_only_sock(sk)	(sk->sk_ipv6only)
+#define ipv6_only_sock(sk)	(__ipv6_only_sock(sk))
 #define ipv6_sk_rxinfo(sk)	((sk)->sk_family == PF_INET6 && \
 				 inet6_sk(sk)->rxopt.bits.rxinfo)
 
@@ -287,8 +287,8 @@ static inline const struct in6_addr *inet6_rcv_saddr(const struct sock *sk)
 
 static inline int inet_v6_ipv6only(const struct sock *sk)
 {
-	return likely(sk->sk_state != TCP_TIME_WAIT) ?
-		ipv6_only_sock(sk) : inet_twsk(sk)->tw_ipv6only;
+	/* ipv6only field is at same position for timewait and other sockets */
+	return ipv6_only_sock(sk);
 }
 #else
 #define __ipv6_only_sock(sk)	0
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 61474ea02152..6c566034e26d 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -108,6 +108,7 @@ struct inet_timewait_sock {
 #define tw_family		__tw_common.skc_family
 #define tw_state		__tw_common.skc_state
 #define tw_reuse		__tw_common.skc_reuse
+#define tw_ipv6only		__tw_common.skc_ipv6only
 #define tw_bound_dev_if		__tw_common.skc_bound_dev_if
 #define tw_node			__tw_common.skc_nulls_node
 #define tw_bind_node		__tw_common.skc_bind_node
@@ -131,7 +132,7 @@ struct inet_timewait_sock {
 	__be16			tw_sport;
 	kmemcheck_bitfield_begin(flags);
 	/* And these are ours. */
-	unsigned int		tw_ipv6only     : 1,
+	unsigned int		tw_pad0		: 1,	/* 1 bit hole */
 				tw_transparent  : 1,
 				tw_flowlabel	: 20,
 				tw_pad		: 2,	/* 2 bits hole */
diff --git a/include/net/sock.h b/include/net/sock.h
index 173cae485de1..8d4c9473e7d7 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -181,7 +181,8 @@ struct sock_common {
 	unsigned short		skc_family;
 	volatile unsigned char	skc_state;
 	unsigned char		skc_reuse:4;
-	unsigned char		skc_reuseport:4;
+	unsigned char		skc_reuseport:1;
+	unsigned char		skc_ipv6only:1;
 	int			skc_bound_dev_if;
 	union {
 		struct hlist_node	skc_bind_node;
@@ -317,6 +318,7 @@ struct sock {
 #define sk_state		__sk_common.skc_state
 #define sk_reuse		__sk_common.skc_reuse
 #define sk_reuseport		__sk_common.skc_reuseport
+#define sk_ipv6only		__sk_common.skc_ipv6only
 #define sk_bound_dev_if		__sk_common.skc_bound_dev_if
 #define sk_bind_node		__sk_common.skc_bind_node
 #define sk_prot			__sk_common.skc_prot
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index c69eb9c4fbb8..b50dc436db1f 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -55,11 +55,9 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
 		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
 #if IS_ENABLED(CONFIG_IPV6)
 		if (tw->tw_family == PF_INET6) {
-			const struct ipv6_pinfo *np = inet6_sk(sk);
-
 			tw->tw_v6_daddr = sk->sk_v6_daddr;
 			tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
-			tw->tw_ipv6only = np->ipv6only;
+			tw->tw_ipv6only = sk->sk_ipv6only;
 		}
 #endif
 		/* Linkage updates. */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index e68e0d4af6c9..1649988bd1b6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -298,7 +298,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 			tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
 			tw->tw_tclass = np->tclass;
 			tw->tw_flowlabel = np->flow_label >> 12;
-			tw->tw_ipv6only = np->ipv6only;
+			tw->tw_ipv6only = sk->sk_ipv6only;
 		}
 #endif
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 7cb4392690dd..a426cd7099bb 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -197,7 +197,7 @@ lookup_protocol:
 	np->mcast_hops	= IPV6_DEFAULT_MCASTHOPS;
 	np->mc_loop	= 1;
 	np->pmtudisc	= IPV6_PMTUDISC_WANT;
-	np->ipv6only	= net->ipv6.sysctl.bindv6only;
+	sk->sk_ipv6only	= net->ipv6.sysctl.bindv6only;
 
 	/* Init the ipv4 part of the socket since we can have sockets
 	 * using v6 API for ipv4.
@@ -294,7 +294,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		/* Binding to v4-mapped address on a v6-only socket
 		 * makes no sense
 		 */
-		if (np->ipv6only) {
+		if (sk->sk_ipv6only) {
 			err = -EINVAL;
 			goto out;
 		}
@@ -371,7 +371,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (addr_type != IPV6_ADDR_ANY) {
 		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
 		if (addr_type != IPV6_ADDR_MAPPED)
-			np->ipv6only = 1;
+			sk->sk_ipv6only = 1;
 	}
 	if (snum)
 		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index edb58aff4ae7..cc34f65179e4 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -235,7 +235,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		if (optlen < sizeof(int) ||
 		    inet_sk(sk)->inet_num)
 			goto e_inval;
-		np->ipv6only = valbool;
+		sk->sk_ipv6only = valbool;
 		retv = 0;
 		break;
 
@@ -1058,7 +1058,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
 	}
 
 	case IPV6_V6ONLY:
-		val = np->ipv6only;
+		val = sk->sk_ipv6only;
 		break;
 
 	case IPV6_RECVPKTINFO:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 95c834799288..c2bd28fd43e4 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -79,7 +79,6 @@ static unsigned int udp6_ehashfn(struct net *net,
 int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
 {
 	const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
-	int sk_ipv6only = ipv6_only_sock(sk);
 	int sk2_ipv6only = inet_v6_ipv6only(sk2);
 	int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
 	int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
@@ -95,7 +94,7 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
 		return 1;
 
 	if (addr_type == IPV6_ADDR_ANY &&
-	    !(sk_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
+	    !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
 		return 1;
 
 	if (sk2_rcv_saddr6 &&
-- 
cgit v1.2.3


From 86c6a2c75ab97fe31844985169e26aea335432f9 Mon Sep 17 00:00:00 2001
From: Neal Cardwell
Date: Mon, 30 Jun 2014 15:09:49 -0400
Subject: tcp: switch snt_synack back to measuring transmit time of first
 SYNACK

Always store in snt_synack the time at which the server received the
first client SYN and attempted to send the first SYNACK.

Recent commit aa27fc501 ("tcp: tcp_v[46]_conn_request: fix snt_synack
initialization") resolved an inconsistency between IPv4 and IPv6 in
the initialization of snt_synack. This commit brings back the idea
from 843f4a55e (tcp: use tcp_v4_send_synack on first SYN-ACK), which
was going for the original behavior of snt_synack from the commit
where it was added in 9ad7c049f0f79 ("tcp: RFC2988bis + taking RTT
sample from 3WHS for the passive open side") in v3.1.

In addition to being simpler (and probably a tiny bit faster),
unconditionally storing the time of the first SYNACK attempt has been
useful because it allows calculating a performance metric quantifying
how long it took to establish a passive TCP connection.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Cc: Octavian Purdila <octavian.purdila@intel.com>
Cc: Jerry Chu <hkchu@google.com>
Acked-by: Octavian Purdila <octavian.purdila@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h   | 2 +-
 net/ipv4/tcp_ipv4.c | 2 --
 net/ipv6/tcp_ipv6.c | 2 --
 3 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0d5389aecf18..c9a75dbba0c7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1093,7 +1093,7 @@ static inline void tcp_openreq_init(struct request_sock *req,
 	req->cookie_ts = 0;
 	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
 	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
-	tcp_rsk(req)->snt_synack = 0;
+	tcp_rsk(req)->snt_synack = tcp_time_stamp;
 	req->mss = rx_opt->mss_clamp;
 	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
 	ireq->tstamp_ok = rx_opt->tstamp_ok;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5dfebd2f2e38..52d0f6a1ec2c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -838,8 +838,6 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 					    ireq->ir_rmt_addr,
 					    ireq->opt);
 		err = net_xmit_eval(err);
-		if (!tcp_rsk(req)->snt_synack && !err)
-			tcp_rsk(req)->snt_synack = tcp_time_stamp;
 	}
 
 	return err;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index bc24ee21339a..a97c95585da8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -498,8 +498,6 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
 		skb_set_queue_mapping(skb, queue_mapping);
 		err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass);
 		err = net_xmit_eval(err);
-		if (!tcp_rsk(req)->snt_synack && !err)
-			tcp_rsk(req)->snt_synack = tcp_time_stamp;
 	}
 
 done:
-- 
cgit v1.2.3


From b73c3d0e4f0e1961e15bec18720e48aabebe2109 Mon Sep 17 00:00:00 2001
From: Tom Herbert
Date: Tue, 1 Jul 2014 21:32:17 -0700
Subject: net: Save TX flow hash in sock and set in skbuf on xmit

For a connected socket we can precompute the flow hash for setting
in skb->hash on output. This is a performance advantage over
calculating the skb->hash for every packet on the connection. The
computation is done using the common hash algorithm to be consistent
with computations done for packets of the connection in other states
where thers is no socket (e.g. time-wait, syn-recv, syn-cookies).

This patch adds sk_txhash to the sock structure. inet_set_txhash and
ip6_set_txhash functions are added which are called from points in
TCP and UDP where socket moves to established state.

skb_set_hash_from_sk is a function which sets skb->hash from the
sock txhash value. This is called in UDP and TCP transmit path when
transmitting within the context of a socket.

Tested: ran super_netperf with 200 TCP_RR streams over a vxlan
interface (in this case skb_get_hash called on every TX packet to
create a UDP source port).

Before fix:

  95.02% CPU utilization
  154/256/505 90/95/99% latencies
  1.13042e+06 tps

  Time in functions:
    0.28% skb_flow_dissect
    0.21% __skb_get_hash

After fix:

  94.95% CPU utilization
  156/254/485 90/95/99% latencies
  1.15447e+06

  Neither __skb_get_hash nor skb_flow_dissect appear in perf

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h      | 14 ++++++++++++++
 include/net/ipv6.h    | 15 +++++++++++++++
 include/net/sock.h    | 11 +++++++++++
 net/ipv4/datagram.c   |  1 +
 net/ipv4/tcp_ipv4.c   |  3 +++
 net/ipv4/tcp_output.c |  1 +
 net/ipv6/datagram.c   |  1 +
 net/ipv6/tcp_ipv6.c   |  4 ++++
 8 files changed, 50 insertions(+)

(limited to 'net/ipv4')

diff --git a/include/net/ip.h b/include/net/ip.h
index 0e795df05ec9..2e8f055989c3 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -31,6 +31,7 @@
 #include <net/route.h>
 #include <net/snmp.h>
 #include <net/flow.h>
+#include <net/flow_keys.h>
 
 struct sock;
 
@@ -353,6 +354,19 @@ static inline __wsum inet_compute_pseudo(struct sk_buff *skb, int proto)
 				  skb->len, proto, 0);
 }
 
+static inline void inet_set_txhash(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct flow_keys keys;
+
+	keys.src = inet->inet_saddr;
+	keys.dst = inet->inet_daddr;
+	keys.port16[0] = inet->inet_sport;
+	keys.port16[1] = inet->inet_dport;
+
+	sk->sk_txhash = flow_hash_from_keys(&keys);
+}
+
 /*
  *	Map a multicast IP onto multicast MAC for type ethernet.
  */
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 574337fe72dd..2aa86e1135a1 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -19,6 +19,7 @@
 #include <net/if_inet6.h>
 #include <net/ndisc.h>
 #include <net/flow.h>
+#include <net/flow_keys.h>
 #include <net/snmp.h>
 
 #define SIN6_LEN_RFC2133	24
@@ -684,6 +685,20 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6,
 	return hlimit;
 }
 
+static inline void ip6_set_txhash(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct flow_keys keys;
+
+	keys.src = (__force __be32)ipv6_addr_hash(&np->saddr);
+	keys.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr);
+	keys.port16[0] = inet->inet_sport;
+	keys.port16[1] = inet->inet_dport;
+
+	sk->sk_txhash = flow_hash_from_keys(&keys);
+}
+
 /*
  *	Header manipulation
  */
diff --git a/include/net/sock.h b/include/net/sock.h
index 8d4c9473e7d7..cb84b2f1ad8f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -273,6 +273,7 @@ struct cg_proto;
   *	@sk_rcvtimeo: %SO_RCVTIMEO setting
   *	@sk_sndtimeo: %SO_SNDTIMEO setting
   *	@sk_rxhash: flow hash received from netif layer
+  *	@sk_txhash: computed flow hash for use on transmit
   *	@sk_filter: socket filtering instructions
   *	@sk_protinfo: private area, net family specific, when not using slab
   *	@sk_timer: sock cleanup timer
@@ -347,6 +348,7 @@ struct sock {
 #ifdef CONFIG_RPS
 	__u32			sk_rxhash;
 #endif
+	__u32			sk_txhash;
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	unsigned int		sk_napi_id;
 	unsigned int		sk_ll_usec;
@@ -1980,6 +1982,14 @@ static inline void sock_poll_wait(struct file *filp,
 	}
 }
 
+static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk)
+{
+	if (sk->sk_txhash) {
+		skb->l4_hash = 1;
+		skb->hash = sk->sk_txhash;
+	}
+}
+
 /*
  *	Queue a received datagram if it will fit. Stream and sequenced
  *	protocols can't normally use this as they need to fit buffers in
@@ -1994,6 +2004,7 @@ static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
 	skb_orphan(skb);
 	skb->sk = sk;
 	skb->destructor = sock_wfree;
+	skb_set_hash_from_sk(skb, sk);
 	/*
 	 * We used to take a refcount on sk, but following operation
 	 * is enough to guarantee sk_free() wont free this sock until
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index a3095fdefbed..90c0e8386116 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -76,6 +76,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	inet->inet_daddr = fl4->daddr;
 	inet->inet_dport = usin->sin_port;
 	sk->sk_state = TCP_ESTABLISHED;
+	inet_set_txhash(sk);
 	inet->inet_id = jiffies;
 
 	sk_dst_set(sk, &rt->dst);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 52d0f6a1ec2c..1edc739b9da5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -208,6 +208,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	inet->inet_dport = usin->sin_port;
 	inet->inet_daddr = daddr;
 
+	inet_set_txhash(sk);
+
 	inet_csk(sk)->icsk_ext_hdr_len = 0;
 	if (inet_opt)
 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
@@ -1334,6 +1336,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
 	newinet->rcv_tos      = ip_hdr(skb)->tos;
 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
+	inet_set_txhash(newsk);
 	if (inet_opt)
 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
 	newinet->inet_id = newtp->write_seq ^ jiffies;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f8f2a944a1ce..bcee13c4627c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -916,6 +916,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	skb_orphan(skb);
 	skb->sk = sk;
 	skb->destructor = tcp_wfree;
+	skb_set_hash_from_sk(skb, sk);
 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
 	/* Build TCP header and checksum it. */
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index c3bf2d2e519e..2753319524f1 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -199,6 +199,7 @@ ipv4_connected:
 		      NULL);
 
 	sk->sk_state = TCP_ESTABLISHED;
+	ip6_set_txhash(sk);
 out:
 	fl6_sock_release(flowlabel);
 	return err;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index a97c95585da8..22055b098428 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -198,6 +198,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 	sk->sk_v6_daddr = usin->sin6_addr;
 	np->flow_label = fl6.flowlabel;
 
+	ip6_set_txhash(sk);
+
 	/*
 	 *	TCP over IPv4
 	 */
@@ -1132,6 +1134,8 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr;
 	newsk->sk_bound_dev_if = ireq->ir_iif;
 
+	ip6_set_txhash(newsk);
+
 	/* Now IPv6 options...
 
 	   First: no IPv4 options.
-- 
cgit v1.2.3


From 4f6ad60cf357e1663d84de052af68dfccd734ec7 Mon Sep 17 00:00:00 2001
From: Fabian Frederick
Date: Wed, 2 Jul 2014 22:22:20 +0200
Subject: ipconfig: add static to local variable

ic_dev_xid is only used in ipconfig.c

Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: netdev@vger.kernel.org
Signed-off-by: Fabian Frederick <fabf@skynet.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipconfig.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index b3e86ea7b71b..f0f3f9f77b30 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -143,7 +143,7 @@ __be32 ic_servaddr = NONE;	/* Boot server IP address */
 __be32 root_server_addr = NONE;	/* Address of NFS server */
 u8 root_server_path[256] = { 0, };	/* Path to mount as root */
 
-__be32 ic_dev_xid;		/* Device under configuration */
+static __be32 ic_dev_xid;		/* Device under configuration */
 
 /* vendor class identifier */
 static char vendor_class_identifier[253] __initdata;
-- 
cgit v1.2.3


From 21621e93f24b5f8de8262ace269a5f692a826bed Mon Sep 17 00:00:00 2001
From: Fabian Frederick
Date: Wed, 9 Jul 2014 20:35:21 +0200
Subject: ipconfig: move ic_dev_xid under IPCONFIG_BOOTP

ic_dev_xid is only used in __init ic_bootp_recv under IPCONFIG_BOOTP
and __init ic_dynamic under IPCONFIG_DYNAMIC(which is itself defined
with the same IPCONFIG_BOOTP)

This patch fixes the following warning when IPCONFIG_BOOTP is not set:
>> net/ipv4/ipconfig.c:146:15: warning: 'ic_dev_xid' defined but not used [-Wunused-variable]
    static __be32 ic_dev_xid;  /* Device under configuration */

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: netdev@vger.kernel.org
Signed-off-by: Fabian Frederick <fabf@skynet.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipconfig.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index f0f3f9f77b30..f02f594d04c5 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -143,8 +143,6 @@ __be32 ic_servaddr = NONE;	/* Boot server IP address */
 __be32 root_server_addr = NONE;	/* Address of NFS server */
 u8 root_server_path[256] = { 0, };	/* Path to mount as root */
 
-static __be32 ic_dev_xid;		/* Device under configuration */
-
 /* vendor class identifier */
 static char vendor_class_identifier[253] __initdata;
 
@@ -654,6 +652,7 @@ static struct packet_type bootp_packet_type __initdata = {
 	.func =	ic_bootp_recv,
 };
 
+static __be32 ic_dev_xid;		/* Device under configuration */
 
 /*
  *  Initialize DHCP/BOOTP extension fields in the request.
-- 
cgit v1.2.3


From 405fd707196d596f59a510d176dedc979c1ae34b Mon Sep 17 00:00:00 2001
From: David S. Miller
Date: Wed, 9 Jul 2014 22:25:18 -0700
Subject: ipconfig: Only bootp paths should reference ic_dev_xid.

It is only tested, and declared, in the bootp code.

So, in ic_dynamic() guard it's setting with IPCONFIG_BOOTP.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipconfig.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index f02f594d04c5..5bbef4fdcb43 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1217,10 +1217,10 @@ static int __init ic_dynamic(void)
 	get_random_bytes(&timeout, sizeof(timeout));
 	timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned int) CONF_TIMEOUT_RANDOM);
 	for (;;) {
+#ifdef IPCONFIG_BOOTP
 		/* Track the device we are configuring */
 		ic_dev_xid = d->xid;
 
-#ifdef IPCONFIG_BOOTP
 		if (do_bootp && (d->able & IC_BOOTP))
 			ic_bootp_send_if(d, jiffies - start_jiffies);
 #endif
-- 
cgit v1.2.3


From a2f983f83b0f66a74a045d36eb84e2f01bb950c7 Mon Sep 17 00:00:00 2001
From: Li RongQing
Date: Fri, 11 Jul 2014 14:32:17 +0800
Subject: ipv4: remove the unnecessary variable in udp_mcast_next

Signed-off-by: Li RongQing <roy.qing.li@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index d92f94b7e402..ac30e106a884 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -600,19 +600,18 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
 					     int dif)
 {
 	struct hlist_nulls_node *node;
-	struct sock *s = sk;
 	unsigned short hnum = ntohs(loc_port);
 
-	sk_nulls_for_each_from(s, node) {
-		if (__udp_is_mcast_sock(net, s,
+	sk_nulls_for_each_from(sk, node) {
+		if (__udp_is_mcast_sock(net, sk,
 					loc_port, loc_addr,
 					rmt_port, rmt_addr,
 					dif, hnum))
 			goto found;
 	}
-	s = NULL;
+	sk = NULL;
 found:
-	return s;
+	return sk;
 }
 
 /*
-- 
cgit v1.2.3


From 8024e02879ddd5042be02c70557f74cdc70b44b4 Mon Sep 17 00:00:00 2001
From: Tom Herbert
Date: Sun, 13 Jul 2014 19:49:37 -0700
Subject: udp: Add udp_sock_create for UDP tunnels to open listener socket

Added udp_tunnel.c which can contain some common functions for UDP
tunnels. The first function in this is udp_sock_create which is used
to open the listener port for a UDP tunnel.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/udp_tunnel.h |  32 +++++++++++++++
 net/ipv4/Kconfig         |   4 ++
 net/ipv4/Makefile        |   1 +
 net/ipv4/udp_tunnel.c    | 100 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 137 insertions(+)
 create mode 100644 include/net/udp_tunnel.h
 create mode 100644 net/ipv4/udp_tunnel.c

(limited to 'net/ipv4')

diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
new file mode 100644
index 000000000000..ffd69cbded35
--- /dev/null
+++ b/include/net/udp_tunnel.h
@@ -0,0 +1,32 @@
+#ifndef __NET_UDP_TUNNEL_H
+#define __NET_UDP_TUNNEL_H
+
+struct udp_port_cfg {
+	u8			family;
+
+	/* Used only for kernel-created sockets */
+	union {
+		struct in_addr		local_ip;
+#if IS_ENABLED(CONFIG_IPV6)
+		struct in6_addr		local_ip6;
+#endif
+	};
+
+	union {
+		struct in_addr		peer_ip;
+#if IS_ENABLED(CONFIG_IPV6)
+		struct in6_addr		peer_ip6;
+#endif
+	};
+
+	__be16			local_udp_port;
+	__be16			peer_udp_port;
+	unsigned int		use_udp_checksums:1,
+				use_udp6_tx_checksums:1,
+				use_udp6_rx_checksums:1;
+};
+
+int udp_sock_create(struct net *net, struct udp_port_cfg *cfg,
+		    struct socket **sockp);
+
+#endif
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 05c57f0fcabe..dbc10d84161f 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -307,6 +307,10 @@ config NET_IPVTI
 	  the notion of a secure tunnel for IPSEC and then use routing protocol
 	  on top.
 
+config NET_UDP_TUNNEL
+	tristate
+	default n
+
 config INET_AH
 	tristate "IP: AH transformation"
 	select XFRM_ALGO
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f032688d20d3..8ee1cd4053ee 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_NET_IPIP) += ipip.o
 gre-y := gre_demux.o
 obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
 obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o
 obj-$(CONFIG_NET_IPVTI) += ip_vti.o
 obj-$(CONFIG_SYN_COOKIES) += syncookies.o
 obj-$(CONFIG_INET_AH) += ah4.o
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
new file mode 100644
index 000000000000..61ec1a65207e
--- /dev/null
+++ b/net/ipv4/udp_tunnel.c
@@ -0,0 +1,100 @@
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/socket.h>
+#include <linux/udp.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+#include <net/net_namespace.h>
+
+int udp_sock_create(struct net *net, struct udp_port_cfg *cfg,
+		    struct socket **sockp)
+{
+	int err = -EINVAL;
+	struct socket *sock = NULL;
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (cfg->family == AF_INET6) {
+		struct sockaddr_in6 udp6_addr;
+
+		err = sock_create_kern(AF_INET6, SOCK_DGRAM, 0, &sock);
+		if (err < 0)
+			goto error;
+
+		sk_change_net(sock->sk, net);
+
+		udp6_addr.sin6_family = AF_INET6;
+		memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,
+		       sizeof(udp6_addr.sin6_addr));
+		udp6_addr.sin6_port = cfg->local_udp_port;
+		err = kernel_bind(sock, (struct sockaddr *)&udp6_addr,
+				  sizeof(udp6_addr));
+		if (err < 0)
+			goto error;
+
+		if (cfg->peer_udp_port) {
+			udp6_addr.sin6_family = AF_INET6;
+			memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6,
+			       sizeof(udp6_addr.sin6_addr));
+			udp6_addr.sin6_port = cfg->peer_udp_port;
+			err = kernel_connect(sock,
+					     (struct sockaddr *)&udp6_addr,
+					     sizeof(udp6_addr), 0);
+		}
+		if (err < 0)
+			goto error;
+
+		udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums);
+		udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums);
+	} else
+#endif
+	if (cfg->family == AF_INET) {
+		struct sockaddr_in udp_addr;
+
+		err = sock_create_kern(AF_INET, SOCK_DGRAM, 0, &sock);
+		if (err < 0)
+			goto error;
+
+		sk_change_net(sock->sk, net);
+
+		udp_addr.sin_family = AF_INET;
+		udp_addr.sin_addr = cfg->local_ip;
+		udp_addr.sin_port = cfg->local_udp_port;
+		err = kernel_bind(sock, (struct sockaddr *)&udp_addr,
+				  sizeof(udp_addr));
+		if (err < 0)
+			goto error;
+
+		if (cfg->peer_udp_port) {
+			udp_addr.sin_family = AF_INET;
+			udp_addr.sin_addr = cfg->peer_ip;
+			udp_addr.sin_port = cfg->peer_udp_port;
+			err = kernel_connect(sock,
+					     (struct sockaddr *)&udp_addr,
+					     sizeof(udp_addr), 0);
+			if (err < 0)
+				goto error;
+		}
+
+		sock->sk->sk_no_check_tx = !cfg->use_udp_checksums;
+	} else {
+		return -EPFNOSUPPORT;
+	}
+
+
+	*sockp = sock;
+
+	return 0;
+
+error:
+	if (sock) {
+		kernel_sock_shutdown(sock, SHUT_RDWR);
+		sk_release_kernel(sock->sk);
+	}
+	*sockp = NULL;
+	return err;
+}
+EXPORT_SYMBOL(udp_sock_create);
+
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 155e010edbc168f43bb332887c43e8579ee3894a Mon Sep 17 00:00:00 2001
From: Tom Herbert
Date: Sun, 13 Jul 2014 19:49:56 -0700
Subject: udp: Move udp_tunnel_segment into udp_offload.c

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c         | 76 --------------------------------------------------
 net/ipv4/udp_offload.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 76 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ac30e106a884..f6dfe525584f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2522,79 +2522,3 @@ void __init udp_init(void)
 	sysctl_udp_rmem_min = SK_MEM_QUANTUM;
 	sysctl_udp_wmem_min = SK_MEM_QUANTUM;
 }
-
-struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
-				       netdev_features_t features)
-{
-	struct sk_buff *segs = ERR_PTR(-EINVAL);
-	u16 mac_offset = skb->mac_header;
-	int mac_len = skb->mac_len;
-	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
-	__be16 protocol = skb->protocol;
-	netdev_features_t enc_features;
-	int udp_offset, outer_hlen;
-	unsigned int oldlen;
-	bool need_csum;
-
-	oldlen = (u16)~skb->len;
-
-	if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
-		goto out;
-
-	skb->encapsulation = 0;
-	__skb_pull(skb, tnl_hlen);
-	skb_reset_mac_header(skb);
-	skb_set_network_header(skb, skb_inner_network_offset(skb));
-	skb->mac_len = skb_inner_network_offset(skb);
-	skb->protocol = htons(ETH_P_TEB);
-
-	need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
-	if (need_csum)
-		skb->encap_hdr_csum = 1;
-
-	/* segment inner packet. */
-	enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
-	segs = skb_mac_gso_segment(skb, enc_features);
-	if (!segs || IS_ERR(segs)) {
-		skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
-				     mac_len);
-		goto out;
-	}
-
-	outer_hlen = skb_tnl_header_len(skb);
-	udp_offset = outer_hlen - tnl_hlen;
-	skb = segs;
-	do {
-		struct udphdr *uh;
-		int len;
-
-		skb_reset_inner_headers(skb);
-		skb->encapsulation = 1;
-
-		skb->mac_len = mac_len;
-
-		skb_push(skb, outer_hlen);
-		skb_reset_mac_header(skb);
-		skb_set_network_header(skb, mac_len);
-		skb_set_transport_header(skb, udp_offset);
-		len = skb->len - udp_offset;
-		uh = udp_hdr(skb);
-		uh->len = htons(len);
-
-		if (need_csum) {
-			__be32 delta = htonl(oldlen + len);
-
-			uh->check = ~csum_fold((__force __wsum)
-					       ((__force u32)uh->check +
-						(__force u32)delta));
-			uh->check = gso_make_checksum(skb, ~uh->check);
-
-			if (uh->check == 0)
-				uh->check = CSUM_MANGLED_0;
-		}
-
-		skb->protocol = protocol;
-	} while ((skb = skb->next));
-out:
-	return segs;
-}
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 546d2d439dda..4807544d018b 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -47,6 +47,82 @@ static int udp4_ufo_send_check(struct sk_buff *skb)
 	return 0;
 }
 
+struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
+				       netdev_features_t features)
+{
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	u16 mac_offset = skb->mac_header;
+	int mac_len = skb->mac_len;
+	int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+	__be16 protocol = skb->protocol;
+	netdev_features_t enc_features;
+	int udp_offset, outer_hlen;
+	unsigned int oldlen;
+	bool need_csum;
+
+	oldlen = (u16)~skb->len;
+
+	if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
+		goto out;
+
+	skb->encapsulation = 0;
+	__skb_pull(skb, tnl_hlen);
+	skb_reset_mac_header(skb);
+	skb_set_network_header(skb, skb_inner_network_offset(skb));
+	skb->mac_len = skb_inner_network_offset(skb);
+	skb->protocol = htons(ETH_P_TEB);
+
+	need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
+	if (need_csum)
+		skb->encap_hdr_csum = 1;
+
+	/* segment inner packet. */
+	enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
+	segs = skb_mac_gso_segment(skb, enc_features);
+	if (!segs || IS_ERR(segs)) {
+		skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
+				     mac_len);
+		goto out;
+	}
+
+	outer_hlen = skb_tnl_header_len(skb);
+	udp_offset = outer_hlen - tnl_hlen;
+	skb = segs;
+	do {
+		struct udphdr *uh;
+		int len;
+
+		skb_reset_inner_headers(skb);
+		skb->encapsulation = 1;
+
+		skb->mac_len = mac_len;
+
+		skb_push(skb, outer_hlen);
+		skb_reset_mac_header(skb);
+		skb_set_network_header(skb, mac_len);
+		skb_set_transport_header(skb, udp_offset);
+		len = skb->len - udp_offset;
+		uh = udp_hdr(skb);
+		uh->len = htons(len);
+
+		if (need_csum) {
+			__be32 delta = htonl(oldlen + len);
+
+			uh->check = ~csum_fold((__force __wsum)
+					       ((__force u32)uh->check +
+						(__force u32)delta));
+			uh->check = gso_make_checksum(skb, ~uh->check);
+
+			if (uh->check == 0)
+				uh->check = CSUM_MANGLED_0;
+		}
+
+		skb->protocol = protocol;
+	} while ((skb = skb->next));
+out:
+	return segs;
+}
+
 static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
 					 netdev_features_t features)
 {
-- 
cgit v1.2.3


From c835a677331495cf137a7f8a023463afd9f032f8 Mon Sep 17 00:00:00 2001
From: Tom Gundersen
Date: Mon, 14 Jul 2014 16:37:24 +0200
Subject: net: set name_assign_type in alloc_netdev()

Extend alloc_netdev{,_mq{,s}}() to take name_assign_type as argument, and convert
all users to pass NET_NAME_UNKNOWN.

Coccinelle patch:

@@
expression sizeof_priv, name, setup, txqs, rxqs, count;
@@

(
-alloc_netdev_mqs(sizeof_priv, name, setup, txqs, rxqs)
+alloc_netdev_mqs(sizeof_priv, name, NET_NAME_UNKNOWN, setup, txqs, rxqs)
|
-alloc_netdev_mq(sizeof_priv, name, setup, count)
+alloc_netdev_mq(sizeof_priv, name, NET_NAME_UNKNOWN, setup, count)
|
-alloc_netdev(sizeof_priv, name, setup)
+alloc_netdev(sizeof_priv, name, NET_NAME_UNKNOWN, setup)
)

v9: move comments here from the wrong commit

Signed-off-by: Tom Gundersen <teg@jklm.no>
Reviewed-by: David Herrmann <dh.herrmann@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/firewire/net.c                              |  3 ++-
 drivers/hsi/clients/ssi_protocol.c                  |  2 +-
 drivers/infiniband/hw/amso1100/c2_provider.c        |  2 +-
 drivers/infiniband/ulp/ipoib/ipoib_main.c           |  4 ++--
 drivers/isdn/i4l/isdn_net.c                         |  3 ++-
 drivers/media/dvb-core/dvb_net.c                    |  3 ++-
 drivers/misc/sgi-xp/xpnet.c                         |  3 ++-
 drivers/net/arcnet/arcnet.c                         |  3 ++-
 drivers/net/bonding/bond_main.c                     |  2 +-
 drivers/net/caif/caif_serial.c                      |  3 ++-
 drivers/net/caif/caif_spi.c                         |  4 ++--
 drivers/net/caif/caif_virtio.c                      |  2 +-
 drivers/net/can/dev.c                               |  2 +-
 drivers/net/can/slcan.c                             |  2 +-
 drivers/net/dummy.c                                 |  2 +-
 drivers/net/eql.c                                   |  3 ++-
 drivers/net/ethernet/8390/lib8390.c                 |  2 +-
 drivers/net/ethernet/tile/tilegx.c                  |  4 ++--
 drivers/net/ethernet/tile/tilepro.c                 |  3 ++-
 drivers/net/hamradio/6pack.c                        |  3 ++-
 drivers/net/hamradio/baycom_epp.c                   |  2 +-
 drivers/net/hamradio/bpqether.c                     |  4 ++--
 drivers/net/hamradio/dmascc.c                       |  4 ++--
 drivers/net/hamradio/hdlcdrv.c                      |  2 +-
 drivers/net/hamradio/mkiss.c                        |  3 ++-
 drivers/net/hamradio/scc.c                          |  2 +-
 drivers/net/hamradio/yam.c                          |  2 +-
 drivers/net/ieee802154/fakehard.c                   |  3 ++-
 drivers/net/ifb.c                                   |  4 ++--
 drivers/net/loopback.c                              |  2 +-
 drivers/net/ppp/ppp_generic.c                       |  3 ++-
 drivers/net/slip/slip.c                             |  2 +-
 drivers/net/tun.c                                   |  3 ++-
 drivers/net/usb/cdc-phonet.c                        |  2 +-
 drivers/net/usb/hso.c                               |  3 ++-
 drivers/net/wan/dlci.c                              |  4 ++--
 drivers/net/wan/hdlc.c                              |  3 ++-
 drivers/net/wan/hdlc_fr.c                           |  5 +++--
 drivers/net/wan/lapbether.c                         |  4 ++--
 drivers/net/wan/sbni.c                              |  7 ++++---
 drivers/net/wan/sdla.c                              |  3 ++-
 drivers/net/wan/x25_asy.c                           |  4 ++--
 drivers/net/wimax/i2400m/usb.c                      |  2 +-
 drivers/net/wireless/airo.c                         |  5 +++--
 drivers/net/wireless/ath/ath6kl/cfg80211.c          |  2 +-
 drivers/net/wireless/ath/wil6210/netdev.c           |  2 +-
 drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c |  3 ++-
 drivers/net/wireless/libertas/main.c                |  2 +-
 drivers/net/wireless/libertas/mesh.c                |  2 +-
 drivers/net/wireless/mac80211_hwsim.c               |  3 ++-
 drivers/net/wireless/mwifiex/cfg80211.c             |  3 ++-
 drivers/net/xen-netback/interface.c                 |  4 ++--
 drivers/s390/net/claw.c                             |  2 +-
 drivers/s390/net/ctcm_main.c                        |  6 ++++--
 drivers/s390/net/netiucv.c                          |  2 +-
 drivers/s390/net/qeth_l2_main.c                     |  6 ++++--
 drivers/s390/net/qeth_l3_main.c                     |  3 ++-
 drivers/staging/cxt1e1/linux.c                      |  3 ++-
 drivers/staging/gdm724x/gdm_lte.c                   |  2 +-
 drivers/staging/gdm72xx/gdm_wimax.c                 |  3 ++-
 drivers/staging/vt6655/wpactl.c                     |  3 ++-
 drivers/staging/wlan-ng/p80211netdev.c              |  2 +-
 drivers/tty/n_gsm.c                                 |  5 ++---
 drivers/usb/gadget/f_phonet.c                       |  3 ++-
 include/linux/netdevice.h                           | 10 ++++++----
 net/802/fc.c                                        |  2 +-
 net/802/fddi.c                                      |  3 ++-
 net/802/hippi.c                                     |  3 ++-
 net/8021q/vlan.c                                    |  3 ++-
 net/appletalk/dev.c                                 |  3 ++-
 net/atm/br2684.c                                    |  4 ++--
 net/atm/clip.c                                      |  3 ++-
 net/batman-adv/soft-interface.c                     |  2 +-
 net/bluetooth/6lowpan.c                             |  2 +-
 net/bluetooth/bnep/core.c                           |  5 +++--
 net/bridge/br_if.c                                  |  2 +-
 net/core/dev.c                                      | 13 ++++++++-----
 net/core/rtnetlink.c                                |  4 ++--
 net/dsa/slave.c                                     |  4 ++--
 net/ethernet/eth.c                                  |  3 ++-
 net/ipv4/ip_tunnel.c                                |  2 +-
 net/ipv4/ipmr.c                                     |  2 +-
 net/ipv6/ip6_gre.c                                  |  6 ++++--
 net/ipv6/ip6_tunnel.c                               |  5 +++--
 net/ipv6/ip6_vti.c                                  |  4 ++--
 net/ipv6/ip6mr.c                                    |  2 +-
 net/ipv6/sit.c                                      |  4 +++-
 net/irda/irda_device.c                              |  3 ++-
 net/irda/irlan/irlan_eth.c                          |  2 +-
 net/l2tp/l2tp_eth.c                                 |  3 ++-
 net/mac80211/iface.c                                |  6 +++---
 net/mac802154/ieee802154_dev.c                      |  6 ++++--
 net/netrom/af_netrom.c                              |  2 +-
 net/openvswitch/vport-internal_dev.c                |  3 ++-
 net/phonet/pep-gprs.c                               |  2 +-
 net/rose/af_rose.c                                  |  2 +-
 net/sched/sch_teql.c                                |  4 ++--
 97 files changed, 185 insertions(+), 133 deletions(-)

(limited to 'net/ipv4')

diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c
index c3986452194d..2c68da1ceeee 100644
--- a/drivers/firewire/net.c
+++ b/drivers/firewire/net.c
@@ -1460,7 +1460,8 @@ static int fwnet_probe(struct fw_unit *unit,
 		goto have_dev;
 	}
 
-	net = alloc_netdev(sizeof(*dev), "firewire%d", fwnet_init_dev);
+	net = alloc_netdev(sizeof(*dev), "firewire%d", NET_NAME_UNKNOWN,
+			   fwnet_init_dev);
 	if (net == NULL) {
 		mutex_unlock(&fwnet_device_mutex);
 		return -ENOMEM;
diff --git a/drivers/hsi/clients/ssi_protocol.c b/drivers/hsi/clients/ssi_protocol.c
index ce4be3738d46..737fa2e0e782 100644
--- a/drivers/hsi/clients/ssi_protocol.c
+++ b/drivers/hsi/clients/ssi_protocol.c
@@ -1115,7 +1115,7 @@ static int ssi_protocol_probe(struct device *dev)
 		goto out;
 	}
 
-	ssi->netdev = alloc_netdev(0, ifname, ssip_pn_setup);
+	ssi->netdev = alloc_netdev(0, ifname, NET_NAME_UNKNOWN, ssip_pn_setup);
 	if (!ssi->netdev) {
 		dev_err(dev, "No memory for netdev\n");
 		err = -ENOMEM;
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c
index 8af33cf1fc4e..2d5cbf4363e4 100644
--- a/drivers/infiniband/hw/amso1100/c2_provider.c
+++ b/drivers/infiniband/hw/amso1100/c2_provider.c
@@ -734,7 +734,7 @@ static struct net_device *c2_pseudo_netdev_init(struct c2_dev *c2dev)
 	/* change ethxxx to iwxxx */
 	strcpy(name, "iw");
 	strcat(name, &c2dev->netdev->name[3]);
-	netdev = alloc_netdev(0, name, setup);
+	netdev = alloc_netdev(0, name, NET_NAME_UNKNOWN, setup);
 	if (!netdev) {
 		printk(KERN_ERR PFX "%s -  etherdev alloc failed",
 			__func__);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 5786a78ff8bc..4e675f4fecc9 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1394,8 +1394,8 @@ struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
 {
 	struct net_device *dev;
 
-	dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name,
-			   ipoib_setup);
+	dev = alloc_netdev((int)sizeof(struct ipoib_dev_priv), name,
+			   NET_NAME_UNKNOWN, ipoib_setup);
 	if (!dev)
 		return NULL;
 
diff --git a/drivers/isdn/i4l/isdn_net.c b/drivers/isdn/i4l/isdn_net.c
index d9aebbc510cc..c2ed6246a389 100644
--- a/drivers/isdn/i4l/isdn_net.c
+++ b/drivers/isdn/i4l/isdn_net.c
@@ -2588,7 +2588,8 @@ isdn_net_new(char *name, struct net_device *master)
 		printk(KERN_WARNING "isdn_net: Could not allocate net-device\n");
 		return NULL;
 	}
-	netdev->dev = alloc_netdev(sizeof(isdn_net_local), name, _isdn_setup);
+	netdev->dev = alloc_netdev(sizeof(isdn_net_local), name,
+				   NET_NAME_UNKNOWN, _isdn_setup);
 	if (!netdev->dev) {
 		printk(KERN_WARNING "isdn_net: Could not allocate network device\n");
 		kfree(netdev);
diff --git a/drivers/media/dvb-core/dvb_net.c b/drivers/media/dvb-core/dvb_net.c
index 8a86b3025637..059e6117f22b 100644
--- a/drivers/media/dvb-core/dvb_net.c
+++ b/drivers/media/dvb-core/dvb_net.c
@@ -1276,7 +1276,8 @@ static int dvb_net_add_if(struct dvb_net *dvbnet, u16 pid, u8 feedtype)
 	if ((if_num = get_if(dvbnet)) < 0)
 		return -EINVAL;
 
-	net = alloc_netdev(sizeof(struct dvb_net_priv), "dvb", dvb_net_setup);
+	net = alloc_netdev(sizeof(struct dvb_net_priv), "dvb",
+			   NET_NAME_UNKNOWN, dvb_net_setup);
 	if (!net)
 		return -ENOMEM;
 
diff --git a/drivers/misc/sgi-xp/xpnet.c b/drivers/misc/sgi-xp/xpnet.c
index 3fac67a5204c..557f9782c53c 100644
--- a/drivers/misc/sgi-xp/xpnet.c
+++ b/drivers/misc/sgi-xp/xpnet.c
@@ -544,7 +544,8 @@ xpnet_init(void)
 	 * use ether_setup() to init the majority of our device
 	 * structure and then override the necessary pieces.
 	 */
-	xpnet_device = alloc_netdev(0, XPNET_DEVICE_NAME, ether_setup);
+	xpnet_device = alloc_netdev(0, XPNET_DEVICE_NAME, NET_NAME_UNKNOWN,
+				    ether_setup);
 	if (xpnet_device == NULL) {
 		kfree(xpnet_broadcast_partitions);
 		return -ENOMEM;
diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c
index a956053608f9..3b790de6c976 100644
--- a/drivers/net/arcnet/arcnet.c
+++ b/drivers/net/arcnet/arcnet.c
@@ -346,7 +346,8 @@ struct net_device *alloc_arcdev(const char *name)
 	struct net_device *dev;
 
 	dev = alloc_netdev(sizeof(struct arcnet_local),
-			   name && *name ? name : "arc%d", arcdev_setup);
+			   name && *name ? name : "arc%d", NET_NAME_UNKNOWN,
+			   arcdev_setup);
 	if(dev) {
 		struct arcnet_local *lp = netdev_priv(dev);
 		spin_lock_init(&lp->lock);
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 09dc3ef771a7..46dcb7b6216f 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4420,7 +4420,7 @@ int bond_create(struct net *net, const char *name)
 	rtnl_lock();
 
 	bond_dev = alloc_netdev_mq(sizeof(struct bonding),
-				   name ? name : "bond%d",
+				   name ? name : "bond%d", NET_NAME_UNKNOWN,
 				   bond_setup, tx_queues);
 	if (!bond_dev) {
 		pr_err("%s: eek! can't alloc netdev!\n", name);
diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c
index fc73865bb83a..27bbc56de15f 100644
--- a/drivers/net/caif/caif_serial.c
+++ b/drivers/net/caif/caif_serial.c
@@ -349,7 +349,8 @@ static int ldisc_open(struct tty_struct *tty)
 	result = snprintf(name, sizeof(name), "cf%s", tty->name);
 	if (result >= IFNAMSIZ)
 		return -EINVAL;
-	dev = alloc_netdev(sizeof(*ser), name, caifdev_setup);
+	dev = alloc_netdev(sizeof(*ser), name, NET_NAME_UNKNOWN,
+			   caifdev_setup);
 	if (!dev)
 		return -ENOMEM;
 
diff --git a/drivers/net/caif/caif_spi.c b/drivers/net/caif/caif_spi.c
index ff54c0eb2052..72ea9ff9bb9c 100644
--- a/drivers/net/caif/caif_spi.c
+++ b/drivers/net/caif/caif_spi.c
@@ -730,8 +730,8 @@ int cfspi_spi_probe(struct platform_device *pdev)
 	int res;
 	dev = (struct cfspi_dev *)pdev->dev.platform_data;
 
-	ndev = alloc_netdev(sizeof(struct cfspi),
-			"cfspi%d", cfspi_setup);
+	ndev = alloc_netdev(sizeof(struct cfspi), "cfspi%d",
+			    NET_NAME_UNKNOWN, cfspi_setup);
 	if (!dev)
 		return -ENODEV;
 
diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c
index 985608634f8c..a5fefb9059c5 100644
--- a/drivers/net/caif/caif_virtio.c
+++ b/drivers/net/caif/caif_virtio.c
@@ -661,7 +661,7 @@ static int cfv_probe(struct virtio_device *vdev)
 	int err = -EINVAL;
 
 	netdev = alloc_netdev(sizeof(struct cfv_info), cfv_netdev_name,
-			      cfv_netdev_setup);
+			      NET_NAME_UNKNOWN, cfv_netdev_setup);
 	if (!netdev)
 		return -ENOMEM;
 
diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index e318e87e2bfc..9f91fcba43f8 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -565,7 +565,7 @@ struct net_device *alloc_candev(int sizeof_priv, unsigned int echo_skb_max)
 	else
 		size = sizeof_priv;
 
-	dev = alloc_netdev(size, "can%d", can_setup);
+	dev = alloc_netdev(size, "can%d", NET_NAME_UNKNOWN, can_setup);
 	if (!dev)
 		return NULL;
 
diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c
index ea4d4f1a6411..acb5b92ace92 100644
--- a/drivers/net/can/slcan.c
+++ b/drivers/net/can/slcan.c
@@ -529,7 +529,7 @@ static struct slcan *slc_alloc(dev_t line)
 		return NULL;
 
 	sprintf(name, "slcan%d", i);
-	dev = alloc_netdev(sizeof(*sl), name, slc_setup);
+	dev = alloc_netdev(sizeof(*sl), name, NET_NAME_UNKNOWN, slc_setup);
 	if (!dev)
 		return NULL;
 
diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c
index 0932ffbf381b..ff435fbd1ad0 100644
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -164,7 +164,7 @@ static int __init dummy_init_one(void)
 	struct net_device *dev_dummy;
 	int err;
 
-	dev_dummy = alloc_netdev(0, "dummy%d", dummy_setup);
+	dev_dummy = alloc_netdev(0, "dummy%d", NET_NAME_UNKNOWN, dummy_setup);
 	if (!dev_dummy)
 		return -ENOMEM;
 
diff --git a/drivers/net/eql.c b/drivers/net/eql.c
index 7a79b6046879..957e5c0cede3 100644
--- a/drivers/net/eql.c
+++ b/drivers/net/eql.c
@@ -585,7 +585,8 @@ static int __init eql_init_module(void)
 
 	pr_info("%s\n", version);
 
-	dev_eql = alloc_netdev(sizeof(equalizer_t), "eql", eql_setup);
+	dev_eql = alloc_netdev(sizeof(equalizer_t), "eql", NET_NAME_UNKNOWN,
+			       eql_setup);
 	if (!dev_eql)
 		return -ENOMEM;
 
diff --git a/drivers/net/ethernet/8390/lib8390.c b/drivers/net/ethernet/8390/lib8390.c
index 599311f0e05c..b96e8852b2d1 100644
--- a/drivers/net/ethernet/8390/lib8390.c
+++ b/drivers/net/ethernet/8390/lib8390.c
@@ -986,7 +986,7 @@ static void ethdev_setup(struct net_device *dev)
 static struct net_device *____alloc_ei_netdev(int size)
 {
 	return alloc_netdev(sizeof(struct ei_device) + size, "eth%d",
-				ethdev_setup);
+			    NET_NAME_UNKNOWN, ethdev_setup);
 }
 
 
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
index 4c70360967c2..69557a26f749 100644
--- a/drivers/net/ethernet/tile/tilegx.c
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -2201,8 +2201,8 @@ static void tile_net_dev_init(const char *name, const uint8_t *mac)
 	/* Allocate the device structure.  Normally, "name" is a
 	 * template, instantiated by register_netdev(), but not for us.
 	 */
-	dev = alloc_netdev_mqs(sizeof(*priv), name, tile_net_setup,
-			       NR_CPUS, 1);
+	dev = alloc_netdev_mqs(sizeof(*priv), name, NET_NAME_UNKNOWN,
+			       tile_net_setup, NR_CPUS, 1);
 	if (!dev) {
 		pr_err("alloc_netdev_mqs(%s) failed\n", name);
 		return;
diff --git a/drivers/net/ethernet/tile/tilepro.c b/drivers/net/ethernet/tile/tilepro.c
index e5a5c5d4ce0c..88c712126692 100644
--- a/drivers/net/ethernet/tile/tilepro.c
+++ b/drivers/net/ethernet/tile/tilepro.c
@@ -2292,7 +2292,8 @@ static struct net_device *tile_net_dev_init(const char *name)
 	 * tile_net_setup(), and saves "name".  Normally, "name" is a
 	 * template, instantiated by register_netdev(), but not for us.
 	 */
-	dev = alloc_netdev(sizeof(*priv), name, tile_net_setup);
+	dev = alloc_netdev(sizeof(*priv), name, NET_NAME_UNKNOWN,
+			   tile_net_setup);
 	if (!dev) {
 		pr_err("alloc_netdev(%s) failed\n", name);
 		return NULL;
diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c
index 66e2b19ef709..c3c4051a089d 100644
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -596,7 +596,8 @@ static int sixpack_open(struct tty_struct *tty)
 	if (tty->ops->write == NULL)
 		return -EOPNOTSUPP;
 
-	dev = alloc_netdev(sizeof(struct sixpack), "sp%d", sp_setup);
+	dev = alloc_netdev(sizeof(struct sixpack), "sp%d", NET_NAME_UNKNOWN,
+			   sp_setup);
 	if (!dev) {
 		err = -ENOMEM;
 		goto out;
diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c
index 484f77ec2ce1..a98c153f371e 100644
--- a/drivers/net/hamradio/baycom_epp.c
+++ b/drivers/net/hamradio/baycom_epp.c
@@ -1206,7 +1206,7 @@ static int __init init_baycomepp(void)
 		struct net_device *dev;
 		
 		dev = alloc_netdev(sizeof(struct baycom_state), "bce%d",
-				   baycom_epp_dev_setup);
+				   NET_NAME_UNKNOWN, baycom_epp_dev_setup);
 
 		if (!dev) {
 			printk(KERN_WARNING "bce%d : out of memory\n", i);
diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c
index d50b23cf9ea9..c2894e43840e 100644
--- a/drivers/net/hamradio/bpqether.c
+++ b/drivers/net/hamradio/bpqether.c
@@ -501,8 +501,8 @@ static int bpq_new_device(struct net_device *edev)
 	struct net_device *ndev;
 	struct bpqdev *bpq;
 
-	ndev = alloc_netdev(sizeof(struct bpqdev), "bpq%d",
-			   bpq_setup);
+	ndev = alloc_netdev(sizeof(struct bpqdev), "bpq%d", NET_NAME_UNKNOWN,
+			    bpq_setup);
 	if (!ndev)
 		return -ENOMEM;
 
diff --git a/drivers/net/hamradio/dmascc.c b/drivers/net/hamradio/dmascc.c
index 6636022a1027..0fad408f24aa 100644
--- a/drivers/net/hamradio/dmascc.c
+++ b/drivers/net/hamradio/dmascc.c
@@ -466,7 +466,7 @@ static int __init setup_adapter(int card_base, int type, int n)
 	if (!info)
 		goto out;
 
-	info->dev[0] = alloc_netdev(0, "", dev_setup);
+	info->dev[0] = alloc_netdev(0, "", NET_NAME_UNKNOWN, dev_setup);
 	if (!info->dev[0]) {
 		printk(KERN_ERR "dmascc: "
 		       "could not allocate memory for %s at %#3x\n",
@@ -474,7 +474,7 @@ static int __init setup_adapter(int card_base, int type, int n)
 		goto out1;
 	}
 
-	info->dev[1] = alloc_netdev(0, "", dev_setup);
+	info->dev[1] = alloc_netdev(0, "", NET_NAME_UNKNOWN, dev_setup);
 	if (!info->dev[1]) {
 		printk(KERN_ERR "dmascc: "
 		       "could not allocate memory for %s at %#3x\n",
diff --git a/drivers/net/hamradio/hdlcdrv.c b/drivers/net/hamradio/hdlcdrv.c
index 5d78c1d08abd..c67a27245072 100644
--- a/drivers/net/hamradio/hdlcdrv.c
+++ b/drivers/net/hamradio/hdlcdrv.c
@@ -699,7 +699,7 @@ struct net_device *hdlcdrv_register(const struct hdlcdrv_ops *ops,
 	if (privsize < sizeof(struct hdlcdrv_state))
 		privsize = sizeof(struct hdlcdrv_state);
 
-	dev = alloc_netdev(privsize, ifname, hdlcdrv_setup);
+	dev = alloc_netdev(privsize, ifname, NET_NAME_UNKNOWN, hdlcdrv_setup);
 	if (!dev)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c
index 8a6c720a4cc9..f990bb1c3e02 100644
--- a/drivers/net/hamradio/mkiss.c
+++ b/drivers/net/hamradio/mkiss.c
@@ -734,7 +734,8 @@ static int mkiss_open(struct tty_struct *tty)
 	if (tty->ops->write == NULL)
 		return -EOPNOTSUPP;
 
-	dev = alloc_netdev(sizeof(struct mkiss), "ax%d", ax_setup);
+	dev = alloc_netdev(sizeof(struct mkiss), "ax%d", NET_NAME_UNKNOWN,
+			   ax_setup);
 	if (!dev) {
 		err = -ENOMEM;
 		goto out;
diff --git a/drivers/net/hamradio/scc.c b/drivers/net/hamradio/scc.c
index 4bc6ee8e7987..57be9e0e98a6 100644
--- a/drivers/net/hamradio/scc.c
+++ b/drivers/net/hamradio/scc.c
@@ -1515,7 +1515,7 @@ static int scc_net_alloc(const char *name, struct scc_channel *scc)
 	int err;
 	struct net_device *dev;
 
-	dev = alloc_netdev(0, name, scc_net_setup);
+	dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, scc_net_setup);
 	if (!dev) 
 		return -ENOMEM;
 
diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c
index 81901659cc9e..717433cfb81d 100644
--- a/drivers/net/hamradio/yam.c
+++ b/drivers/net/hamradio/yam.c
@@ -1147,7 +1147,7 @@ static int __init yam_init_driver(void)
 		sprintf(name, "yam%d", i);
 		
 		dev = alloc_netdev(sizeof(struct yam_port), name,
-				   yam_setup);
+				   NET_NAME_UNKNOWN, yam_setup);
 		if (!dev) {
 			pr_err("yam: cannot allocate net device\n");
 			err = -ENOMEM;
diff --git a/drivers/net/ieee802154/fakehard.c b/drivers/net/ieee802154/fakehard.c
index 78f18be3bbf2..9ce854f43917 100644
--- a/drivers/net/ieee802154/fakehard.c
+++ b/drivers/net/ieee802154/fakehard.c
@@ -343,7 +343,8 @@ static int ieee802154fake_probe(struct platform_device *pdev)
 	if (!phy)
 		return -ENOMEM;
 
-	dev = alloc_netdev(sizeof(struct fakehard_priv), "hardwpan%d", ieee802154_fake_setup);
+	dev = alloc_netdev(sizeof(struct fakehard_priv), "hardwpan%d",
+			   NET_NAME_UNKNOWN, ieee802154_fake_setup);
 	if (!dev) {
 		wpan_phy_free(phy);
 		return -ENOMEM;
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 46a7790be004..d2d4a3d2237f 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -269,8 +269,8 @@ static int __init ifb_init_one(int index)
 	struct ifb_private *dp;
 	int err;
 
-	dev_ifb = alloc_netdev(sizeof(struct ifb_private),
-				 "ifb%d", ifb_setup);
+	dev_ifb = alloc_netdev(sizeof(struct ifb_private), "ifb%d",
+			       NET_NAME_UNKNOWN, ifb_setup);
 
 	if (!dev_ifb)
 		return -ENOMEM;
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index bb96409f8c05..8f2262540561 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -195,7 +195,7 @@ static __net_init int loopback_net_init(struct net *net)
 	int err;
 
 	err = -ENOMEM;
-	dev = alloc_netdev(0, "lo", loopback_setup);
+	dev = alloc_netdev(0, "lo", NET_NAME_UNKNOWN, loopback_setup);
 	if (!dev)
 		goto out;
 
diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
index 91d6c1272fcf..5c002b1ef169 100644
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -2665,7 +2665,8 @@ ppp_create_interface(struct net *net, int unit, int *retp)
 	int ret = -ENOMEM;
 	int i;
 
-	dev = alloc_netdev(sizeof(struct ppp), "", ppp_setup);
+	dev = alloc_netdev(sizeof(struct ppp), "", NET_NAME_UNKNOWN,
+			   ppp_setup);
 	if (!dev)
 		goto out1;
 
diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c
index 87526443841f..05387b1e2e95 100644
--- a/drivers/net/slip/slip.c
+++ b/drivers/net/slip/slip.c
@@ -749,7 +749,7 @@ static struct slip *sl_alloc(dev_t line)
 		return NULL;
 
 	sprintf(name, "sl%d", i);
-	dev = alloc_netdev(sizeof(*sl), name, sl_setup);
+	dev = alloc_netdev(sizeof(*sl), name, NET_NAME_UNKNOWN, sl_setup);
 	if (!dev)
 		return NULL;
 
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 98bad1fb1bfb..acaaf6784179 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1633,7 +1633,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 			name = ifr->ifr_name;
 
 		dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
-				       tun_setup, queues, queues);
+				       NET_NAME_UNKNOWN, tun_setup, queues,
+				       queues);
 
 		if (!dev)
 			return -ENOMEM;
diff --git a/drivers/net/usb/cdc-phonet.c b/drivers/net/usb/cdc-phonet.c
index 6358d420e185..2ec1500d0077 100644
--- a/drivers/net/usb/cdc-phonet.c
+++ b/drivers/net/usb/cdc-phonet.c
@@ -387,7 +387,7 @@ static int usbpn_probe(struct usb_interface *intf, const struct usb_device_id *i
 		return -EINVAL;
 
 	dev = alloc_netdev(sizeof(*pnd) + sizeof(pnd->urbs[0]) * rxq_size,
-				ifname, usbpn_setup);
+			   ifname, NET_NAME_UNKNOWN, usbpn_setup);
 	if (!dev)
 		return -ENOMEM;
 
diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c
index a3a05869309d..50b36b299946 100644
--- a/drivers/net/usb/hso.c
+++ b/drivers/net/usb/hso.c
@@ -2520,7 +2520,8 @@ static struct hso_device *hso_create_net_device(struct usb_interface *interface,
 
 	/* allocate our network device, then we can put in our private data */
 	/* call hso_net_init to do the basic initialization */
-	net = alloc_netdev(sizeof(struct hso_net), "hso%d", hso_net_init);
+	net = alloc_netdev(sizeof(struct hso_net), "hso%d", NET_NAME_UNKNOWN,
+			   hso_net_init);
 	if (!net) {
 		dev_err(&interface->dev, "Unable to create ethernet device\n");
 		goto exit;
diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c
index 19f7cb2cdef3..a463613a0719 100644
--- a/drivers/net/wan/dlci.c
+++ b/drivers/net/wan/dlci.c
@@ -327,8 +327,8 @@ static int dlci_add(struct dlci_add *dlci)
 		goto err1;
 
 	/* create device name */
-	master = alloc_netdev( sizeof(struct dlci_local), "dlci%d",
-			      dlci_setup);
+	master = alloc_netdev(sizeof(struct dlci_local), "dlci%d",
+			      NET_NAME_UNKNOWN, dlci_setup);
 	if (!master) {
 		err = -ENOMEM;
 		goto err1;
diff --git a/drivers/net/wan/hdlc.c b/drivers/net/wan/hdlc.c
index 9c33ca918e19..51f6cee8aab2 100644
--- a/drivers/net/wan/hdlc.c
+++ b/drivers/net/wan/hdlc.c
@@ -256,7 +256,8 @@ static void hdlc_setup(struct net_device *dev)
 struct net_device *alloc_hdlcdev(void *priv)
 {
 	struct net_device *dev;
-	dev = alloc_netdev(sizeof(struct hdlc_device), "hdlc%d", hdlc_setup);
+	dev = alloc_netdev(sizeof(struct hdlc_device), "hdlc%d",
+			   NET_NAME_UNKNOWN, hdlc_setup);
 	if (dev)
 		dev_to_hdlc(dev)->priv = priv;
 	return dev;
diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c
index 7c6cb4f31798..7cc64eac0fa3 100644
--- a/drivers/net/wan/hdlc_fr.c
+++ b/drivers/net/wan/hdlc_fr.c
@@ -1075,10 +1075,11 @@ static int fr_add_pvc(struct net_device *frad, unsigned int dlci, int type)
 	used = pvc_is_used(pvc);
 
 	if (type == ARPHRD_ETHER) {
-		dev = alloc_netdev(0, "pvceth%d", ether_setup);
+		dev = alloc_netdev(0, "pvceth%d", NET_NAME_UNKNOWN,
+				   ether_setup);
 		dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 	} else
-		dev = alloc_netdev(0, "pvc%d", pvc_setup);
+		dev = alloc_netdev(0, "pvc%d", NET_NAME_UNKNOWN, pvc_setup);
 
 	if (!dev) {
 		netdev_warn(frad, "Memory squeeze on fr_pvc()\n");
diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c
index a33a46fa88dd..2f5eda8a7227 100644
--- a/drivers/net/wan/lapbether.c
+++ b/drivers/net/wan/lapbether.c
@@ -325,8 +325,8 @@ static int lapbeth_new_device(struct net_device *dev)
 
 	ASSERT_RTNL();
 
-	ndev = alloc_netdev(sizeof(*lapbeth), "lapb%d", 
-			   lapbeth_setup);
+	ndev = alloc_netdev(sizeof(*lapbeth), "lapb%d", NET_NAME_UNKNOWN,
+			    lapbeth_setup);
 	if (!ndev)
 		goto out;
 
diff --git a/drivers/net/wan/sbni.c b/drivers/net/wan/sbni.c
index 1b89ecf0959e..758c4ba1e97c 100644
--- a/drivers/net/wan/sbni.c
+++ b/drivers/net/wan/sbni.c
@@ -227,7 +227,8 @@ int __init sbni_probe(int unit)
 	struct net_device *dev;
 	int err;
 
-	dev = alloc_netdev(sizeof(struct net_local), "sbni", sbni_devsetup);
+	dev = alloc_netdev(sizeof(struct net_local), "sbni",
+			   NET_NAME_UNKNOWN, sbni_devsetup);
 	if (!dev)
 		return -ENOMEM;
 
@@ -1477,8 +1478,8 @@ int __init init_module( void )
 	int err;
 
 	while( num < SBNI_MAX_NUM_CARDS ) {
-		dev = alloc_netdev(sizeof(struct net_local), 
-				   "sbni%d", sbni_devsetup);
+		dev = alloc_netdev(sizeof(struct net_local), "sbni%d",
+				   NET_NAME_UNKNOWN, sbni_devsetup);
 		if( !dev)
 			break;
 
diff --git a/drivers/net/wan/sdla.c b/drivers/net/wan/sdla.c
index cdd45fb8a1f6..421ac5f85699 100644
--- a/drivers/net/wan/sdla.c
+++ b/drivers/net/wan/sdla.c
@@ -1631,7 +1631,8 @@ static int __init init_sdla(void)
 
 	printk("%s.\n", version);
 
-	sdla = alloc_netdev(sizeof(struct frad_local), "sdla0", setup_sdla);
+	sdla = alloc_netdev(sizeof(struct frad_local), "sdla0",
+			    NET_NAME_UNKNOWN, setup_sdla);
 	if (!sdla) 
 		return -ENOMEM;
 
diff --git a/drivers/net/wan/x25_asy.c b/drivers/net/wan/x25_asy.c
index 5895f1978691..df6c07357556 100644
--- a/drivers/net/wan/x25_asy.c
+++ b/drivers/net/wan/x25_asy.c
@@ -81,8 +81,8 @@ static struct x25_asy *x25_asy_alloc(void)
 		char name[IFNAMSIZ];
 		sprintf(name, "x25asy%d", i);
 
-		dev = alloc_netdev(sizeof(struct x25_asy),
-				   name, x25_asy_setup);
+		dev = alloc_netdev(sizeof(struct x25_asy), name,
+				   NET_NAME_UNKNOWN, x25_asy_setup);
 		if (!dev)
 			return NULL;
 
diff --git a/drivers/net/wimax/i2400m/usb.c b/drivers/net/wimax/i2400m/usb.c
index cd15a93d9084..e7f5910a6519 100644
--- a/drivers/net/wimax/i2400m/usb.c
+++ b/drivers/net/wimax/i2400m/usb.c
@@ -472,7 +472,7 @@ int i2400mu_probe(struct usb_interface *iface,
 
 	/* Allocate instance [calls i2400m_netdev_setup() on it]. */
 	result = -ENOMEM;
-	net_dev = alloc_netdev(sizeof(*i2400mu), "wmx%d",
+	net_dev = alloc_netdev(sizeof(*i2400mu), "wmx%d", NET_NAME_UNKNOWN,
 			       i2400mu_netdev_setup);
 	if (net_dev == NULL) {
 		dev_err(dev, "no memory for network device instance\n");
diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
index 64747d457bb3..29d88739454b 100644
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -2685,7 +2685,8 @@ static struct net_device *init_wifidev(struct airo_info *ai,
 					struct net_device *ethdev)
 {
 	int err;
-	struct net_device *dev = alloc_netdev(0, "wifi%d", wifi_setup);
+	struct net_device *dev = alloc_netdev(0, "wifi%d", NET_NAME_UNKNOWN,
+					      wifi_setup);
 	if (!dev)
 		return NULL;
 	dev->ml_priv = ethdev->ml_priv;
@@ -2785,7 +2786,7 @@ static struct net_device *_init_airo_card( unsigned short irq, int port,
 	CapabilityRid cap_rid;
 
 	/* Create the network device object. */
-	dev = alloc_netdev(sizeof(*ai), "", ether_setup);
+	dev = alloc_netdev(sizeof(*ai), "", NET_NAME_UNKNOWN, ether_setup);
 	if (!dev) {
 		airo_print_err("", "Couldn't alloc_etherdev");
 		return NULL;
diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index 0e26f4a34fda..1c4ce8e3eebe 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -3636,7 +3636,7 @@ struct wireless_dev *ath6kl_interface_add(struct ath6kl *ar, const char *name,
 	struct net_device *ndev;
 	struct ath6kl_vif *vif;
 
-	ndev = alloc_netdev(sizeof(*vif), name, ether_setup);
+	ndev = alloc_netdev(sizeof(*vif), name, NET_NAME_UNKNOWN, ether_setup);
 	if (!ndev)
 		return NULL;
 
diff --git a/drivers/net/wireless/ath/wil6210/netdev.c b/drivers/net/wireless/ath/wil6210/netdev.c
index 106b6dcb773a..7afce6e8c507 100644
--- a/drivers/net/wireless/ath/wil6210/netdev.c
+++ b/drivers/net/wireless/ath/wil6210/netdev.c
@@ -132,7 +132,7 @@ void *wil_if_alloc(struct device *dev, void __iomem *csr)
 	ch = wdev->wiphy->bands[IEEE80211_BAND_60GHZ]->channels;
 	cfg80211_chandef_create(&wdev->preset_chandef, ch, NL80211_CHAN_NO_HT);
 
-	ndev = alloc_netdev(0, "wlan%d", ether_setup);
+	ndev = alloc_netdev(0, "wlan%d", NET_NAME_UNKNOWN, ether_setup);
 	if (!ndev) {
 		dev_err(dev, "alloc_netdev_mqs failed\n");
 		rc = -ENOMEM;
diff --git a/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c b/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c
index 09dd8c13d844..2699441d4f41 100644
--- a/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c
+++ b/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c
@@ -808,7 +808,8 @@ struct brcmf_if *brcmf_add_if(struct brcmf_pub *drvr, s32 bssidx, s32 ifidx,
 	} else {
 		brcmf_dbg(INFO, "allocate netdev interface\n");
 		/* Allocate netdev, including space for private structure */
-		ndev = alloc_netdev(sizeof(*ifp), name, ether_setup);
+		ndev = alloc_netdev(sizeof(*ifp), name, NET_NAME_UNKNOWN,
+				    ether_setup);
 		if (!ndev)
 			return ERR_PTR(-ENOMEM);
 
diff --git a/drivers/net/wireless/libertas/main.c b/drivers/net/wireless/libertas/main.c
index 0c02f0483d1f..569b64ecc607 100644
--- a/drivers/net/wireless/libertas/main.c
+++ b/drivers/net/wireless/libertas/main.c
@@ -981,7 +981,7 @@ struct lbs_private *lbs_add_card(void *card, struct device *dmdev)
 		goto err_wdev;
 	}
 
-	dev = alloc_netdev(0, "wlan%d", ether_setup);
+	dev = alloc_netdev(0, "wlan%d", NET_NAME_UNKNOWN, ether_setup);
 	if (!dev) {
 		dev_err(dmdev, "no memory for network device instance\n");
 		goto err_adapter;
diff --git a/drivers/net/wireless/libertas/mesh.c b/drivers/net/wireless/libertas/mesh.c
index 6fef746345bc..01a67f62696f 100644
--- a/drivers/net/wireless/libertas/mesh.c
+++ b/drivers/net/wireless/libertas/mesh.c
@@ -1000,7 +1000,7 @@ static int lbs_add_mesh(struct lbs_private *priv)
 		goto done;
 	}
 
-	mesh_dev = alloc_netdev(0, "msh%d", ether_setup);
+	mesh_dev = alloc_netdev(0, "msh%d", NET_NAME_UNKNOWN, ether_setup);
 	if (!mesh_dev) {
 		lbs_deb_mesh("init mshX device failed\n");
 		ret = -ENOMEM;
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index eba51460a5de..5ea65fce0b83 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -2676,7 +2676,8 @@ static int __init init_mac80211_hwsim(void)
 			goto out_free_radios;
 	}
 
-	hwsim_mon = alloc_netdev(0, "hwsim%d", hwsim_mon_setup);
+	hwsim_mon = alloc_netdev(0, "hwsim%d", NET_NAME_UNKNOWN,
+				 hwsim_mon_setup);
 	if (hwsim_mon == NULL) {
 		err = -ENOMEM;
 		goto out_free_radios;
diff --git a/drivers/net/wireless/mwifiex/cfg80211.c b/drivers/net/wireless/mwifiex/cfg80211.c
index 6af135fa99f7..ca87f923c61e 100644
--- a/drivers/net/wireless/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/mwifiex/cfg80211.c
@@ -2232,7 +2232,8 @@ struct wireless_dev *mwifiex_add_virtual_intf(struct wiphy *wiphy,
 	}
 
 	dev = alloc_netdev_mqs(sizeof(struct mwifiex_private *), name,
-			       ether_setup, IEEE80211_NUM_ACS, 1);
+			       NET_NAME_UNKNOWN, ether_setup,
+			       IEEE80211_NUM_ACS, 1);
 	if (!dev) {
 		wiphy_err(wiphy, "no memory available for netdevice\n");
 		priv->bss_mode = NL80211_IFTYPE_UNSPECIFIED;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index ef75b45e5085..bd59d9dbf27b 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -418,8 +418,8 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
 	 * When the guest selects the desired number, it will be updated
 	 * via netif_set_real_num_*_queues().
 	 */
-	dev = alloc_netdev_mq(sizeof(struct xenvif), name, ether_setup,
-			      xenvif_max_queues);
+	dev = alloc_netdev_mq(sizeof(struct xenvif), name, NET_NAME_UNKNOWN,
+			      ether_setup, xenvif_max_queues);
 	if (dev == NULL) {
 		pr_warn("Could not allocate netdev for %s\n", name);
 		return ERR_PTR(-ENOMEM);
diff --git a/drivers/s390/net/claw.c b/drivers/s390/net/claw.c
index d837c3c5330f..fbc6701bef30 100644
--- a/drivers/s390/net/claw.c
+++ b/drivers/s390/net/claw.c
@@ -2915,7 +2915,7 @@ claw_new_device(struct ccwgroup_device *cgdev)
 			"failed with error code %d\n", ret);
 		goto out;
 	}
-	dev = alloc_netdev(0,"claw%d",claw_init_netdevice);
+	dev = alloc_netdev(0, "claw%d", NET_NAME_UNKNOWN, claw_init_netdevice);
 	if (!dev) {
 		dev_warn(&cgdev->dev,
 			"Activating the CLAW device failed\n");
diff --git a/drivers/s390/net/ctcm_main.c b/drivers/s390/net/ctcm_main.c
index 03b6ad035577..e056dd4fe44d 100644
--- a/drivers/s390/net/ctcm_main.c
+++ b/drivers/s390/net/ctcm_main.c
@@ -1137,9 +1137,11 @@ static struct net_device *ctcm_init_netdevice(struct ctcm_priv *priv)
 		return NULL;
 
 	if (IS_MPC(priv))
-		dev = alloc_netdev(0, MPC_DEVICE_GENE, ctcm_dev_setup);
+		dev = alloc_netdev(0, MPC_DEVICE_GENE, NET_NAME_UNKNOWN,
+				   ctcm_dev_setup);
 	else
-		dev = alloc_netdev(0, CTC_DEVICE_GENE, ctcm_dev_setup);
+		dev = alloc_netdev(0, CTC_DEVICE_GENE, NET_NAME_UNKNOWN,
+				   ctcm_dev_setup);
 
 	if (!dev) {
 		CTCM_DBF_TEXT_(ERROR, CTC_DBF_CRIT,
diff --git a/drivers/s390/net/netiucv.c b/drivers/s390/net/netiucv.c
index ce16d1bdb20a..0a87809c8af7 100644
--- a/drivers/s390/net/netiucv.c
+++ b/drivers/s390/net/netiucv.c
@@ -2015,7 +2015,7 @@ static struct net_device *netiucv_init_netdevice(char *username, char *userdata)
 	struct net_device *dev;
 
 	dev = alloc_netdev(sizeof(struct netiucv_priv), "iucv%d",
-			   netiucv_setup_netdevice);
+			   NET_NAME_UNKNOWN, netiucv_setup_netdevice);
 	if (!dev)
 		return NULL;
 	rtnl_lock();
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 5ef5b4f45758..c2679bfe7f66 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -952,10 +952,12 @@ static int qeth_l2_setup_netdev(struct qeth_card *card)
 {
 	switch (card->info.type) {
 	case QETH_CARD_TYPE_IQD:
-		card->dev = alloc_netdev(0, "hsi%d", ether_setup);
+		card->dev = alloc_netdev(0, "hsi%d", NET_NAME_UNKNOWN,
+					 ether_setup);
 		break;
 	case QETH_CARD_TYPE_OSN:
-		card->dev = alloc_netdev(0, "osn%d", ether_setup);
+		card->dev = alloc_netdev(0, "osn%d", NET_NAME_UNKNOWN,
+					 ether_setup);
 		card->dev->flags |= IFF_NOARP;
 		break;
 	default:
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 14e0b5810e8c..f8427a2c4840 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -3287,7 +3287,8 @@ static int qeth_l3_setup_netdev(struct qeth_card *card)
 			}
 		}
 	} else if (card->info.type == QETH_CARD_TYPE_IQD) {
-		card->dev = alloc_netdev(0, "hsi%d", ether_setup);
+		card->dev = alloc_netdev(0, "hsi%d", NET_NAME_UNKNOWN,
+					 ether_setup);
 		if (!card->dev)
 			return -ENODEV;
 		card->dev->flags |= IFF_NOARP;
diff --git a/drivers/staging/cxt1e1/linux.c b/drivers/staging/cxt1e1/linux.c
index 09f3d5ca75ac..85d776bbfb15 100644
--- a/drivers/staging/cxt1e1/linux.c
+++ b/drivers/staging/cxt1e1/linux.c
@@ -917,7 +917,8 @@ c4_add_dev(hdw_info_t *hi, int brdno, unsigned long f0, unsigned long f1,
 	struct net_device *ndev;
 	ci_t       *ci;
 
-	ndev = alloc_netdev(sizeof(ci_t), SBE_IFACETMPL, c4_setup);
+	ndev = alloc_netdev(sizeof(ci_t), SBE_IFACETMPL, NET_NAME_UNKNOWN,
+			    c4_setup);
 	if (!ndev) {
 		pr_warning("%s: no memory for struct net_device !\n",
 			   hi->devname);
diff --git a/drivers/staging/gdm724x/gdm_lte.c b/drivers/staging/gdm724x/gdm_lte.c
index 64c55b99fda4..c2268527422f 100644
--- a/drivers/staging/gdm724x/gdm_lte.c
+++ b/drivers/staging/gdm724x/gdm_lte.c
@@ -885,7 +885,7 @@ int register_lte_device(struct phy_dev *phy_dev,
 
 		/* Allocate netdev */
 		net = alloc_netdev(sizeof(struct nic), pdn_dev_name,
-				ether_setup);
+				   NET_NAME_UNKNOWN, ether_setup);
 		if (net == NULL) {
 			pr_err("alloc_netdev failed\n");
 			ret = -ENOMEM;
diff --git a/drivers/staging/gdm72xx/gdm_wimax.c b/drivers/staging/gdm72xx/gdm_wimax.c
index e5e511585122..a9a6fc51024b 100644
--- a/drivers/staging/gdm72xx/gdm_wimax.c
+++ b/drivers/staging/gdm72xx/gdm_wimax.c
@@ -886,7 +886,8 @@ int register_wimax_device(struct phy_dev *phy_dev, struct device *pdev)
 	struct net_device *dev;
 	int ret;
 
-	dev = alloc_netdev(sizeof(*nic), "wm%d", ether_setup);
+	dev = alloc_netdev(sizeof(*nic), "wm%d", NET_NAME_UNKNOWN,
+			   ether_setup);
 
 	if (dev == NULL) {
 		pr_err("alloc_etherdev failed\n");
diff --git a/drivers/staging/vt6655/wpactl.c b/drivers/staging/vt6655/wpactl.c
index 8392d4d1d5ed..0814bfd68b2e 100644
--- a/drivers/staging/vt6655/wpactl.c
+++ b/drivers/staging/vt6655/wpactl.c
@@ -89,7 +89,8 @@ static int wpa_init_wpadev(PSDevice pDevice)
 	struct net_device *dev = pDevice->dev;
 	int ret = 0;
 
-	pDevice->wpadev = alloc_netdev(sizeof(PSDevice), "vntwpa", wpadev_setup);
+	pDevice->wpadev = alloc_netdev(sizeof(PSDevice), "vntwpa",
+				       NET_NAME_UNKNOWN, wpadev_setup);
 	if (pDevice->wpadev == NULL)
 		return -ENOMEM;
 
diff --git a/drivers/staging/wlan-ng/p80211netdev.c b/drivers/staging/wlan-ng/p80211netdev.c
index 00b186c59725..6c78f917e24a 100644
--- a/drivers/staging/wlan-ng/p80211netdev.c
+++ b/drivers/staging/wlan-ng/p80211netdev.c
@@ -769,7 +769,7 @@ int wlan_setup(wlandevice_t *wlandev, struct device *physdev)
 
 	/* Allocate and initialize the struct device */
 	netdev = alloc_netdev(sizeof(struct wireless_dev), "wlan%d",
-				ether_setup);
+			      NET_NAME_UNKNOWN, ether_setup);
 	if (netdev == NULL) {
 		dev_err(physdev, "Failed to alloc netdev.\n");
 		wlan_free_wiphy(wiphy);
diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index 2ebe47b78a3e..cde3ab97900f 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -2789,9 +2789,8 @@ static int gsm_create_network(struct gsm_dlci *dlci, struct gsm_netconfig *nc)
 	netname = "gsm%d";
 	if (nc->if_name[0] != '\0')
 		netname = nc->if_name;
-	net = alloc_netdev(sizeof(struct gsm_mux_net),
-			netname,
-			gsm_mux_net_init);
+	net = alloc_netdev(sizeof(struct gsm_mux_net), netname,
+			   NET_NAME_UNKNOWN, gsm_mux_net_init);
 	if (!net) {
 		pr_err("alloc_netdev failed");
 		return -ENOMEM;
diff --git a/drivers/usb/gadget/f_phonet.c b/drivers/usb/gadget/f_phonet.c
index f2b781773eed..b9cfc1571d71 100644
--- a/drivers/usb/gadget/f_phonet.c
+++ b/drivers/usb/gadget/f_phonet.c
@@ -721,7 +721,8 @@ struct net_device *gphonet_setup_default(void)
 	struct phonet_port *port;
 
 	/* Create net device */
-	dev = alloc_netdev(sizeof(*port), "upnlink%d", pn_net_setup);
+	dev = alloc_netdev(sizeof(*port), "upnlink%d", NET_NAME_UNKNOWN,
+			   pn_net_setup);
 	if (!dev)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9be34732142f..15ed750458ad 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2991,13 +2991,15 @@ void ether_setup(struct net_device *dev);
 
 /* Support for loadable net-drivers */
 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
+				    unsigned char name_assign_type,
 				    void (*setup)(struct net_device *),
 				    unsigned int txqs, unsigned int rxqs);
-#define alloc_netdev(sizeof_priv, name, setup) \
-	alloc_netdev_mqs(sizeof_priv, name, setup, 1, 1)
+#define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \
+	alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1)
 
-#define alloc_netdev_mq(sizeof_priv, name, setup, count) \
-	alloc_netdev_mqs(sizeof_priv, name, setup, count, count)
+#define alloc_netdev_mq(sizeof_priv, name, name_assign_type, setup, count) \
+	alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, count, \
+			 count)
 
 int register_netdev(struct net_device *dev);
 void unregister_netdev(struct net_device *dev);
diff --git a/net/802/fc.c b/net/802/fc.c
index 05eea6b98bb8..7c174b6750cd 100644
--- a/net/802/fc.c
+++ b/net/802/fc.c
@@ -126,6 +126,6 @@ static void fc_setup(struct net_device *dev)
  */
 struct net_device *alloc_fcdev(int sizeof_priv)
 {
-	return alloc_netdev(sizeof_priv, "fc%d", fc_setup);
+	return alloc_netdev(sizeof_priv, "fc%d", NET_NAME_UNKNOWN, fc_setup);
 }
 EXPORT_SYMBOL(alloc_fcdev);
diff --git a/net/802/fddi.c b/net/802/fddi.c
index 9cda40661e0d..59e7346f1193 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -207,7 +207,8 @@ static void fddi_setup(struct net_device *dev)
  */
 struct net_device *alloc_fddidev(int sizeof_priv)
 {
-	return alloc_netdev(sizeof_priv, "fddi%d", fddi_setup);
+	return alloc_netdev(sizeof_priv, "fddi%d", NET_NAME_UNKNOWN,
+			    fddi_setup);
 }
 EXPORT_SYMBOL(alloc_fddidev);
 
diff --git a/net/802/hippi.c b/net/802/hippi.c
index 5ff2a718ddca..2e03f8259dd5 100644
--- a/net/802/hippi.c
+++ b/net/802/hippi.c
@@ -228,7 +228,8 @@ static void hippi_setup(struct net_device *dev)
 
 struct net_device *alloc_hippi_dev(int sizeof_priv)
 {
-	return alloc_netdev(sizeof_priv, "hip%d", hippi_setup);
+	return alloc_netdev(sizeof_priv, "hip%d", NET_NAME_UNKNOWN,
+			    hippi_setup);
 }
 
 EXPORT_SYMBOL(alloc_hippi_dev);
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 44ebd5c2cd4a..cba9c212a730 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -250,7 +250,8 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
 		snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id);
 	}
 
-	new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name, vlan_setup);
+	new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name,
+			       NET_NAME_UNKNOWN, vlan_setup);
 
 	if (new_dev == NULL)
 		return -ENOBUFS;
diff --git a/net/appletalk/dev.c b/net/appletalk/dev.c
index 6c8016f61866..e4158b8b926d 100644
--- a/net/appletalk/dev.c
+++ b/net/appletalk/dev.c
@@ -39,6 +39,7 @@ static void ltalk_setup(struct net_device *dev)
 
 struct net_device *alloc_ltalkdev(int sizeof_priv)
 {
-	return alloc_netdev(sizeof_priv, "lt%d", ltalk_setup);
+	return alloc_netdev(sizeof_priv, "lt%d", NET_NAME_UNKNOWN,
+			    ltalk_setup);
 }
 EXPORT_SYMBOL(alloc_ltalkdev);
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index 403e71fa88fe..cc78538d163b 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -682,8 +682,8 @@ static int br2684_create(void __user *arg)
 
 	netdev = alloc_netdev(sizeof(struct br2684_dev),
 			      ni.ifname[0] ? ni.ifname : "nas%d",
-			      (payload == p_routed) ?
-			      br2684_setup_routed : br2684_setup);
+			      NET_NAME_UNKNOWN,
+			      (payload == p_routed) ? br2684_setup_routed : br2684_setup);
 	if (!netdev)
 		return -ENOMEM;
 
diff --git a/net/atm/clip.c b/net/atm/clip.c
index ba291ce4bdff..46339040fef0 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -520,7 +520,8 @@ static int clip_create(int number)
 			if (PRIV(dev)->number >= number)
 				number = PRIV(dev)->number + 1;
 	}
-	dev = alloc_netdev(sizeof(struct clip_priv), "", clip_setup);
+	dev = alloc_netdev(sizeof(struct clip_priv), "", NET_NAME_UNKNOWN,
+			   clip_setup);
 	if (!dev)
 		return -ENOMEM;
 	clip_priv = PRIV(dev);
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index e7ee65dc20bf..d551e6302cf3 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -895,7 +895,7 @@ struct net_device *batadv_softif_create(const char *name)
 	int ret;
 
 	soft_iface = alloc_netdev(sizeof(struct batadv_priv), name,
-				  batadv_softif_init_early);
+				  NET_NAME_UNKNOWN, batadv_softif_init_early);
 	if (!soft_iface)
 		return NULL;
 
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 5a7f81df603c..206b65ccd5b8 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -712,7 +712,7 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev)
 	unsigned long flags;
 
 	netdev = alloc_netdev(sizeof(struct lowpan_dev), IFACE_NAME_TEMPLATE,
-			      netdev_setup);
+			      NET_NAME_UNKNOWN, netdev_setup);
 	if (!netdev)
 		return -ENOMEM;
 
diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c
index a841d3e776c5..85bcc21e84d2 100644
--- a/net/bluetooth/bnep/core.c
+++ b/net/bluetooth/bnep/core.c
@@ -538,8 +538,9 @@ int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock)
 
 	/* session struct allocated as private part of net_device */
 	dev = alloc_netdev(sizeof(struct bnep_session),
-				(*req->device) ? req->device : "bnep%d",
-				bnep_net_setup);
+			   (*req->device) ? req->device : "bnep%d",
+			   NET_NAME_UNKNOWN,
+			   bnep_net_setup);
 	if (!dev)
 		return -ENOMEM;
 
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 3eca3fdf8fe1..078d336a1f37 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -344,7 +344,7 @@ int br_add_bridge(struct net *net, const char *name)
 	struct net_device *dev;
 	int res;
 
-	dev = alloc_netdev(sizeof(struct net_bridge), name,
+	dev = alloc_netdev(sizeof(struct net_bridge), name, NET_NAME_UNKNOWN,
 			   br_dev_setup);
 
 	if (!dev)
diff --git a/net/core/dev.c b/net/core/dev.c
index 38793fb84a35..2c98f10ee62a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6441,17 +6441,19 @@ void netdev_freemem(struct net_device *dev)
 
 /**
  *	alloc_netdev_mqs - allocate network device
- *	@sizeof_priv:	size of private data to allocate space for
- *	@name:		device name format string
- *	@setup:		callback to initialize device
- *	@txqs:		the number of TX subqueues to allocate
- *	@rxqs:		the number of RX subqueues to allocate
+ *	@sizeof_priv:		size of private data to allocate space for
+ *	@name:			device name format string
+ *	@name_assign_type: 	origin of device name
+ *	@setup:			callback to initialize device
+ *	@txqs:			the number of TX subqueues to allocate
+ *	@rxqs:			the number of RX subqueues to allocate
  *
  *	Allocates a struct net_device with private data area for driver use
  *	and performs basic initialization.  Also allocates subqueue structs
  *	for each queue on the device.
  */
 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
+		unsigned char name_assign_type,
 		void (*setup)(struct net_device *),
 		unsigned int txqs, unsigned int rxqs)
 {
@@ -6530,6 +6532,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 #endif
 
 	strcpy(dev->name, name);
+	dev->name_assign_type = name_assign_type;
 	dev->group = INIT_NETDEV_GROUP;
 	if (!dev->ethtool_ops)
 		dev->ethtool_ops = &default_ethtool_ops;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 1f8a59e02c48..599864322de8 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1828,8 +1828,8 @@ struct net_device *rtnl_create_link(struct net *net,
 		num_rx_queues = ops->get_num_rx_queues();
 
 	err = -ENOMEM;
-	dev = alloc_netdev_mqs(ops->priv_size, ifname, ops->setup,
-			       num_tx_queues, num_rx_queues);
+	dev = alloc_netdev_mqs(ops->priv_size, ifname, NET_NAME_UNKNOWN,
+			       ops->setup, num_tx_queues, num_rx_queues);
 	if (!dev)
 		goto err;
 
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 64c5af0a10dd..45a1e34c89e0 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -340,8 +340,8 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent,
 	struct dsa_slave_priv *p;
 	int ret;
 
-	slave_dev = alloc_netdev(sizeof(struct dsa_slave_priv),
-				 name, ether_setup);
+	slave_dev = alloc_netdev(sizeof(struct dsa_slave_priv), name,
+				 NET_NAME_UNKNOWN, ether_setup);
 	if (slave_dev == NULL)
 		return slave_dev;
 
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 5dc638cad2e1..f405e0592407 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -390,7 +390,8 @@ EXPORT_SYMBOL(ether_setup);
 struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
 				      unsigned int rxqs)
 {
-	return alloc_netdev_mqs(sizeof_priv, "eth%d", ether_setup, txqs, rxqs);
+	return alloc_netdev_mqs(sizeof_priv, "eth%d", NET_NAME_UNKNOWN,
+				ether_setup, txqs, rxqs);
 }
 EXPORT_SYMBOL(alloc_etherdev_mqs);
 
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 54b6731dab55..0157a7af20a8 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -301,7 +301,7 @@ static struct net_device *__ip_tunnel_create(struct net *net,
 	}
 
 	ASSERT_RTNL();
-	dev = alloc_netdev(ops->priv_size, name, ops->setup);
+	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
 	if (!dev) {
 		err = -ENOMEM;
 		goto failed;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 65bcaa789043..c8034587859d 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -500,7 +500,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
 	else
 		sprintf(name, "pimreg%u", mrt->id);
 
-	dev = alloc_netdev(0, name, reg_vif_setup);
+	dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
 
 	if (dev == NULL)
 		return NULL;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 365b2b6f3942..5f19dfbc4c6a 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -322,7 +322,8 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net,
 	else
 		strcpy(name, "ip6gre%d");
 
-	dev = alloc_netdev(sizeof(*t), name, ip6gre_tunnel_setup);
+	dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN,
+			   ip6gre_tunnel_setup);
 	if (!dev)
 		return NULL;
 
@@ -1326,7 +1327,8 @@ static int __net_init ip6gre_init_net(struct net *net)
 	int err;
 
 	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0",
-					   ip6gre_tunnel_setup);
+					  NET_NAME_UNKNOWN,
+					  ip6gre_tunnel_setup);
 	if (!ign->fb_tunnel_dev) {
 		err = -ENOMEM;
 		goto err_alloc_dev;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 51a1eb185ea7..f9de5a695072 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -315,7 +315,8 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p)
 	else
 		sprintf(name, "ip6tnl%%d");
 
-	dev = alloc_netdev(sizeof (*t), name, ip6_tnl_dev_setup);
+	dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN,
+			   ip6_tnl_dev_setup);
 	if (dev == NULL)
 		goto failed;
 
@@ -1773,7 +1774,7 @@ static int __net_init ip6_tnl_init_net(struct net *net)
 
 	err = -ENOMEM;
 	ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0",
-				      ip6_tnl_dev_setup);
+					NET_NAME_UNKNOWN, ip6_tnl_dev_setup);
 
 	if (!ip6n->fb_tnl_dev)
 		goto err_alloc_dev;
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 9aaa6bb229e4..17ee4fc32dfe 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -204,7 +204,7 @@ static struct ip6_tnl *vti6_tnl_create(struct net *net, struct __ip6_tnl_parm *p
 	else
 		sprintf(name, "ip6_vti%%d");
 
-	dev = alloc_netdev(sizeof(*t), name, vti6_dev_setup);
+	dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, vti6_dev_setup);
 	if (dev == NULL)
 		goto failed;
 
@@ -1020,7 +1020,7 @@ static int __net_init vti6_init_net(struct net *net)
 
 	err = -ENOMEM;
 	ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6_vti0",
-					vti6_dev_setup);
+					NET_NAME_UNKNOWN, vti6_dev_setup);
 
 	if (!ip6n->fb_tnl_dev)
 		goto err_alloc_dev;
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 8250474ab7dc..f9a3fd320d1d 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -744,7 +744,7 @@ static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt)
 	else
 		sprintf(name, "pim6reg%u", mrt->id);
 
-	dev = alloc_netdev(0, name, reg_vif_setup);
+	dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
 	if (dev == NULL)
 		return NULL;
 
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 4f408176dc64..2e9ba035fb5f 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -250,7 +250,8 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net,
 	else
 		strcpy(name, "sit%d");
 
-	dev = alloc_netdev(sizeof(*t), name, ipip6_tunnel_setup);
+	dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN,
+			   ipip6_tunnel_setup);
 	if (dev == NULL)
 		return NULL;
 
@@ -1729,6 +1730,7 @@ static int __net_init sit_init_net(struct net *net)
 	sitn->tunnels[3] = sitn->tunnels_r_l;
 
 	sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0",
+					   NET_NAME_UNKNOWN,
 					   ipip6_tunnel_setup);
 	if (!sitn->fb_tunnel_dev) {
 		err = -ENOMEM;
diff --git a/net/irda/irda_device.c b/net/irda/irda_device.c
index 365b895da84b..9e0d909390fd 100644
--- a/net/irda/irda_device.c
+++ b/net/irda/irda_device.c
@@ -293,7 +293,8 @@ static void irda_device_setup(struct net_device *dev)
  */
 struct net_device *alloc_irdadev(int sizeof_priv)
 {
-	return alloc_netdev(sizeof_priv, "irda%d", irda_device_setup);
+	return alloc_netdev(sizeof_priv, "irda%d", NET_NAME_UNKNOWN,
+			    irda_device_setup);
 }
 EXPORT_SYMBOL(alloc_irdadev);
 
diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
index ffcec225b5d9..dc13f1a45f2f 100644
--- a/net/irda/irlan/irlan_eth.c
+++ b/net/irda/irlan/irlan_eth.c
@@ -96,7 +96,7 @@ static void irlan_eth_setup(struct net_device *dev)
  */
 struct net_device *alloc_irlandev(const char *name)
 {
-	return alloc_netdev(sizeof(struct irlan_cb), name,
+	return alloc_netdev(sizeof(struct irlan_cb), name, NET_NAME_UNKNOWN,
 			    irlan_eth_setup);
 }
 
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 76125c57ee6d..edb78e69efe4 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -246,7 +246,8 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p
 		goto out;
 	}
 
-	dev = alloc_netdev(sizeof(*priv), name, l2tp_eth_dev_setup);
+	dev = alloc_netdev(sizeof(*priv), name, NET_NAME_UNKNOWN,
+			   l2tp_eth_dev_setup);
 	if (!dev) {
 		rc = -ENOMEM;
 		goto out_del_session;
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index bbf51b2f0651..4edfc7c1524f 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1624,9 +1624,9 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
 		if (local->hw.queues >= IEEE80211_NUM_ACS)
 			txqs = IEEE80211_NUM_ACS;
 
-		ndev = alloc_netdev_mqs(sizeof(*sdata) +
-					local->hw.vif_data_size,
-					name, ieee80211_if_setup, txqs, 1);
+		ndev = alloc_netdev_mqs(sizeof(*sdata) + local->hw.vif_data_size,
+					name, NET_NAME_UNKNOWN,
+					ieee80211_if_setup, txqs, 1);
 		if (!ndev)
 			return -ENOMEM;
 		dev_net_set(ndev, wiphy_net(local->hw.wiphy));
diff --git a/net/mac802154/ieee802154_dev.c b/net/mac802154/ieee802154_dev.c
index 9b54370f5e87..b36b2b996578 100644
--- a/net/mac802154/ieee802154_dev.c
+++ b/net/mac802154/ieee802154_dev.c
@@ -167,11 +167,13 @@ mac802154_add_iface(struct wpan_phy *phy, const char *name, int type)
 	switch (type) {
 	case IEEE802154_DEV_MONITOR:
 		dev = alloc_netdev(sizeof(struct mac802154_sub_if_data),
-				   name, mac802154_monitor_setup);
+				   name, NET_NAME_UNKNOWN,
+				   mac802154_monitor_setup);
 		break;
 	case IEEE802154_DEV_WPAN:
 		dev = alloc_netdev(sizeof(struct mac802154_sub_if_data),
-				   name, mac802154_wpan_setup);
+				   name, NET_NAME_UNKNOWN,
+				   mac802154_wpan_setup);
 		break;
 	default:
 		dev = NULL;
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index ede50d197e10..71cf1bffea06 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1418,7 +1418,7 @@ static int __init nr_proto_init(void)
 		struct net_device *dev;
 
 		sprintf(name, "nr%d", i);
-		dev = alloc_netdev(0, name, nr_setup);
+		dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, nr_setup);
 		if (!dev) {
 			printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device structure\n");
 			goto fail;
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 295471a66c78..bd658555afdf 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -165,7 +165,8 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)
 	netdev_vport = netdev_vport_priv(vport);
 
 	netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev),
-					 parms->name, do_setup);
+					 parms->name, NET_NAME_UNKNOWN,
+					 do_setup);
 	if (!netdev_vport->dev) {
 		err = -ENOMEM;
 		goto error_free_vport;
diff --git a/net/phonet/pep-gprs.c b/net/phonet/pep-gprs.c
index 66dc65e7c6a1..e9a83a637185 100644
--- a/net/phonet/pep-gprs.c
+++ b/net/phonet/pep-gprs.c
@@ -267,7 +267,7 @@ int gprs_attach(struct sock *sk)
 		return -EINVAL; /* need packet boundaries */
 
 	/* Create net device */
-	dev = alloc_netdev(sizeof(*gp), ifname, gprs_setup);
+	dev = alloc_netdev(sizeof(*gp), ifname, NET_NAME_UNKNOWN, gprs_setup);
 	if (!dev)
 		return -ENOMEM;
 	gp = netdev_priv(dev);
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 8451c8cdc9de..a85c1a086ae4 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1538,7 +1538,7 @@ static int __init rose_proto_init(void)
 		char name[IFNAMSIZ];
 
 		sprintf(name, "rose%d", i);
-		dev = alloc_netdev(0, name, rose_setup);
+		dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, rose_setup);
 		if (!dev) {
 			printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate memory\n");
 			rc = -ENOMEM;
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 474167162947..bd33793b527e 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -485,8 +485,8 @@ static int __init teql_init(void)
 		struct net_device *dev;
 		struct teql_master *master;
 
-		dev = alloc_netdev(sizeof(struct teql_master),
-				  "teql%d", teql_master_setup);
+		dev = alloc_netdev(sizeof(struct teql_master), "teql%d",
+				   NET_NAME_UNKNOWN, teql_master_setup);
 		if (!dev) {
 			err = -ENOMEM;
 			break;
-- 
cgit v1.2.3


From 5ee2c941b5969eb1b5592f9731b3ee76a784641f Mon Sep 17 00:00:00 2001
From: Christoph Paasch
Date: Mon, 14 Jul 2014 16:58:32 +0200
Subject: tcp: Remove unnecessary arg from tcp_enter_cwr and
 tcp_init_cwnd_reduction

Since Yuchung's 9b44190dc11 (tcp: refactor F-RTO), tcp_enter_cwr is always
called with set_ssthresh = 1. Thus, we can remove this argument from
tcp_enter_cwr. Further, as we remove this one, tcp_init_cwnd_reduction
is then always called with set_ssthresh = true, and so we can get rid of
this argument as well.

Cc: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Christoph Paasch <christoph.paasch@uclouvain.be>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h     |  2 +-
 net/ipv4/tcp_input.c  | 15 +++++++--------
 net/ipv4/tcp_output.c |  2 +-
 3 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index c9a75dbba0c7..0aeb2eb749dc 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -928,7 +928,7 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
 /* Use define here intentionally to get WARN_ON location shown at the caller */
 #define tcp_verify_left_out(tp)	WARN_ON(tcp_left_out(tp) > tp->packets_out)
 
-void tcp_enter_cwr(struct sock *sk, const int set_ssthresh);
+void tcp_enter_cwr(struct sock *sk);
 __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst);
 
 /* The maximum number of MSS of available cwnd for which TSO defers
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bb684967f7a7..2e16afba182c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2475,7 +2475,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
  *	losses and/or application stalls), do not perform any further cwnd
  *	reductions, but instead slow start up to ssthresh.
  */
-static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
+static void tcp_init_cwnd_reduction(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2485,8 +2485,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
 	tp->prior_cwnd = tp->snd_cwnd;
 	tp->prr_delivered = 0;
 	tp->prr_out = 0;
-	if (set_ssthresh)
-		tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+	tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
 	TCP_ECN_queue_cwr(tp);
 }
 
@@ -2528,14 +2527,14 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
 }
 
 /* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
-void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
+void tcp_enter_cwr(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	tp->prior_ssthresh = 0;
 	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
 		tp->undo_marker = 0;
-		tcp_init_cwnd_reduction(sk, set_ssthresh);
+		tcp_init_cwnd_reduction(sk);
 		tcp_set_ca_state(sk, TCP_CA_CWR);
 	}
 }
@@ -2564,7 +2563,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
 		tp->retrans_stamp = 0;
 
 	if (flag & FLAG_ECE)
-		tcp_enter_cwr(sk, 1);
+		tcp_enter_cwr(sk);
 
 	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
 		tcp_try_keep_open(sk);
@@ -2670,7 +2669,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
 		if (!ece_ack)
 			tp->prior_ssthresh = tcp_current_ssthresh(sk);
-		tcp_init_cwnd_reduction(sk, true);
+		tcp_init_cwnd_reduction(sk);
 	}
 	tcp_set_ca_state(sk, TCP_CA_Recovery);
 }
@@ -3346,7 +3345,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 		tp->tlp_high_seq = 0;
 		/* Don't reduce cwnd if DSACK arrives for TLP retrans. */
 		if (!(flag & FLAG_DSACKING_ACK)) {
-			tcp_init_cwnd_reduction(sk, true);
+			tcp_init_cwnd_reduction(sk);
 			tcp_set_ca_state(sk, TCP_CA_CWR);
 			tcp_end_cwnd_reduction(sk);
 			tcp_try_keep_open(sk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index bcee13c4627c..306dd5dead7d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -979,7 +979,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	if (likely(err <= 0))
 		return err;
 
-	tcp_enter_cwr(sk, 1);
+	tcp_enter_cwr(sk);
 
 	return net_xmit_eval(err);
 }
-- 
cgit v1.2.3


From 11878b40ed5c5bc20d6a115bae156a5b90b0fb3e Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Mon, 14 Jul 2014 17:55:06 -0400
Subject: net-timestamp: SOCK_RAW and PING timestamping

Add SO_TIMESTAMPING to sockets of type PF_INET[6]/SOCK_RAW:

Add the necessary sock_tx_timestamp calls to the datapath for RAW
sockets (ping sockets already had these calls).

Fix the IP output path to pass the timestamp flags on the first
fragment also for these sockets. The existing code relies on
transhdrlen != 0 to indicate a first fragment. For these sockets,
that assumption does not hold.

This fixes http://bugzilla.kernel.org/show_bug.cgi?id=77221

Tested SOCK_RAW on IPv4 and IPv6, not PING.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c  |  7 +++----
 net/ipv4/raw.c        |  4 ++++
 net/ipv6/ip6_output.c | 13 ++++---------
 3 files changed, 11 insertions(+), 13 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 8d3b6b0e9857..b16556836d66 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -962,10 +962,6 @@ alloc_new_skb:
 							   sk->sk_allocation);
 				if (unlikely(skb == NULL))
 					err = -ENOBUFS;
-				else
-					/* only the initial fragment is
-					   time stamped */
-					cork->tx_flags = 0;
 			}
 			if (skb == NULL)
 				goto error;
@@ -976,7 +972,10 @@ alloc_new_skb:
 			skb->ip_summed = csummode;
 			skb->csum = 0;
 			skb_reserve(skb, hh_len);
+
+			/* only the initial fragment is time stamped */
 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
+			cork->tx_flags = 0;
 
 			/*
 			 *	Find where to start putting bytes.
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 2c65160565e1..2054d7136c62 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -365,6 +365,8 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
 
 	skb->ip_summed = CHECKSUM_NONE;
 
+	sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
+
 	skb->transport_header = skb->network_header;
 	err = -EFAULT;
 	if (memcpy_fromiovecend((void *)iph, from, 0, length))
@@ -606,6 +608,8 @@ back_from_confirm:
 				      &rt, msg->msg_flags);
 
 	 else {
+		sock_tx_timestamp(sk, &ipc.tx_flags);
+
 		if (!ipc.addr)
 			ipc.addr = fl4.daddr;
 		lock_sock(sk);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 9b395c639a07..759456f0c207 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1271,7 +1271,7 @@ emsgsize:
 	}
 
 	/* For UDP, check if TX timestamp is enabled */
-	if (sk->sk_type == SOCK_DGRAM)
+	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW)
 		sock_tx_timestamp(sk, &tx_flags);
 
 	/*
@@ -1380,12 +1380,6 @@ alloc_new_skb:
 							   sk->sk_allocation);
 				if (unlikely(skb == NULL))
 					err = -ENOBUFS;
-				else {
-					/* Only the initial fragment
-					 * is time stamped.
-					 */
-					tx_flags = 0;
-				}
 			}
 			if (skb == NULL)
 				goto error;
@@ -1399,8 +1393,9 @@ alloc_new_skb:
 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
 				    dst_exthdrlen);
 
-			if (sk->sk_type == SOCK_DGRAM)
-				skb_shinfo(skb)->tx_flags = tx_flags;
+			/* Only the initial fragment is time stamped */
+			skb_shinfo(skb)->tx_flags = tx_flags;
+			tx_flags = 0;
 
 			/*
 			 *	Find where to start putting bytes
-- 
cgit v1.2.3


From 5cf3d46192fccf68b4a4759e4d7346e41c669a76 Mon Sep 17 00:00:00 2001
From: David Held
Date: Tue, 15 Jul 2014 23:28:31 -0400
Subject: udp: Simplify __udp*_lib_mcast_deliver.

Switch to using sk_nulls_for_each which shortens the code and makes it
easier to update.

Signed-off-by: David Held <drheld@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 48 ++++++++++----------------------
 net/ipv6/udp.c | 88 +++++++++++++++++++++++-----------------------------------
 2 files changed, 49 insertions(+), 87 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 668af516f094..bbcc33737ef1 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -594,26 +594,6 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
 	return true;
 }
 
-static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
-					     __be16 loc_port, __be32 loc_addr,
-					     __be16 rmt_port, __be32 rmt_addr,
-					     int dif)
-{
-	struct hlist_nulls_node *node;
-	unsigned short hnum = ntohs(loc_port);
-
-	sk_nulls_for_each_from(sk, node) {
-		if (__udp_is_mcast_sock(net, sk,
-					loc_port, loc_addr,
-					rmt_port, rmt_addr,
-					dif, hnum))
-			goto found;
-	}
-	sk = NULL;
-found:
-	return sk;
-}
-
 /*
  * This routine is called by the ICMP module when it gets some
  * sort of error condition.  If err < 0 then the socket should
@@ -1667,23 +1647,23 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 				    struct udp_table *udptable)
 {
 	struct sock *sk, *stack[256 / sizeof(struct sock *)];
-	struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
-	int dif;
+	struct hlist_nulls_node *node;
+	unsigned short hnum = ntohs(uh->dest);
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+	int dif = skb->dev->ifindex;
 	unsigned int i, count = 0;
 
 	spin_lock(&hslot->lock);
-	sk = sk_nulls_head(&hslot->head);
-	dif = skb->dev->ifindex;
-	sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
-	while (sk) {
-		stack[count++] = sk;
-		sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
-				       daddr, uh->source, saddr, dif);
-		if (unlikely(count == ARRAY_SIZE(stack))) {
-			if (!sk)
-				break;
-			flush_stack(stack, count, skb, ~0);
-			count = 0;
+	sk_nulls_for_each(sk, node, &hslot->head) {
+		if (__udp_is_mcast_sock(net, sk,
+					uh->dest, daddr,
+					uh->source, saddr,
+					dif, hnum)) {
+			if (unlikely(count == ARRAY_SIZE(stack))) {
+				flush_stack(stack, count, skb, ~0);
+				count = 0;
+			}
+			stack[count++] = sk;
 		}
 	}
 	/*
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index b4481df3d5fa..7d3bd80085be 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -702,43 +702,26 @@ drop:
 	return -1;
 }
 
-static struct sock *udp_v6_mcast_next(struct net *net, struct sock *sk,
-				      __be16 loc_port, const struct in6_addr *loc_addr,
-				      __be16 rmt_port, const struct in6_addr *rmt_addr,
-				      int dif)
+static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
+				   __be16 loc_port, const struct in6_addr *loc_addr,
+				   __be16 rmt_port, const struct in6_addr *rmt_addr,
+				   int dif, unsigned short hnum)
 {
-	struct hlist_nulls_node *node;
-	unsigned short num = ntohs(loc_port);
-
-	sk_nulls_for_each_from(sk, node) {
-		struct inet_sock *inet = inet_sk(sk);
-
-		if (!net_eq(sock_net(sk), net))
-			continue;
-
-		if (udp_sk(sk)->udp_port_hash == num &&
-		    sk->sk_family == PF_INET6) {
-			if (inet->inet_dport) {
-				if (inet->inet_dport != rmt_port)
-					continue;
-			}
-			if (!ipv6_addr_any(&sk->sk_v6_daddr) &&
-			    !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr))
-				continue;
-
-			if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
-				continue;
+	struct inet_sock *inet = inet_sk(sk);
 
-			if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
-				if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))
-					continue;
-			}
-			if (!inet6_mc_check(sk, loc_addr, rmt_addr))
-				continue;
-			return sk;
-		}
-	}
-	return NULL;
+	if (!net_eq(sock_net(sk), net))
+		return false;
+
+	if (udp_sk(sk)->udp_port_hash != hnum ||
+	    sk->sk_family != PF_INET6 ||
+	    (inet->inet_dport && inet->inet_dport != rmt_port) ||
+	    (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+		    !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
+	    (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+		return false;
+	if (!inet6_mc_check(sk, loc_addr, rmt_addr))
+		return false;
+	return true;
 }
 
 static void flush_stack(struct sock **stack, unsigned int count,
@@ -787,28 +770,27 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 {
 	struct sock *sk, *stack[256 / sizeof(struct sock *)];
 	const struct udphdr *uh = udp_hdr(skb);
-	struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
-	int dif;
+	struct hlist_nulls_node *node;
+	unsigned short hnum = ntohs(uh->dest);
+	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
+	int dif = inet6_iif(skb);
 	unsigned int i, count = 0;
 
 	spin_lock(&hslot->lock);
-	sk = sk_nulls_head(&hslot->head);
-	dif = inet6_iif(skb);
-	sk = udp_v6_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
-	while (sk) {
-		/* If zero checksum and no_check is not on for
-		 * the socket then skip it.
-		 */
-		if (uh->check || udp_sk(sk)->no_check6_rx)
+	sk_nulls_for_each(sk, node, &hslot->head) {
+		if (__udp_v6_is_mcast_sock(net, sk,
+					   uh->dest, daddr,
+					   uh->source, saddr,
+					   dif, hnum) &&
+		    /* If zero checksum and no_check is not on for
+		     * the socket then skip it.
+		     */
+		    (uh->check || udp_sk(sk)->no_check6_rx)) {
+			if (unlikely(count == ARRAY_SIZE(stack))) {
+				flush_stack(stack, count, skb, ~0);
+				count = 0;
+			}
 			stack[count++] = sk;
-
-		sk = udp_v6_mcast_next(net, sk_nulls_next(sk), uh->dest, daddr,
-				       uh->source, saddr, dif);
-		if (unlikely(count == ARRAY_SIZE(stack))) {
-			if (!sk)
-				break;
-			flush_stack(stack, count, skb, ~0);
-			count = 0;
 		}
 	}
 	/*
-- 
cgit v1.2.3


From 2dc41cff7545d55c6294525c811594576f8e119c Mon Sep 17 00:00:00 2001
From: David Held
Date: Tue, 15 Jul 2014 23:28:32 -0400
Subject: udp: Use hash2 for long hash1 chains in __udp*_lib_mcast_deliver.

Many multicast sources can have the same port which can result in a very
large list when hashing by port only. Hash by address and port instead
if this is the case. This makes multicast more similar to unicast.

On a 24-core machine receiving from 500 multicast sockets on the same
port, before this patch 80% of system CPU was used up by spin locking
and only ~25% of packets were successfully delivered.

With this patch, all packets are delivered and kernel overhead is ~8%
system CPU on spinlocks.

Signed-off-by: David Held <drheld@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 14 ++++++++++++++
 net/ipv4/udp.c     | 31 +++++++++++++++++++++----------
 net/ipv6/udp.c     | 30 ++++++++++++++++++++----------
 3 files changed, 55 insertions(+), 20 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/sock.h b/include/net/sock.h
index 29e48a6d1ded..28f734601b50 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -660,6 +660,20 @@ static inline void sk_add_bind_node(struct sock *sk,
 #define sk_for_each_bound(__sk, list) \
 	hlist_for_each_entry(__sk, list, sk_bind_node)
 
+/**
+ * sk_nulls_for_each_entry_offset - iterate over a list at a given struct offset
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @offset:	offset of hlist_node within the struct.
+ *
+ */
+#define sk_nulls_for_each_entry_offset(tpos, pos, head, offset)		       \
+	for (pos = (head)->first;					       \
+	     (!is_a_nulls(pos)) &&					       \
+		({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;});       \
+	     pos = pos->next)
+
 static inline struct user_namespace *sk_user_ns(struct sock *sk)
 {
 	/* Careful only use this in a context where these parameters
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index bbcc33737ef1..f31053b90ee0 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1619,6 +1619,8 @@ static void flush_stack(struct sock **stack, unsigned int count,
 
 		if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
 			skb1 = NULL;
+
+		sock_put(sk);
 	}
 	if (unlikely(skb1))
 		kfree_skb(skb1);
@@ -1651,10 +1653,20 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	unsigned short hnum = ntohs(uh->dest);
 	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
 	int dif = skb->dev->ifindex;
-	unsigned int i, count = 0;
+	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+
+	if (use_hash2) {
+		hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
+			    udp_table.mask;
+		hash2 = udp4_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+start_lookup:
+		hslot = &udp_table.hash2[hash2];
+		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+	}
 
 	spin_lock(&hslot->lock);
-	sk_nulls_for_each(sk, node, &hslot->head) {
+	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
 		if (__udp_is_mcast_sock(net, sk,
 					uh->dest, daddr,
 					uh->source, saddr,
@@ -1664,24 +1676,23 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 				count = 0;
 			}
 			stack[count++] = sk;
+			sock_hold(sk);
 		}
 	}
-	/*
-	 * before releasing chain lock, we must take a reference on sockets
-	 */
-	for (i = 0; i < count; i++)
-		sock_hold(stack[i]);
 
 	spin_unlock(&hslot->lock);
 
+	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
+	if (use_hash2 && hash2 != hash2_any) {
+		hash2 = hash2_any;
+		goto start_lookup;
+	}
+
 	/*
 	 * do the slow work with no lock held
 	 */
 	if (count) {
 		flush_stack(stack, count, skb, count - 1);
-
-		for (i = 0; i < count; i++)
-			sock_put(stack[i]);
 	} else {
 		kfree_skb(skb);
 	}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 7d3bd80085be..f9d8800bb72f 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -745,6 +745,7 @@ static void flush_stack(struct sock **stack, unsigned int count,
 
 		if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0)
 			skb1 = NULL;
+		sock_put(sk);
 	}
 	if (unlikely(skb1))
 		kfree_skb(skb1);
@@ -774,10 +775,20 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 	unsigned short hnum = ntohs(uh->dest);
 	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
 	int dif = inet6_iif(skb);
-	unsigned int i, count = 0;
+	unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
+	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
+
+	if (use_hash2) {
+		hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
+			    udp_table.mask;
+		hash2 = udp6_portaddr_hash(net, daddr, hnum) & udp_table.mask;
+start_lookup:
+		hslot = &udp_table.hash2[hash2];
+		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
+	}
 
 	spin_lock(&hslot->lock);
-	sk_nulls_for_each(sk, node, &hslot->head) {
+	sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
 		if (__udp_v6_is_mcast_sock(net, sk,
 					   uh->dest, daddr,
 					   uh->source, saddr,
@@ -791,21 +802,20 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
 				count = 0;
 			}
 			stack[count++] = sk;
+			sock_hold(sk);
 		}
 	}
-	/*
-	 * before releasing the lock, we must take reference on sockets
-	 */
-	for (i = 0; i < count; i++)
-		sock_hold(stack[i]);
 
 	spin_unlock(&hslot->lock);
 
+	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
+	if (use_hash2 && hash2 != hash2_any) {
+		hash2 = hash2_any;
+		goto start_lookup;
+	}
+
 	if (count) {
 		flush_stack(stack, count, skb, count - 1);
-
-		for (i = 0; i < count; i++)
-			sock_put(stack[i]);
 	} else {
 		kfree_skb(skb);
 	}
-- 
cgit v1.2.3


From 274f482d33a309c87096f2983601ceda2761094e Mon Sep 17 00:00:00 2001
From: Sorin Dumitru
Date: Tue, 22 Jul 2014 21:16:51 +0300
Subject: sock: remove skb argument from sk_rcvqueues_full

It hasn't been used since commit 0fd7bac(net: relax rcvbuf limits).

Signed-off-by: Sorin Dumitru <sorin@returnze.ro>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 5 ++---
 net/core/sock.c    | 2 +-
 net/ipv4/udp.c     | 2 +-
 net/ipv6/udp.c     | 2 +-
 4 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/sock.h b/include/net/sock.h
index 28f734601b50..720773304a85 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -810,8 +810,7 @@ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
  * Do not take into account this skb truesize,
  * to allow even a single big packet to come.
  */
-static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff *skb,
-				     unsigned int limit)
+static inline bool sk_rcvqueues_full(const struct sock *sk, unsigned int limit)
 {
 	unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);
 
@@ -822,7 +821,7 @@ static inline bool sk_rcvqueues_full(const struct sock *sk, const struct sk_buff
 static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb,
 					      unsigned int limit)
 {
-	if (sk_rcvqueues_full(sk, skb, limit))
+	if (sk_rcvqueues_full(sk, limit))
 		return -ENOBUFS;
 
 	__sk_add_backlog(sk, skb);
diff --git a/net/core/sock.c b/net/core/sock.c
index 026e01f70274..ca9b65199d28 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -491,7 +491,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 
 	skb->dev = NULL;
 
-	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
+	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 		atomic_inc(&sk->sk_drops);
 		goto discard_and_relse;
 	}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f31053b90ee0..f57c0e4c2326 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1567,7 +1567,7 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		goto csum_error;
 
 
-	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
+	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 		UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
 				 is_udplite);
 		goto drop;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index f9d8800bb72f..5b6091de5868 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -673,7 +673,7 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 			goto csum_error;
 	}
 
-	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
+	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 		UDP6_INC_STATS_BH(sock_net(sk),
 				  UDP_MIB_RCVBUFERRORS, is_udplite);
 		goto drop;
-- 
cgit v1.2.3


From f5220d63991f3fcb3d19efe8af0c8f75dcf0309b Mon Sep 17 00:00:00 2001
From: Quentin Armitage
Date: Wed, 23 Jul 2014 09:58:01 +0100
Subject: ipv4: Make IP_MULTICAST_ALL and IP_MSFILTER work on raw sockets

Currently, although IP_MULTICAST_ALL and IP_MSFILTER ioctl calls succeed on
raw sockets, there is no code to implement the functionality on received
packets; it is only implemented for UDP sockets. The raw(7) man page states:
"In addition, all ip(7) IPPROTO_IP socket options valid for datagram sockets
are supported", which implies these ioctls should work on raw sockets.

To fix this, add a call to ip_mc_sf_allow on raw sockets.

This should not break any existing code, since the current position of
not calling ip_mc_sf_filter makes it behave as if neither the IP_MULTICAST_ALL
nor the IP_MSFILTER ioctl had been called. Adding the call to ip_mc_sf_allow
will therefore maintain the current behaviour so long as IP_MULTICAST_ALL and
IP_MSFILTER ioctls are not called. Any code that currently is calling
IP_MULTICAST_ALL or IP_MSFILTER ioctls on raw sockets presumably is wanting
the filter to be applied, although no filtering will currently be occurring.

Signed-off-by: Quentin Armitage <quentin@armitage.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/raw.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 2054d7136c62..739db3100c23 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -58,6 +58,7 @@
 #include <linux/in_route.h>
 #include <linux/route.h>
 #include <linux/skbuff.h>
+#include <linux/igmp.h>
 #include <net/net_namespace.h>
 #include <net/dst.h>
 #include <net/sock.h>
@@ -174,7 +175,9 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
 
 	while (sk) {
 		delivered = 1;
-		if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
+		if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
+		    ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
+				   skb->dev->ifindex)) {
 			struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
 
 			/* Not releasing hash table! */
-- 
cgit v1.2.3


From 179542a548f37e1366f7afb7a088c965a2adf7d0 Mon Sep 17 00:00:00 2001
From: Himangi Saraogi
Date: Fri, 25 Jul 2014 01:48:44 +0530
Subject: igmp: remove exceptional & on function name

In this file, function names are otherwise used as pointers without &.

A simplified version of the Coccinelle semantic patch that makes this
change is as follows:

// <smpl>
@r@
identifier f;
@@

f(...) { ... }

@@
identifier r.f;
@@

- &f
+ f
// </smpl>

Signed-off-by: Himangi Saraogi <himangi774@gmail.com>
Acked-by: Julia Lawall <julia.lawall@lip6.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index db710b059bab..f10eab462282 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1321,7 +1321,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 	atomic_set(&im->refcnt, 1);
 	spin_lock_init(&im->lock);
 #ifdef CONFIG_IP_MULTICAST
-	setup_timer(&im->timer, &igmp_timer_expire, (unsigned long)im);
+	setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im);
 	im->unsolicit_count = IGMP_Unsolicited_Report_Count;
 #endif
 
-- 
cgit v1.2.3


From 5bd3a76f4ba37108101c2ae93172571a70505982 Mon Sep 17 00:00:00 2001
From: Himangi Saraogi
Date: Fri, 25 Jul 2014 01:47:16 +0530
Subject: netfilter: nf_conntrack: remove exceptional & on function name

In this file, function names are otherwise used as pointers without &.

A simplified version of the Coccinelle semantic patch that makes this
change is as follows:

// <smpl>
@r@
identifier f;
@@

f(...) { ... }

@@
identifier r.f;
@@

- &f
+ f
// </smpl>

Signed-off-by: Himangi Saraogi <himangi774@gmail.com>
Acked-by: Julia Lawall <julia.lawall@lip6.fr>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 4ce44c4bc57b..a054fe083431 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -358,7 +358,7 @@ static struct nf_sockopt_ops so_getorigdst = {
 	.pf		= PF_INET,
 	.get_optmin	= SO_ORIGINAL_DST,
 	.get_optmax	= SO_ORIGINAL_DST+1,
-	.get		= &getorigdst,
+	.get		= getorigdst,
 	.owner		= THIS_MODULE,
 };
 
-- 
cgit v1.2.3


From d4da843e6fad4f278fe82b075d8e394cff05c95c Mon Sep 17 00:00:00 2001
From: Paul Bolle
Date: Fri, 25 Jul 2014 14:25:31 +0200
Subject: netfilter: kill remnants of ulog targets

The ulog targets were recently killed. A few references to the Kconfig
macros CONFIG_IP_NF_TARGET_ULOG and CONFIG_BRIDGE_EBT_ULOG were left
untouched. Kill these too.

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netns/x_tables.h  | 6 ------
 net/bridge/netfilter/Makefile | 1 -
 net/ipv4/netfilter/Makefile   | 1 -
 3 files changed, 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netns/x_tables.h b/include/net/netns/x_tables.h
index 02fe40f8c8fd..c24060ee411e 100644
--- a/include/net/netns/x_tables.h
+++ b/include/net/netns/x_tables.h
@@ -15,11 +15,5 @@ struct netns_xt {
 	struct ebt_table *frame_filter;
 	struct ebt_table *frame_nat;
 #endif
-#if IS_ENABLED(CONFIG_IP_NF_TARGET_ULOG)
-	bool ulog_warn_deprecated;
-#endif
-#if IS_ENABLED(CONFIG_BRIDGE_EBT_ULOG)
-	bool ebt_ulog_warn_deprecated;
-#endif
 };
 #endif
diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile
index 061d121cf8b9..be4d0cea78ce 100644
--- a/net/bridge/netfilter/Makefile
+++ b/net/bridge/netfilter/Makefile
@@ -37,5 +37,4 @@ obj-$(CONFIG_BRIDGE_EBT_SNAT) += ebt_snat.o
 
 # watchers
 obj-$(CONFIG_BRIDGE_EBT_LOG) += ebt_log.o
-obj-$(CONFIG_BRIDGE_EBT_ULOG) += ebt_ulog.o
 obj-$(CONFIG_BRIDGE_EBT_NFLOG) += ebt_nflog.o
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 245db9df3337..33001621465b 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -57,7 +57,6 @@ obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
 obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
 obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
 obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o
-obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
 
 # generic ARP tables
 obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
-- 
cgit v1.2.3


From 36c7778218b93d96d88d68f116a711f6a598b72f Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Thu, 24 Jul 2014 16:50:29 +0200
Subject: inet: frag: constify match, hashfn and constructor arguments

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h                 | 11 ++++++-----
 include/net/ipv6.h                      |  4 ++--
 net/ieee802154/reassembly.c             | 14 +++++++-------
 net/ipv4/ip_fragment.c                  | 14 +++++++-------
 net/ipv6/netfilter/nf_conntrack_reasm.c |  2 +-
 net/ipv6/reassembly.c                   | 14 +++++++-------
 6 files changed, 30 insertions(+), 29 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 6f59de98dabd..15033057d44e 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -71,10 +71,11 @@ struct inet_frags {
 	u32			rnd;
 	int			qsize;
 
-	unsigned int		(*hashfn)(struct inet_frag_queue *);
-	bool			(*match)(struct inet_frag_queue *q, void *arg);
+	unsigned int		(*hashfn)(const struct inet_frag_queue *);
+	bool			(*match)(const struct inet_frag_queue *q,
+					 const void *arg);
 	void			(*constructor)(struct inet_frag_queue *q,
-						void *arg);
+					       const void *arg);
 	void			(*destructor)(struct inet_frag_queue *);
 	void			(*skb_free)(struct sk_buff *);
 	void			(*frag_expire)(unsigned long data);
@@ -131,9 +132,9 @@ static inline void init_frag_mem_limit(struct netns_frags *nf)
 	percpu_counter_init(&nf->mem, 0);
 }
 
-static inline int sum_frag_mem_limit(struct netns_frags *nf)
+static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf)
 {
-	int res;
+	unsigned int res;
 
 	local_bh_disable();
 	res = percpu_counter_sum_positive(&nf->mem);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index a25017247457..25c2170e1298 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -496,8 +496,8 @@ struct ip6_create_arg {
 	u8 ecn;
 };
 
-void ip6_frag_init(struct inet_frag_queue *q, void *a);
-bool ip6_frag_match(struct inet_frag_queue *q, void *a);
+void ip6_frag_init(struct inet_frag_queue *q, const void *a);
+bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
 
 /*
  *	Equivalent of ipv4 struct ip
diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c
index b85bd3f7048e..a5588be18024 100644
--- a/net/ieee802154/reassembly.c
+++ b/net/ieee802154/reassembly.c
@@ -61,18 +61,18 @@ static unsigned int lowpan_hash_frag(__be16 tag, u16 d_size,
 	return c & (INETFRAGS_HASHSZ - 1);
 }
 
-static unsigned int lowpan_hashfn(struct inet_frag_queue *q)
+static unsigned int lowpan_hashfn(const struct inet_frag_queue *q)
 {
-	struct lowpan_frag_queue *fq;
+	const struct lowpan_frag_queue *fq;
 
 	fq = container_of(q, struct lowpan_frag_queue, q);
 	return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr);
 }
 
-static bool lowpan_frag_match(struct inet_frag_queue *q, void *a)
+static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a)
 {
-	struct lowpan_frag_queue *fq;
-	struct lowpan_create_arg *arg = a;
+	const struct lowpan_frag_queue *fq;
+	const struct lowpan_create_arg *arg = a;
 
 	fq = container_of(q, struct lowpan_frag_queue, q);
 	return	fq->tag == arg->tag && fq->d_size == arg->d_size &&
@@ -80,10 +80,10 @@ static bool lowpan_frag_match(struct inet_frag_queue *q, void *a)
 		ieee802154_addr_equal(&fq->daddr, arg->dst);
 }
 
-static void lowpan_frag_init(struct inet_frag_queue *q, void *a)
+static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
 {
+	const struct lowpan_create_arg *arg = a;
 	struct lowpan_frag_queue *fq;
-	struct lowpan_create_arg *arg = a;
 
 	fq = container_of(q, struct lowpan_frag_queue, q);
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index ed32313e307c..586e6aaf9e6e 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -112,18 +112,18 @@ static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
 			    ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1);
 }
 
-static unsigned int ip4_hashfn(struct inet_frag_queue *q)
+static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
 {
-	struct ipq *ipq;
+	const struct ipq *ipq;
 
 	ipq = container_of(q, struct ipq, q);
 	return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
 }
 
-static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
+static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
 {
-	struct ipq *qp;
-	struct ip4_create_arg *arg = a;
+	const struct ipq *qp;
+	const struct ip4_create_arg *arg = a;
 
 	qp = container_of(q, struct ipq, q);
 	return	qp->id == arg->iph->id &&
@@ -133,14 +133,14 @@ static bool ip4_frag_match(struct inet_frag_queue *q, void *a)
 		qp->user == arg->user;
 }
 
-static void ip4_frag_init(struct inet_frag_queue *q, void *a)
+static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
 {
 	struct ipq *qp = container_of(q, struct ipq, q);
 	struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
 					       frags);
 	struct net *net = container_of(ipv4, struct net, ipv4);
 
-	struct ip4_create_arg *arg = a;
+	const struct ip4_create_arg *arg = a;
 
 	qp->protocol = arg->iph->protocol;
 	qp->id = arg->iph->id;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 0d5279fd852a..c2c3f2116bc5 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -156,7 +156,7 @@ static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr,
 }
 
 
-static unsigned int nf_hashfn(struct inet_frag_queue *q)
+static unsigned int nf_hashfn(const struct inet_frag_queue *q)
 {
 	const struct frag_queue *nq;
 
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index cc85a9ba5010..0c6932cc08cb 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -94,18 +94,18 @@ static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr,
 	return c & (INETFRAGS_HASHSZ - 1);
 }
 
-static unsigned int ip6_hashfn(struct inet_frag_queue *q)
+static unsigned int ip6_hashfn(const struct inet_frag_queue *q)
 {
-	struct frag_queue *fq;
+	const struct frag_queue *fq;
 
 	fq = container_of(q, struct frag_queue, q);
 	return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr);
 }
 
-bool ip6_frag_match(struct inet_frag_queue *q, void *a)
+bool ip6_frag_match(const struct inet_frag_queue *q, const void *a)
 {
-	struct frag_queue *fq;
-	struct ip6_create_arg *arg = a;
+	const struct frag_queue *fq;
+	const struct ip6_create_arg *arg = a;
 
 	fq = container_of(q, struct frag_queue, q);
 	return	fq->id == arg->id &&
@@ -115,10 +115,10 @@ bool ip6_frag_match(struct inet_frag_queue *q, void *a)
 }
 EXPORT_SYMBOL(ip6_frag_match);
 
-void ip6_frag_init(struct inet_frag_queue *q, void *a)
+void ip6_frag_init(struct inet_frag_queue *q, const void *a)
 {
 	struct frag_queue *fq = container_of(q, struct frag_queue, q);
-	struct ip6_create_arg *arg = a;
+	const struct ip6_create_arg *arg = a;
 
 	fq->id = arg->id;
 	fq->user = arg->user;
-- 
cgit v1.2.3


From fb3cfe6e75b9d05c87265e85e67d7caf6e5b44a7 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Thu, 24 Jul 2014 16:50:30 +0200
Subject: inet: frag: remove hash size assumptions from callers

hide actual hash size from individual users: The _find
function will now fold the given hash value into the required range.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ieee802154/reassembly.c             | 12 ++++--------
 net/ipv4/inet_fragment.c                | 13 ++++++++++---
 net/ipv4/ip_fragment.c                  |  2 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c |  7 ++-----
 net/ipv6/reassembly.c                   |  8 ++------
 5 files changed, 19 insertions(+), 23 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c
index a5588be18024..a707995fd4d7 100644
--- a/net/ieee802154/reassembly.c
+++ b/net/ieee802154/reassembly.c
@@ -50,15 +50,11 @@ static unsigned int lowpan_hash_frag(__be16 tag, u16 d_size,
 				     const struct ieee802154_addr *saddr,
 				     const struct ieee802154_addr *daddr)
 {
-	u32 c;
-
 	net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd));
-	c = jhash_3words(ieee802154_addr_hash(saddr),
-			 ieee802154_addr_hash(daddr),
-			 (__force u32)(tag + (d_size << 16)),
-			 lowpan_frags.rnd);
-
-	return c & (INETFRAGS_HASHSZ - 1);
+	return jhash_3words(ieee802154_addr_hash(saddr),
+			    ieee802154_addr_hash(daddr),
+			    (__force u32)(tag + (d_size << 16)),
+			    lowpan_frags.rnd);
 }
 
 static unsigned int lowpan_hashfn(const struct inet_frag_queue *q)
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 3b01959bf4bb..930d23870811 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -46,6 +46,12 @@ const u8 ip_frag_ecn_table[16] = {
 };
 EXPORT_SYMBOL(ip_frag_ecn_table);
 
+static unsigned int
+inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
+{
+	return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
+}
+
 static void inet_frag_secret_rebuild(unsigned long dummy)
 {
 	struct inet_frags *f = (struct inet_frags *)dummy;
@@ -63,7 +69,7 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
 
 		hb = &f->hash[i];
 		hlist_for_each_entry_safe(q, n, &hb->chain, list) {
-			unsigned int hval = f->hashfn(q);
+			unsigned int hval = inet_frag_hashfn(f, q);
 
 			if (hval != i) {
 				struct inet_frag_bucket *hb_dest;
@@ -133,7 +139,7 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
 	unsigned int hash;
 
 	read_lock(&f->lock);
-	hash = f->hashfn(fq);
+	hash = inet_frag_hashfn(f, fq);
 	hb = &f->hash[hash];
 
 	spin_lock(&hb->chain_lock);
@@ -252,7 +258,7 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 	 * the rnd seed, so we need to re-calculate the hash
 	 * chain. Fortunatelly the qp_in can be used to get one.
 	 */
-	hash = f->hashfn(qp_in);
+	hash = inet_frag_hashfn(f, qp_in);
 	hb = &f->hash[hash];
 	spin_lock(&hb->chain_lock);
 
@@ -326,6 +332,7 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 	struct inet_frag_queue *q;
 	int depth = 0;
 
+	hash &= (INETFRAGS_HASHSZ - 1);
 	hb = &f->hash[hash];
 
 	spin_lock(&hb->chain_lock);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 586e6aaf9e6e..b769eb6c83c0 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -109,7 +109,7 @@ static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
 	net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
 	return jhash_3words((__force u32)id << 16 | prot,
 			    (__force u32)saddr, (__force u32)daddr,
-			    ip4_frags.rnd) & (INETFRAGS_HASHSZ - 1);
+			    ip4_frags.rnd);
 }
 
 static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index c2c3f2116bc5..607e4a94ef41 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -147,12 +147,9 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
 static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr,
 				 const struct in6_addr *daddr)
 {
-	u32 c;
-
 	net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd));
-	c = jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
-			 (__force u32)id, nf_frags.rnd);
-	return c & (INETFRAGS_HASHSZ - 1);
+	return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
+			    (__force u32)id, nf_frags.rnd);
 }
 
 
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 0c6932cc08cb..2b76549a1016 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -85,13 +85,9 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr,
 				    const struct in6_addr *daddr)
 {
-	u32 c;
-
 	net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd));
-	c = jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
-			 (__force u32)id, ip6_frags.rnd);
-
-	return c & (INETFRAGS_HASHSZ - 1);
+	return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
+			    (__force u32)id, ip6_frags.rnd);
 }
 
 static unsigned int ip6_hashfn(const struct inet_frag_queue *q)
-- 
cgit v1.2.3


From 86e93e470cadedda9181a2bd9aee1d9d2e5e9c0f Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Thu, 24 Jul 2014 16:50:31 +0200
Subject: inet: frag: move evictor calls into frag_find function

First step to move eviction handling into a work queue.

We lose two spots that accounted evicted fragments in MIB counters.

Accounting will be restored since the upcoming work-queue evictor
invokes the frag queue timer callbacks instead.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h                 |  1 -
 net/ieee802154/reassembly.c             |  2 --
 net/ipv4/inet_fragment.c                | 16 +++++++++-------
 net/ipv4/ip_fragment.c                  | 15 ---------------
 net/ipv6/netfilter/nf_conntrack_reasm.c |  4 ----
 net/ipv6/reassembly.c                   |  6 ------
 6 files changed, 9 insertions(+), 35 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 15033057d44e..9fe644d1a26e 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -90,7 +90,6 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
 void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
 void inet_frag_destroy(struct inet_frag_queue *q,
 				struct inet_frags *f, int *work);
-int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force);
 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 		struct inet_frags *f, void *key, unsigned int hash)
 	__releases(&f->lock);
diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c
index a707995fd4d7..9503a48556f7 100644
--- a/net/ieee802154/reassembly.c
+++ b/net/ieee802154/reassembly.c
@@ -369,8 +369,6 @@ int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type)
 	if (frag_info->d_size > ieee802154_lowpan->max_dsize)
 		goto err;
 
-	inet_frag_evictor(&ieee802154_lowpan->frags, &lowpan_frags, false);
-
 	fq = fq_find(net, frag_info, &source, &dest);
 	if (fq != NULL) {
 		int ret;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 930d23870811..535636017534 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -46,6 +46,8 @@ const u8 ip_frag_ecn_table[16] = {
 };
 EXPORT_SYMBOL(ip_frag_ecn_table);
 
+static int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force);
+
 static unsigned int
 inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
 {
@@ -203,16 +205,11 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
 }
 EXPORT_SYMBOL(inet_frag_destroy);
 
-int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
+static int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
 {
 	struct inet_frag_queue *q;
 	int work, evicted = 0;
 
-	if (!force) {
-		if (frag_mem_limit(nf) <= nf->high_thresh)
-			return 0;
-	}
-
 	work = frag_mem_limit(nf) - nf->low_thresh;
 	while (work > 0 || force) {
 		spin_lock(&nf->lru_lock);
@@ -242,7 +239,6 @@ int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
 
 	return evicted;
 }
-EXPORT_SYMBOL(inet_frag_evictor);
 
 static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 		struct inet_frag_queue *qp_in, struct inet_frags *f,
@@ -296,6 +292,9 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 {
 	struct inet_frag_queue *q;
 
+	if (frag_mem_limit(nf) > nf->high_thresh)
+		return NULL;
+
 	q = kzalloc(f->qsize, GFP_ATOMIC);
 	if (q == NULL)
 		return NULL;
@@ -332,6 +331,9 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 	struct inet_frag_queue *q;
 	int depth = 0;
 
+	if (frag_mem_limit(nf) > nf->high_thresh)
+		inet_frag_evictor(nf, f, false);
+
 	hash &= (INETFRAGS_HASHSZ - 1);
 	hb = &f->hash[hash];
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b769eb6c83c0..54988672d00d 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -177,18 +177,6 @@ static void ipq_kill(struct ipq *ipq)
 	inet_frag_kill(&ipq->q, &ip4_frags);
 }
 
-/* Memory limiting on fragments.  Evictor trashes the oldest
- * fragment queue until we are back under the threshold.
- */
-static void ip_evictor(struct net *net)
-{
-	int evicted;
-
-	evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags, false);
-	if (evicted)
-		IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted);
-}
-
 /*
  * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
  */
@@ -655,9 +643,6 @@ int ip_defrag(struct sk_buff *skb, u32 user)
 	net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
 
-	/* Start by cleaning up the memory. */
-	ip_evictor(net);
-
 	/* Lookup (or create) queue header */
 	if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
 		int ret;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 607e4a94ef41..58e32cf91c95 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -594,10 +594,6 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
 	hdr = ipv6_hdr(clone);
 	fhdr = (struct frag_hdr *)skb_transport_header(clone);
 
-	local_bh_disable();
-	inet_frag_evictor(&net->nf_frag.frags, &nf_frags, false);
-	local_bh_enable();
-
 	fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
 		     ip6_frag_ecn(hdr));
 	if (fq == NULL) {
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 2b76549a1016..97acbc490d9e 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -519,7 +519,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 	struct frag_queue *fq;
 	const struct ipv6hdr *hdr = ipv6_hdr(skb);
 	struct net *net = dev_net(skb_dst(skb)->dev);
-	int evicted;
 
 	if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED)
 		goto fail_hdr;
@@ -548,11 +547,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 		return 1;
 	}
 
-	evicted = inet_frag_evictor(&net->ipv6.frags, &ip6_frags, false);
-	if (evicted)
-		IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)),
-				 IPSTATS_MIB_REASMFAILS, evicted);
-
 	fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr,
 		     ip6_frag_ecn(hdr));
 	if (fq != NULL) {
-- 
cgit v1.2.3


From b13d3cbfb8e8a8f53930af67d1ebf05149f32c24 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Thu, 24 Jul 2014 16:50:32 +0200
Subject: inet: frag: move eviction of queues to work queue

When the high_thresh limit is reached we try to toss the 'oldest'
incomplete fragment queues until memory limits are below the low_thresh
value.  This happens in softirq/packet processing context.

This has two drawbacks:

1) processors might evict a queue that was about to be completed
by another cpu, because they will compete wrt. resource usage and
resource reclaim.

2) LRU list maintenance is expensive.

But when constantly overloaded, even the 'least recently used' element is
recent, so removing 'lru' queue first is not 'fairer' than removing any
other fragment queue.

This moves eviction out of the fast path:

When the low threshold is reached, a work queue is scheduled
which then iterates over the table and removes the queues that exceed
the memory limits of the namespace. It sets a new flag called
INET_FRAG_EVICTED on the evicted queues so the proper counters will get
incremented when the queue is forcefully expired.

When the high threshold is reached, no more fragment queues are
created until we're below the limit again.

The LRU list is now unused and will be removed in a followup patch.

Joint work with Nikolay Aleksandrov.

Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |   4 +-
 include/net/inet_frag.h                |   6 +-
 net/ipv4/inet_fragment.c               | 142 +++++++++++++++++++++++----------
 net/ipv4/ip_fragment.c                 |   3 +-
 net/ipv6/reassembly.c                  |   4 +-
 5 files changed, 112 insertions(+), 47 deletions(-)

(limited to 'net/ipv4')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index f35bfe43bf7a..625c8dda4be7 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -104,7 +104,9 @@ ipfrag_high_thresh - INTEGER
 	is reached.
 
 ipfrag_low_thresh - INTEGER
-	See ipfrag_high_thresh
+	Maximum memory used to reassemble IP fragments before the kernel
+	begins to remove incomplete fragment queues to free up resources.
+	The kernel still accepts new fragments for defragmentation.
 
 ipfrag_time - INTEGER
 	Time in seconds to keep an IP fragment in memory.
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 9fe644d1a26e..e975032ea11b 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -32,6 +32,7 @@ struct inet_frag_queue {
 	int			meat;
 	__u8			last_in;    /* first/last segment arrived? */
 
+#define INET_FRAG_EVICTED	8
 #define INET_FRAG_COMPLETE	4
 #define INET_FRAG_FIRST_IN	2
 #define INET_FRAG_LAST_IN	1
@@ -48,7 +49,7 @@ struct inet_frag_queue {
  *	       rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
  *	       struct frag_queue))
  */
-#define INETFRAGS_MAXDEPTH		128
+#define INETFRAGS_MAXDEPTH	128
 
 struct inet_frag_bucket {
 	struct hlist_head	chain;
@@ -65,6 +66,9 @@ struct inet_frags {
 	int			secret_interval;
 	struct timer_list	secret_timer;
 
+	struct work_struct	frags_work;
+	unsigned int next_bucket;
+
 	/* The first call to hashfn is responsible to initialize
 	 * rnd. This is best done with net_get_random_once.
 	 */
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 535636017534..43315ecb9400 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -25,6 +25,9 @@
 #include <net/inet_frag.h>
 #include <net/inet_ecn.h>
 
+#define INETFRAGS_EVICT_BUCKETS   128
+#define INETFRAGS_EVICT_MAX	  512
+
 /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
  * Value : 0xff if frame should be dropped.
  *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
@@ -46,8 +49,6 @@ const u8 ip_frag_ecn_table[16] = {
 };
 EXPORT_SYMBOL(ip_frag_ecn_table);
 
-static int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force);
-
 static unsigned int
 inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
 {
@@ -89,10 +90,92 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
 	mod_timer(&f->secret_timer, now + f->secret_interval);
 }
 
+static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
+{
+	return q->net->low_thresh == 0 ||
+	       frag_mem_limit(q->net) >= q->net->low_thresh;
+}
+
+static unsigned int
+inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
+{
+	struct inet_frag_queue *fq;
+	struct hlist_node *n;
+	unsigned int evicted = 0;
+	HLIST_HEAD(expired);
+
+evict_again:
+	spin_lock(&hb->chain_lock);
+
+	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
+		if (!inet_fragq_should_evict(fq))
+			continue;
+
+		if (!del_timer(&fq->timer)) {
+			/* q expiring right now thus increment its refcount so
+			 * it won't be freed under us and wait until the timer
+			 * has finished executing then destroy it
+			 */
+			atomic_inc(&fq->refcnt);
+			spin_unlock(&hb->chain_lock);
+			del_timer_sync(&fq->timer);
+			WARN_ON(atomic_read(&fq->refcnt) != 1);
+			inet_frag_put(fq, f);
+			goto evict_again;
+		}
+
+		/* suppress xmit of (icmp) error packet */
+		fq->last_in &= ~INET_FRAG_FIRST_IN;
+		fq->last_in |= INET_FRAG_EVICTED;
+		hlist_del(&fq->list);
+		hlist_add_head(&fq->list, &expired);
+		++evicted;
+	}
+
+	spin_unlock(&hb->chain_lock);
+
+	hlist_for_each_entry_safe(fq, n, &expired, list)
+		f->frag_expire((unsigned long) fq);
+
+	return evicted;
+}
+
+static void inet_frag_worker(struct work_struct *work)
+{
+	unsigned int budget = INETFRAGS_EVICT_BUCKETS;
+	unsigned int i, evicted = 0;
+	struct inet_frags *f;
+
+	f = container_of(work, struct inet_frags, frags_work);
+
+	BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
+
+	read_lock_bh(&f->lock);
+
+	for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
+		evicted += inet_evict_bucket(f, &f->hash[i]);
+		i = (i + 1) & (INETFRAGS_HASHSZ - 1);
+		if (evicted > INETFRAGS_EVICT_MAX)
+			break;
+	}
+
+	f->next_bucket = i;
+
+	read_unlock_bh(&f->lock);
+}
+
+static void inet_frag_schedule_worker(struct inet_frags *f)
+{
+	if (unlikely(!work_pending(&f->frags_work)))
+		schedule_work(&f->frags_work);
+}
+
 void inet_frags_init(struct inet_frags *f)
 {
 	int i;
 
+	INIT_WORK(&f->frags_work, inet_frag_worker);
+
 	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
 		struct inet_frag_bucket *hb = &f->hash[i];
 
@@ -120,16 +203,22 @@ EXPORT_SYMBOL(inet_frags_init_net);
 void inet_frags_fini(struct inet_frags *f)
 {
 	del_timer(&f->secret_timer);
+	cancel_work_sync(&f->frags_work);
 }
 EXPORT_SYMBOL(inet_frags_fini);
 
 void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
 {
+	int i;
+
 	nf->low_thresh = 0;
 
-	local_bh_disable();
-	inet_frag_evictor(nf, f, true);
-	local_bh_enable();
+	read_lock_bh(&f->lock);
+
+	for (i = 0; i < INETFRAGS_HASHSZ ; i++)
+		inet_evict_bucket(f, &f->hash[i]);
+
+	read_unlock_bh(&f->lock);
 
 	percpu_counter_destroy(&nf->mem);
 }
@@ -205,41 +294,6 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
 }
 EXPORT_SYMBOL(inet_frag_destroy);
 
-static int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force)
-{
-	struct inet_frag_queue *q;
-	int work, evicted = 0;
-
-	work = frag_mem_limit(nf) - nf->low_thresh;
-	while (work > 0 || force) {
-		spin_lock(&nf->lru_lock);
-
-		if (list_empty(&nf->lru_list)) {
-			spin_unlock(&nf->lru_lock);
-			break;
-		}
-
-		q = list_first_entry(&nf->lru_list,
-				struct inet_frag_queue, lru_list);
-		atomic_inc(&q->refcnt);
-		/* Remove q from list to avoid several CPUs grabbing it */
-		list_del_init(&q->lru_list);
-
-		spin_unlock(&nf->lru_lock);
-
-		spin_lock(&q->lock);
-		if (!(q->last_in & INET_FRAG_COMPLETE))
-			inet_frag_kill(q, f);
-		spin_unlock(&q->lock);
-
-		if (atomic_dec_and_test(&q->refcnt))
-			inet_frag_destroy(q, f, &work);
-		evicted++;
-	}
-
-	return evicted;
-}
-
 static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 		struct inet_frag_queue *qp_in, struct inet_frags *f,
 		void *arg)
@@ -292,8 +346,10 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 {
 	struct inet_frag_queue *q;
 
-	if (frag_mem_limit(nf) > nf->high_thresh)
+	if (frag_mem_limit(nf) > nf->high_thresh) {
+		inet_frag_schedule_worker(f);
 		return NULL;
+	}
 
 	q = kzalloc(f->qsize, GFP_ATOMIC);
 	if (q == NULL)
@@ -331,8 +387,8 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 	struct inet_frag_queue *q;
 	int depth = 0;
 
-	if (frag_mem_limit(nf) > nf->high_thresh)
-		inet_frag_evictor(nf, f, false);
+	if (frag_mem_limit(nf) > nf->low_thresh)
+		inet_frag_schedule_worker(f);
 
 	hash &= (INETFRAGS_HASHSZ - 1);
 	hb = &f->hash[hash];
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 54988672d00d..54bd170c5eb4 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -195,7 +195,8 @@ static void ip_expire(unsigned long arg)
 
 	ipq_kill(qp);
 
-	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+	if (!(qp->q.last_in & INET_FRAG_EVICTED))
+		IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
 
 	if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 97acbc490d9e..b3924b10dff3 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -141,7 +141,9 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
 	if (!dev)
 		goto out_rcu_unlock;
 
-	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
+	if (!(fq->q.last_in & INET_FRAG_EVICTED))
+		IP6_INC_STATS_BH(net, __in6_dev_get(dev),
+				 IPSTATS_MIB_REASMTIMEOUT);
 	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
 
 	/* Don't send error if the first segment did not arrive. */
-- 
cgit v1.2.3


From 434d305405ab86414f6ea3f261307d443a2c3506 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Thu, 24 Jul 2014 16:50:33 +0200
Subject: inet: frag: don't account number of fragment queues

The 'nqueues' counter is protected by the lru list lock,
once thats removed this needs to be converted to atomic
counter.  Given this isn't used for anything except for
reporting it to userspace via /proc, just remove it.

We still report the memory currently used by fragment
reassembly queues.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h  | 3 ---
 include/net/ip.h         | 1 -
 include/net/ipv6.h       | 5 -----
 net/ipv4/inet_fragment.c | 1 -
 net/ipv4/ip_fragment.c   | 5 -----
 net/ipv4/proc.c          | 5 +++--
 net/ipv6/proc.c          | 4 ++--
 7 files changed, 5 insertions(+), 19 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index e975032ea11b..68de33765705 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -4,7 +4,6 @@
 #include <linux/percpu_counter.h>
 
 struct netns_frags {
-	int			nqueues;
 	struct list_head	lru_list;
 	spinlock_t		lru_lock;
 
@@ -158,7 +157,6 @@ static inline void inet_frag_lru_del(struct inet_frag_queue *q)
 {
 	spin_lock(&q->net->lru_lock);
 	list_del_init(&q->lru_list);
-	q->net->nqueues--;
 	spin_unlock(&q->net->lru_lock);
 }
 
@@ -167,7 +165,6 @@ static inline void inet_frag_lru_add(struct netns_frags *nf,
 {
 	spin_lock(&nf->lru_lock);
 	list_add_tail(&q->lru_list, &nf->lru_list);
-	q->net->nqueues++;
 	spin_unlock(&nf->lru_lock);
 }
 
diff --git a/include/net/ip.h b/include/net/ip.h
index 2e8f055989c3..ca14799545fd 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -495,7 +495,6 @@ static inline struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
 }
 #endif
 int ip_frag_mem(struct net *net);
-int ip_frag_nqueues(struct net *net);
 
 /*
  *	Functions provided by ip_forward.c
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 25c2170e1298..a2db816e8461 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -299,11 +299,6 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev)
 }
 
 #if IS_ENABLED(CONFIG_IPV6)
-static inline int ip6_frag_nqueues(struct net *net)
-{
-	return net->ipv6.frags.nqueues;
-}
-
 static inline int ip6_frag_mem(struct net *net)
 {
 	return sum_frag_mem_limit(&net->ipv6.frags);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 43315ecb9400..231ca0b40811 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -193,7 +193,6 @@ EXPORT_SYMBOL(inet_frags_init);
 
 void inet_frags_init_net(struct netns_frags *nf)
 {
-	nf->nqueues = 0;
 	init_frag_mem_limit(nf);
 	INIT_LIST_HEAD(&nf->lru_list);
 	spin_lock_init(&nf->lru_lock);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 54bd170c5eb4..1f42c2e3966b 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -86,11 +86,6 @@ static inline u8 ip4_frag_ecn(u8 tos)
 
 static struct inet_frags ip4_frags;
 
-int ip_frag_nqueues(struct net *net)
-{
-	return net->ipv4.frags.nqueues;
-}
-
 int ip_frag_mem(struct net *net)
 {
 	return sum_frag_mem_limit(&net->ipv4.frags);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index ae0af9386f7c..8e3eb39f84e7 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -52,6 +52,7 @@
 static int sockstat_seq_show(struct seq_file *seq, void *v)
 {
 	struct net *net = seq->private;
+	unsigned int frag_mem;
 	int orphans, sockets;
 
 	local_bh_disable();
@@ -71,8 +72,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 		   sock_prot_inuse_get(net, &udplite_prot));
 	seq_printf(seq, "RAW: inuse %d\n",
 		   sock_prot_inuse_get(net, &raw_prot));
-	seq_printf(seq,  "FRAG: inuse %d memory %d\n",
-			ip_frag_nqueues(net), ip_frag_mem(net));
+	frag_mem = ip_frag_mem(net);
+	seq_printf(seq,  "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem);
 	return 0;
 }
 
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 3317440ea341..2d6f860e5c1e 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -33,6 +33,7 @@
 static int sockstat6_seq_show(struct seq_file *seq, void *v)
 {
 	struct net *net = seq->private;
+	unsigned int frag_mem = ip6_frag_mem(net);
 
 	seq_printf(seq, "TCP6: inuse %d\n",
 		       sock_prot_inuse_get(net, &tcpv6_prot));
@@ -42,8 +43,7 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
 			sock_prot_inuse_get(net, &udplitev6_prot));
 	seq_printf(seq, "RAW6: inuse %d\n",
 		       sock_prot_inuse_get(net, &rawv6_prot));
-	seq_printf(seq, "FRAG6: inuse %d memory %d\n",
-		       ip6_frag_nqueues(net), ip6_frag_mem(net));
+	seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 3fd588eb90bfbba17091381006ecafe29c45db4a Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Thu, 24 Jul 2014 16:50:34 +0200
Subject: inet: frag: remove lru list

no longer used.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h                 | 32 ++------------------------------
 net/ieee802154/reassembly.c             |  1 -
 net/ipv4/inet_fragment.c                | 12 ++----------
 net/ipv4/ip_fragment.c                  |  1 -
 net/ipv6/netfilter/nf_conntrack_reasm.c |  1 -
 net/ipv6/reassembly.c                   |  1 -
 6 files changed, 4 insertions(+), 44 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 68de33765705..90d21ea62c59 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -4,9 +4,6 @@
 #include <linux/percpu_counter.h>
 
 struct netns_frags {
-	struct list_head	lru_list;
-	spinlock_t		lru_lock;
-
 	/* The percpu_counter "mem" need to be cacheline aligned.
 	 *  mem.count must not share cacheline with other writers
 	 */
@@ -21,7 +18,6 @@ struct netns_frags {
 struct inet_frag_queue {
 	spinlock_t		lock;
 	struct timer_list	timer;      /* when will this queue expire? */
-	struct list_head	lru_list;   /* lru list member */
 	struct hlist_node	list;
 	atomic_t		refcnt;
 	struct sk_buff		*fragments; /* list of received fragments */
@@ -91,8 +87,7 @@ void inet_frags_init_net(struct netns_frags *nf);
 void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
 
 void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
-void inet_frag_destroy(struct inet_frag_queue *q,
-				struct inet_frags *f, int *work);
+void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f);
 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 		struct inet_frags *f, void *key, unsigned int hash)
 	__releases(&f->lock);
@@ -102,7 +97,7 @@ void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
 static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
 {
 	if (atomic_dec_and_test(&q->refcnt))
-		inet_frag_destroy(q, f, NULL);
+		inet_frag_destroy(q, f);
 }
 
 /* Memory Tracking Functions. */
@@ -145,29 +140,6 @@ static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf)
 	return res;
 }
 
-static inline void inet_frag_lru_move(struct inet_frag_queue *q)
-{
-	spin_lock(&q->net->lru_lock);
-	if (!list_empty(&q->lru_list))
-		list_move_tail(&q->lru_list, &q->net->lru_list);
-	spin_unlock(&q->net->lru_lock);
-}
-
-static inline void inet_frag_lru_del(struct inet_frag_queue *q)
-{
-	spin_lock(&q->net->lru_lock);
-	list_del_init(&q->lru_list);
-	spin_unlock(&q->net->lru_lock);
-}
-
-static inline void inet_frag_lru_add(struct netns_frags *nf,
-				     struct inet_frag_queue *q)
-{
-	spin_lock(&nf->lru_lock);
-	list_add_tail(&q->lru_list, &nf->lru_list);
-	spin_unlock(&nf->lru_lock);
-}
-
 /* RFC 3168 support :
  * We want to check ECN values of all fragments, do detect invalid combinations.
  * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c
index 9503a48556f7..b4bc7a50eccf 100644
--- a/net/ieee802154/reassembly.c
+++ b/net/ieee802154/reassembly.c
@@ -219,7 +219,6 @@ found:
 		return res;
 	}
 
-	inet_frag_lru_move(&fq->q);
 	return -1;
 err:
 	kfree_skb(skb);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 231ca0b40811..198a5ed7a815 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -194,8 +194,6 @@ EXPORT_SYMBOL(inet_frags_init);
 void inet_frags_init_net(struct netns_frags *nf)
 {
 	init_frag_mem_limit(nf);
-	INIT_LIST_HEAD(&nf->lru_list);
-	spin_lock_init(&nf->lru_lock);
 }
 EXPORT_SYMBOL(inet_frags_init_net);
 
@@ -237,7 +235,6 @@ static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
 	spin_unlock(&hb->chain_lock);
 
 	read_unlock(&f->lock);
-	inet_frag_lru_del(fq);
 }
 
 void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
@@ -261,8 +258,7 @@ static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
 	kfree_skb(skb);
 }
 
-void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
-					int *work)
+void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
 {
 	struct sk_buff *fp;
 	struct netns_frags *nf;
@@ -282,14 +278,11 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f,
 		fp = xp;
 	}
 	sum = sum_truesize + f->qsize;
-	if (work)
-		*work -= sum;
 	sub_frag_mem_limit(q, sum);
 
 	if (f->destructor)
 		f->destructor(q);
 	kfree(q);
-
 }
 EXPORT_SYMBOL(inet_frag_destroy);
 
@@ -333,7 +326,7 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 
 	atomic_inc(&qp->refcnt);
 	hlist_add_head(&qp->list, &hb->chain);
-	inet_frag_lru_add(nf, qp);
+
 	spin_unlock(&hb->chain_lock);
 	read_unlock(&f->lock);
 
@@ -361,7 +354,6 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
 	spin_lock_init(&q->lock);
 	atomic_set(&q->refcnt, 1);
-	INIT_LIST_HEAD(&q->lru_list);
 
 	return q;
 }
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 1f42c2e3966b..8fbeee495037 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -489,7 +489,6 @@ found:
 	}
 
 	skb_dst_drop(skb);
-	inet_frag_lru_move(&qp->q);
 	return -EINPROGRESS;
 
 err:
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 58e32cf91c95..fb0f72a0ff31 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -349,7 +349,6 @@ found:
 		fq->q.last_in |= INET_FRAG_FIRST_IN;
 	}
 
-	inet_frag_lru_move(&fq->q);
 	return 0;
 
 discard_fq:
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index b3924b10dff3..af85551682c2 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -351,7 +351,6 @@ found:
 	}
 
 	skb_dst_drop(skb);
-	inet_frag_lru_move(&fq->q);
 	return -1;
 
 discard_fq:
-- 
cgit v1.2.3


From e3a57d18b06179d68fcf7a0a06ad844493c65e06 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Thu, 24 Jul 2014 16:50:35 +0200
Subject: inet: frag: remove periodic secret rebuild timer

merge functionality into the eviction workqueue.

Instead of rebuilding every n seconds, take advantage of the upper
hash chain length limit.

If we hit it, mark table for rebuild and schedule workqueue.
To prevent frequent rebuilds when we're completely overloaded,
don't rebuild more than once every 5 seconds.

ipfrag_secret_interval sysctl is now obsolete and has been marked as
deprecated, it still can be changed so scripts won't be broken but it
won't have any effect. A comment is left above each unused secret_timer
variable to avoid confusion.

Joint work with Nikolay Aleksandrov.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt  | 10 --------
 include/net/inet_frag.h                 |  4 +--
 net/ieee802154/reassembly.c             |  5 ++--
 net/ipv4/inet_fragment.c                | 43 ++++++++++++++++++++++-----------
 net/ipv4/ip_fragment.c                  |  5 ++--
 net/ipv6/netfilter/nf_conntrack_reasm.c |  1 -
 net/ipv6/reassembly.c                   |  5 ++--
 7 files changed, 40 insertions(+), 33 deletions(-)

(limited to 'net/ipv4')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 625c8dda4be7..e8c304e37831 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -111,11 +111,6 @@ ipfrag_low_thresh - INTEGER
 ipfrag_time - INTEGER
 	Time in seconds to keep an IP fragment in memory.
 
-ipfrag_secret_interval - INTEGER
-	Regeneration interval (in seconds) of the hash secret (or lifetime
-	for the hash secret) for IP fragments.
-	Default: 600
-
 ipfrag_max_dist - INTEGER
 	ipfrag_max_dist is a non-negative integer value which defines the
 	maximum "disorder" which is allowed among fragments which share a
@@ -1164,11 +1159,6 @@ ip6frag_low_thresh - INTEGER
 ip6frag_time - INTEGER
 	Time in seconds to keep an IPv6 fragment in memory.
 
-ip6frag_secret_interval - INTEGER
-	Regeneration interval (in seconds) of the hash secret (or lifetime
-	for the hash secret) for IPv6 fragments.
-	Default: 600
-
 conf/default/*:
 	Change the interface-specific default settings.
 
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 90d21ea62c59..d9cc5bb64854 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -58,11 +58,11 @@ struct inet_frags {
 	 * Its primarily a rebuild protection rwlock.
 	 */
 	rwlock_t		lock ____cacheline_aligned_in_smp;
-	int			secret_interval;
-	struct timer_list	secret_timer;
 
 	struct work_struct	frags_work;
 	unsigned int next_bucket;
+	unsigned long last_rebuild_jiffies;
+	bool rebuild;
 
 	/* The first call to hashfn is responsible to initialize
 	 * rnd. This is best done with net_get_random_once.
diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c
index b4bc7a50eccf..20d219682d84 100644
--- a/net/ieee802154/reassembly.c
+++ b/net/ieee802154/reassembly.c
@@ -419,10 +419,12 @@ static struct ctl_table lowpan_frags_ns_ctl_table[] = {
 	{ }
 };
 
+/* secret interval has been deprecated */
+static int lowpan_frags_secret_interval_unused;
 static struct ctl_table lowpan_frags_ctl_table[] = {
 	{
 		.procname	= "6lowpanfrag_secret_interval",
-		.data		= &lowpan_frags.secret_interval,
+		.data		= &lowpan_frags_secret_interval_unused,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
@@ -562,7 +564,6 @@ int __init lowpan_net_frag_init(void)
 	lowpan_frags.qsize = sizeof(struct frag_queue);
 	lowpan_frags.match = lowpan_frag_match;
 	lowpan_frags.frag_expire = lowpan_frag_expire;
-	lowpan_frags.secret_interval = 10 * 60 * HZ;
 	inet_frags_init(&lowpan_frags);
 
 	return ret;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 198a5ed7a815..58d4c38534f6 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -28,6 +28,9 @@
 #define INETFRAGS_EVICT_BUCKETS   128
 #define INETFRAGS_EVICT_MAX	  512
 
+/* don't rebuild inetfrag table with new secret more often than this */
+#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
+
 /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
  * Value : 0xff if frame should be dropped.
  *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
@@ -55,16 +58,24 @@ inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
 	return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
 }
 
-static void inet_frag_secret_rebuild(unsigned long dummy)
+static bool inet_frag_may_rebuild(struct inet_frags *f)
+{
+	return time_after(jiffies,
+	       f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
+}
+
+static void inet_frag_secret_rebuild(struct inet_frags *f)
 {
-	struct inet_frags *f = (struct inet_frags *)dummy;
-	unsigned long now = jiffies;
 	int i;
 
 	/* Per bucket lock NOT needed here, due to write lock protection */
-	write_lock(&f->lock);
+	write_lock_bh(&f->lock);
+
+	if (!inet_frag_may_rebuild(f))
+		goto out;
 
 	get_random_bytes(&f->rnd, sizeof(u32));
+
 	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
 		struct inet_frag_bucket *hb;
 		struct inet_frag_queue *q;
@@ -85,9 +96,11 @@ static void inet_frag_secret_rebuild(unsigned long dummy)
 			}
 		}
 	}
-	write_unlock(&f->lock);
 
-	mod_timer(&f->secret_timer, now + f->secret_interval);
+	f->rebuild = false;
+	f->last_rebuild_jiffies = jiffies;
+out:
+	write_unlock_bh(&f->lock);
 }
 
 static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
@@ -162,6 +175,8 @@ static void inet_frag_worker(struct work_struct *work)
 	f->next_bucket = i;
 
 	read_unlock_bh(&f->lock);
+	if (f->rebuild && inet_frag_may_rebuild(f))
+		inet_frag_secret_rebuild(f);
 }
 
 static void inet_frag_schedule_worker(struct inet_frags *f)
@@ -183,11 +198,7 @@ void inet_frags_init(struct inet_frags *f)
 		INIT_HLIST_HEAD(&hb->chain);
 	}
 	rwlock_init(&f->lock);
-
-	setup_timer(&f->secret_timer, inet_frag_secret_rebuild,
-			(unsigned long)f);
-	f->secret_timer.expires = jiffies + f->secret_interval;
-	add_timer(&f->secret_timer);
+	f->last_rebuild_jiffies = 0;
 }
 EXPORT_SYMBOL(inet_frags_init);
 
@@ -199,7 +210,6 @@ EXPORT_SYMBOL(inet_frags_init_net);
 
 void inet_frags_fini(struct inet_frags *f)
 {
-	del_timer(&f->secret_timer);
 	cancel_work_sync(&f->frags_work);
 }
 EXPORT_SYMBOL(inet_frags_fini);
@@ -399,8 +409,13 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 
 	if (depth <= INETFRAGS_MAXDEPTH)
 		return inet_frag_create(nf, f, key);
-	else
-		return ERR_PTR(-ENOBUFS);
+
+	if (inet_frag_may_rebuild(f)) {
+		f->rebuild = true;
+		inet_frag_schedule_worker(f);
+	}
+
+	return ERR_PTR(-ENOBUFS);
 }
 EXPORT_SYMBOL(inet_frag_find);
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8fbeee495037..44e591a7e03f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -720,10 +720,12 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
 	{ }
 };
 
+/* secret interval has been deprecated */
+static int ip4_frags_secret_interval_unused;
 static struct ctl_table ip4_frags_ctl_table[] = {
 	{
 		.procname	= "ipfrag_secret_interval",
-		.data		= &ip4_frags.secret_interval,
+		.data		= &ip4_frags_secret_interval_unused,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
@@ -853,6 +855,5 @@ void __init ipfrag_init(void)
 	ip4_frags.qsize = sizeof(struct ipq);
 	ip4_frags.match = ip4_frag_match;
 	ip4_frags.frag_expire = ip_expire;
-	ip4_frags.secret_interval = 10 * 60 * HZ;
 	inet_frags_init(&ip4_frags);
 }
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index fb0f72a0ff31..3b3ef9774cc2 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -669,7 +669,6 @@ int nf_ct_frag6_init(void)
 	nf_frags.qsize = sizeof(struct frag_queue);
 	nf_frags.match = ip6_frag_match;
 	nf_frags.frag_expire = nf_ct_frag6_expire;
-	nf_frags.secret_interval = 10 * 60 * HZ;
 	inet_frags_init(&nf_frags);
 
 	ret = register_pernet_subsys(&nf_ct_net_ops);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index af85551682c2..987fea46b915 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -604,10 +604,12 @@ static struct ctl_table ip6_frags_ns_ctl_table[] = {
 	{ }
 };
 
+/* secret interval has been deprecated */
+static int ip6_frags_secret_interval_unused;
 static struct ctl_table ip6_frags_ctl_table[] = {
 	{
 		.procname	= "ip6frag_secret_interval",
-		.data		= &ip6_frags.secret_interval,
+		.data		= &ip6_frags_secret_interval_unused,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
@@ -737,7 +739,6 @@ int __init ipv6_frag_init(void)
 	ip6_frags.qsize = sizeof(struct frag_queue);
 	ip6_frags.match = ip6_frag_match;
 	ip6_frags.frag_expire = ip6_frag_expire;
-	ip6_frags.secret_interval = 10 * 60 * HZ;
 	inet_frags_init(&ip6_frags);
 out:
 	return ret;
-- 
cgit v1.2.3


From ab1c724f633080ed2e8a0cfe61654599b55cf8f9 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Thu, 24 Jul 2014 16:50:36 +0200
Subject: inet: frag: use seqlock for hash rebuild

rehash is rare operation, don't force readers to take
the read-side rwlock.

Instead, we only have to detect the (rare) case where
the secret was altered while we are trying to insert
a new inetfrag queue into the table.

If it was changed, drop the bucket lock and recompute
the hash to get the 'new' chain bucket that we have to
insert into.

Joint work with Nikolay Aleksandrov.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h                 | 13 +++--
 net/ieee802154/reassembly.c             |  1 -
 net/ipv4/inet_fragment.c                | 88 ++++++++++++++++++++-------------
 net/ipv4/ip_fragment.c                  |  1 -
 net/ipv6/netfilter/nf_conntrack_reasm.c |  2 +-
 net/ipv6/reassembly.c                   |  1 -
 6 files changed, 62 insertions(+), 44 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index d9cc5bb64854..6f4930a0b660 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -53,11 +53,6 @@ struct inet_frag_bucket {
 
 struct inet_frags {
 	struct inet_frag_bucket	hash[INETFRAGS_HASHSZ];
-	/* This rwlock is a global lock (seperate per IPv4, IPv6 and
-	 * netfilter). Important to keep this on a seperate cacheline.
-	 * Its primarily a rebuild protection rwlock.
-	 */
-	rwlock_t		lock ____cacheline_aligned_in_smp;
 
 	struct work_struct	frags_work;
 	unsigned int next_bucket;
@@ -66,8 +61,12 @@ struct inet_frags {
 
 	/* The first call to hashfn is responsible to initialize
 	 * rnd. This is best done with net_get_random_once.
+	 *
+	 * rnd_seqlock is used to let hash insertion detect
+	 * when it needs to re-lookup the hash chain to use.
 	 */
 	u32			rnd;
+	seqlock_t		rnd_seqlock;
 	int			qsize;
 
 	unsigned int		(*hashfn)(const struct inet_frag_queue *);
@@ -89,8 +88,8 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
 void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
 void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f);
 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
-		struct inet_frags *f, void *key, unsigned int hash)
-	__releases(&f->lock);
+		struct inet_frags *f, void *key, unsigned int hash);
+
 void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
 				   const char *prefix);
 
diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c
index 20d219682d84..8da635d92a58 100644
--- a/net/ieee802154/reassembly.c
+++ b/net/ieee802154/reassembly.c
@@ -124,7 +124,6 @@ fq_find(struct net *net, const struct lowpan_frag_info *frag_info,
 	arg.src = src;
 	arg.dst = dst;
 
-	read_lock(&lowpan_frags.lock);
 	hash = lowpan_hash_frag(frag_info->d_tag, frag_info->d_size, src, dst);
 
 	q = inet_frag_find(&ieee802154_lowpan->frags,
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 58d4c38534f6..62b1f73749dc 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -68,8 +68,7 @@ static void inet_frag_secret_rebuild(struct inet_frags *f)
 {
 	int i;
 
-	/* Per bucket lock NOT needed here, due to write lock protection */
-	write_lock_bh(&f->lock);
+	write_seqlock_bh(&f->rnd_seqlock);
 
 	if (!inet_frag_may_rebuild(f))
 		goto out;
@@ -82,6 +81,8 @@ static void inet_frag_secret_rebuild(struct inet_frags *f)
 		struct hlist_node *n;
 
 		hb = &f->hash[i];
+		spin_lock(&hb->chain_lock);
+
 		hlist_for_each_entry_safe(q, n, &hb->chain, list) {
 			unsigned int hval = inet_frag_hashfn(f, q);
 
@@ -92,15 +93,28 @@ static void inet_frag_secret_rebuild(struct inet_frags *f)
 
 				/* Relink to new hash chain. */
 				hb_dest = &f->hash[hval];
+
+				/* This is the only place where we take
+				 * another chain_lock while already holding
+				 * one.  As this will not run concurrently,
+				 * we cannot deadlock on hb_dest lock below, if its
+				 * already locked it will be released soon since
+				 * other caller cannot be waiting for hb lock
+				 * that we've taken above.
+				 */
+				spin_lock_nested(&hb_dest->chain_lock,
+						 SINGLE_DEPTH_NESTING);
 				hlist_add_head(&q->list, &hb_dest->chain);
+				spin_unlock(&hb_dest->chain_lock);
 			}
 		}
+		spin_unlock(&hb->chain_lock);
 	}
 
 	f->rebuild = false;
 	f->last_rebuild_jiffies = jiffies;
 out:
-	write_unlock_bh(&f->lock);
+	write_sequnlock_bh(&f->rnd_seqlock);
 }
 
 static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
@@ -163,7 +177,7 @@ static void inet_frag_worker(struct work_struct *work)
 
 	BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
 
-	read_lock_bh(&f->lock);
+	local_bh_disable();
 
 	for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
 		evicted += inet_evict_bucket(f, &f->hash[i]);
@@ -174,7 +188,8 @@ static void inet_frag_worker(struct work_struct *work)
 
 	f->next_bucket = i;
 
-	read_unlock_bh(&f->lock);
+	local_bh_enable();
+
 	if (f->rebuild && inet_frag_may_rebuild(f))
 		inet_frag_secret_rebuild(f);
 }
@@ -197,7 +212,8 @@ void inet_frags_init(struct inet_frags *f)
 		spin_lock_init(&hb->chain_lock);
 		INIT_HLIST_HEAD(&hb->chain);
 	}
-	rwlock_init(&f->lock);
+
+	seqlock_init(&f->rnd_seqlock);
 	f->last_rebuild_jiffies = 0;
 }
 EXPORT_SYMBOL(inet_frags_init);
@@ -216,35 +232,56 @@ EXPORT_SYMBOL(inet_frags_fini);
 
 void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
 {
+	unsigned int seq;
 	int i;
 
 	nf->low_thresh = 0;
+	local_bh_disable();
 
-	read_lock_bh(&f->lock);
+evict_again:
+	seq = read_seqbegin(&f->rnd_seqlock);
 
 	for (i = 0; i < INETFRAGS_HASHSZ ; i++)
 		inet_evict_bucket(f, &f->hash[i]);
 
-	read_unlock_bh(&f->lock);
+	if (read_seqretry(&f->rnd_seqlock, seq))
+		goto evict_again;
+
+	local_bh_enable();
 
 	percpu_counter_destroy(&nf->mem);
 }
 EXPORT_SYMBOL(inet_frags_exit_net);
 
-static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
+static struct inet_frag_bucket *
+get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
+__acquires(hb->chain_lock)
 {
 	struct inet_frag_bucket *hb;
-	unsigned int hash;
+	unsigned int seq, hash;
+
+ restart:
+	seq = read_seqbegin(&f->rnd_seqlock);
 
-	read_lock(&f->lock);
 	hash = inet_frag_hashfn(f, fq);
 	hb = &f->hash[hash];
 
 	spin_lock(&hb->chain_lock);
+	if (read_seqretry(&f->rnd_seqlock, seq)) {
+		spin_unlock(&hb->chain_lock);
+		goto restart;
+	}
+
+	return hb;
+}
+
+static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
+{
+	struct inet_frag_bucket *hb;
+
+	hb = get_frag_bucket_locked(fq, f);
 	hlist_del(&fq->list);
 	spin_unlock(&hb->chain_lock);
-
-	read_unlock(&f->lock);
 }
 
 void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
@@ -300,30 +337,18 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 		struct inet_frag_queue *qp_in, struct inet_frags *f,
 		void *arg)
 {
-	struct inet_frag_bucket *hb;
+	struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
 	struct inet_frag_queue *qp;
-	unsigned int hash;
-
-	read_lock(&f->lock); /* Protects against hash rebuild */
-	/*
-	 * While we stayed w/o the lock other CPU could update
-	 * the rnd seed, so we need to re-calculate the hash
-	 * chain. Fortunatelly the qp_in can be used to get one.
-	 */
-	hash = inet_frag_hashfn(f, qp_in);
-	hb = &f->hash[hash];
-	spin_lock(&hb->chain_lock);
 
 #ifdef CONFIG_SMP
 	/* With SMP race we have to recheck hash table, because
-	 * such entry could be created on other cpu, while we
-	 * released the hash bucket lock.
+	 * such entry could have been created on other cpu before
+	 * we acquired hash bucket lock.
 	 */
 	hlist_for_each_entry(qp, &hb->chain, list) {
 		if (qp->net == nf && f->match(qp, arg)) {
 			atomic_inc(&qp->refcnt);
 			spin_unlock(&hb->chain_lock);
-			read_unlock(&f->lock);
 			qp_in->last_in |= INET_FRAG_COMPLETE;
 			inet_frag_put(qp_in, f);
 			return qp;
@@ -338,7 +363,6 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 	hlist_add_head(&qp->list, &hb->chain);
 
 	spin_unlock(&hb->chain_lock);
-	read_unlock(&f->lock);
 
 	return qp;
 }
@@ -382,7 +406,6 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
 
 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 		struct inet_frags *f, void *key, unsigned int hash)
-	__releases(&f->lock)
 {
 	struct inet_frag_bucket *hb;
 	struct inet_frag_queue *q;
@@ -399,19 +422,18 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 		if (q->net == nf && f->match(q, key)) {
 			atomic_inc(&q->refcnt);
 			spin_unlock(&hb->chain_lock);
-			read_unlock(&f->lock);
 			return q;
 		}
 		depth++;
 	}
 	spin_unlock(&hb->chain_lock);
-	read_unlock(&f->lock);
 
 	if (depth <= INETFRAGS_MAXDEPTH)
 		return inet_frag_create(nf, f, key);
 
 	if (inet_frag_may_rebuild(f)) {
-		f->rebuild = true;
+		if (!f->rebuild)
+			f->rebuild = true;
 		inet_frag_schedule_worker(f);
 	}
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 44e591a7e03f..ccee68dffd6e 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -244,7 +244,6 @@ static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
 	arg.iph = iph;
 	arg.user = user;
 
-	read_lock(&ip4_frags.lock);
 	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
 
 	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 3b3ef9774cc2..4d9da1e35f8c 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -193,7 +193,7 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id,
 	arg.dst = dst;
 	arg.ecn = ecn;
 
-	read_lock_bh(&nf_frags.lock);
+	local_bh_disable();
 	hash = nf_hash_frag(id, src, dst);
 
 	q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 987fea46b915..57a9707b2032 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -190,7 +190,6 @@ fq_find(struct net *net, __be32 id, const struct in6_addr *src,
 	arg.dst = dst;
 	arg.ecn = ecn;
 
-	read_lock(&ip6_frags.lock);
 	hash = inet6_hash_frag(id, src, dst);
 
 	q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash);
-- 
cgit v1.2.3


From 1bab4c75075b84675b96992ac47580a57c26958d Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov
Date: Thu, 24 Jul 2014 16:50:37 +0200
Subject: inet: frag: set limits and make init_net's high_thresh limit global

This patch makes init_net's high_thresh limit to be the maximum for all
namespaces, thus introducing a global memory limit threshold equal to the
sum of the individual high_thresh limits which are capped.
It also introduces some sane minimums for low_thresh as it shouldn't be
able to drop below 0 (or > high_thresh in the unsigned case), and
overall low_thresh should not ever be above high_thresh, so we make the
following relations for a namespace:
init_net:
 high_thresh - max(not capped), min(init_net low_thresh)
 low_thresh - max(init_net high_thresh), min (0)

all other namespaces:
 high_thresh = max(init_net high_thresh), min(namespace's low_thresh)
 low_thresh = max(namespace's high_thresh), min(0)

The major issue with having low_thresh > high_thresh is that we'll
schedule eviction but never evict anything and thus rely only on the
timers.

Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt  |  3 ++-
 net/ieee802154/reassembly.c             | 12 ++++++++++--
 net/ipv4/ip_fragment.c                  | 10 ++++++++--
 net/ipv6/netfilter/nf_conntrack_reasm.c | 12 ++++++++++--
 net/ipv6/reassembly.c                   | 12 ++++++++++--
 5 files changed, 40 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index e8c304e37831..29a93518bf18 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -101,7 +101,8 @@ ipfrag_high_thresh - INTEGER
 	Maximum memory used to reassemble IP fragments. When
 	ipfrag_high_thresh bytes of memory is allocated for this purpose,
 	the fragment handler will toss packets until ipfrag_low_thresh
-	is reached.
+	is reached. This also serves as a maximum limit to namespaces
+	different from the initial one.
 
 ipfrag_low_thresh - INTEGER
 	Maximum memory used to reassemble IP fragments before the kernel
diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c
index 8da635d92a58..f13d4f32e207 100644
--- a/net/ieee802154/reassembly.c
+++ b/net/ieee802154/reassembly.c
@@ -386,20 +386,25 @@ err:
 EXPORT_SYMBOL(lowpan_frag_rcv);
 
 #ifdef CONFIG_SYSCTL
+static int zero;
+
 static struct ctl_table lowpan_frags_ns_ctl_table[] = {
 	{
 		.procname	= "6lowpanfrag_high_thresh",
 		.data		= &init_net.ieee802154_lowpan.frags.high_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &init_net.ieee802154_lowpan.frags.low_thresh
 	},
 	{
 		.procname	= "6lowpanfrag_low_thresh",
 		.data		= &init_net.ieee802154_lowpan.frags.low_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &init_net.ieee802154_lowpan.frags.high_thresh
 	},
 	{
 		.procname	= "6lowpanfrag_time",
@@ -446,7 +451,10 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net)
 			goto err_alloc;
 
 		table[0].data = &ieee802154_lowpan->frags.high_thresh;
+		table[0].extra1 = &ieee802154_lowpan->frags.low_thresh;
+		table[0].extra2 = &init_net.ieee802154_lowpan.frags.high_thresh;
 		table[1].data = &ieee802154_lowpan->frags.low_thresh;
+		table[1].extra2 = &ieee802154_lowpan->frags.high_thresh;
 		table[2].data = &ieee802154_lowpan->frags.timeout;
 		table[3].data = &ieee802154_lowpan->max_dsize;
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index ccee68dffd6e..634fc31aa243 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -700,14 +700,17 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
 		.data		= &init_net.ipv4.frags.high_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &init_net.ipv4.frags.low_thresh
 	},
 	{
 		.procname	= "ipfrag_low_thresh",
 		.data		= &init_net.ipv4.frags.low_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &init_net.ipv4.frags.high_thresh
 	},
 	{
 		.procname	= "ipfrag_time",
@@ -752,7 +755,10 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
 			goto err_alloc;
 
 		table[0].data = &net->ipv4.frags.high_thresh;
+		table[0].extra1 = &net->ipv4.frags.low_thresh;
+		table[0].extra2 = &init_net.ipv4.frags.high_thresh;
 		table[1].data = &net->ipv4.frags.low_thresh;
+		table[1].extra2 = &net->ipv4.frags.high_thresh;
 		table[2].data = &net->ipv4.frags.timeout;
 
 		/* Don't export sysctls to unprivileged users */
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 4d9da1e35f8c..3d4bccf6d67d 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -63,6 +63,8 @@ struct nf_ct_frag6_skb_cb
 static struct inet_frags nf_frags;
 
 #ifdef CONFIG_SYSCTL
+static int zero;
+
 static struct ctl_table nf_ct_frag6_sysctl_table[] = {
 	{
 		.procname	= "nf_conntrack_frag6_timeout",
@@ -76,14 +78,17 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = {
 		.data		= &init_net.nf_frag.frags.low_thresh,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &init_net.nf_frag.frags.high_thresh
 	},
 	{
 		.procname	= "nf_conntrack_frag6_high_thresh",
 		.data		= &init_net.nf_frag.frags.high_thresh,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &init_net.nf_frag.frags.low_thresh
 	},
 	{ }
 };
@@ -102,7 +107,10 @@ static int nf_ct_frag6_sysctl_register(struct net *net)
 
 		table[0].data = &net->nf_frag.frags.timeout;
 		table[1].data = &net->nf_frag.frags.low_thresh;
+		table[1].extra2 = &net->nf_frag.frags.high_thresh;
 		table[2].data = &net->nf_frag.frags.high_thresh;
+		table[2].extra1 = &net->nf_frag.frags.low_thresh;
+		table[2].extra2 = &init_net.nf_frag.frags.high_thresh;
 	}
 
 	hdr = register_net_sysctl(net, "net/netfilter", table);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 57a9707b2032..f1709c4a289a 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -578,20 +578,25 @@ static const struct inet6_protocol frag_protocol =
 };
 
 #ifdef CONFIG_SYSCTL
+static int zero;
+
 static struct ctl_table ip6_frags_ns_ctl_table[] = {
 	{
 		.procname	= "ip6frag_high_thresh",
 		.data		= &init_net.ipv6.frags.high_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &init_net.ipv6.frags.low_thresh
 	},
 	{
 		.procname	= "ip6frag_low_thresh",
 		.data		= &init_net.ipv6.frags.low_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &init_net.ipv6.frags.high_thresh
 	},
 	{
 		.procname	= "ip6frag_time",
@@ -628,7 +633,10 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net)
 			goto err_alloc;
 
 		table[0].data = &net->ipv6.frags.high_thresh;
+		table[0].extra1 = &net->ipv6.frags.low_thresh;
+		table[0].extra2 = &init_net.ipv6.frags.high_thresh;
 		table[1].data = &net->ipv6.frags.low_thresh;
+		table[1].extra2 = &net->ipv6.frags.high_thresh;
 		table[2].data = &net->ipv6.frags.timeout;
 
 		/* Don't export sysctls to unprivileged users */
-- 
cgit v1.2.3


From 20e61da7ffcfd84a1b6f797e745608572e5bc218 Mon Sep 17 00:00:00 2001
From: WANG Cong
Date: Fri, 25 Jul 2014 15:25:08 -0700
Subject: ipv4: fail early when creating netdev named all or default

We create a proc dir for each network device, this will cause
conflicts when the devices have name "all" or "default".

Rather than emitting an ugly kernel warning, we could just
fail earlier by checking the device name.

Reported-by: Stephane Chazelas <stephane.chazelas@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip.h   |  6 ++++++
 net/ipv4/devinet.c | 36 +++++++++++++++++++++++++++---------
 2 files changed, 33 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip.h b/include/net/ip.h
index ca14799545fd..09b32da1b929 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -216,6 +216,12 @@ static inline int inet_is_local_reserved_port(struct net *net, int port)
 		return 0;
 	return test_bit(port, net->ipv4.sysctl_local_reserved_ports);
 }
+
+static inline bool sysctl_dev_name_is_allowed(const char *name)
+{
+	return strcmp(name, "default") != 0  && strcmp(name, "all") != 0;
+}
+
 #else
 static inline int inet_is_local_reserved_port(struct net *net, int port)
 {
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index e9449376b58e..214882e7d6de 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -180,11 +180,12 @@ static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
 static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
 			 int destroy);
 #ifdef CONFIG_SYSCTL
-static void devinet_sysctl_register(struct in_device *idev);
+static int devinet_sysctl_register(struct in_device *idev);
 static void devinet_sysctl_unregister(struct in_device *idev);
 #else
-static void devinet_sysctl_register(struct in_device *idev)
+static int devinet_sysctl_register(struct in_device *idev)
 {
+	return 0;
 }
 static void devinet_sysctl_unregister(struct in_device *idev)
 {
@@ -232,6 +233,7 @@ EXPORT_SYMBOL(in_dev_finish_destroy);
 static struct in_device *inetdev_init(struct net_device *dev)
 {
 	struct in_device *in_dev;
+	int err = -ENOMEM;
 
 	ASSERT_RTNL();
 
@@ -252,7 +254,13 @@ static struct in_device *inetdev_init(struct net_device *dev)
 	/* Account for reference dev->ip_ptr (below) */
 	in_dev_hold(in_dev);
 
-	devinet_sysctl_register(in_dev);
+	err = devinet_sysctl_register(in_dev);
+	if (err) {
+		in_dev->dead = 1;
+		in_dev_put(in_dev);
+		in_dev = NULL;
+		goto out;
+	}
 	ip_mc_init_dev(in_dev);
 	if (dev->flags & IFF_UP)
 		ip_mc_up(in_dev);
@@ -260,7 +268,7 @@ static struct in_device *inetdev_init(struct net_device *dev)
 	/* we can receive as soon as ip_ptr is set -- do this last */
 	rcu_assign_pointer(dev->ip_ptr, in_dev);
 out:
-	return in_dev;
+	return in_dev ?: ERR_PTR(err);
 out_kfree:
 	kfree(in_dev);
 	in_dev = NULL;
@@ -1347,8 +1355,8 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 	if (!in_dev) {
 		if (event == NETDEV_REGISTER) {
 			in_dev = inetdev_init(dev);
-			if (!in_dev)
-				return notifier_from_errno(-ENOMEM);
+			if (IS_ERR(in_dev))
+				return notifier_from_errno(PTR_ERR(in_dev));
 			if (dev->flags & IFF_LOOPBACK) {
 				IN_DEV_CONF_SET(in_dev, NOXFRM, 1);
 				IN_DEV_CONF_SET(in_dev, NOPOLICY, 1);
@@ -2182,11 +2190,21 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
 	kfree(t);
 }
 
-static void devinet_sysctl_register(struct in_device *idev)
+static int devinet_sysctl_register(struct in_device *idev)
 {
-	neigh_sysctl_register(idev->dev, idev->arp_parms, NULL);
-	__devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
+	int err;
+
+	if (!sysctl_dev_name_is_allowed(idev->dev->name))
+		return -EINVAL;
+
+	err = neigh_sysctl_register(idev->dev, idev->arp_parms, NULL);
+	if (err)
+		return err;
+	err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
 					&idev->cnf);
+	if (err)
+		neigh_sysctl_unregister(idev->arp_parms);
+	return err;
 }
 
 static void devinet_sysctl_unregister(struct in_device *idev)
-- 
cgit v1.2.3


From 5a8dbf03ddf6dfe465b9f062e5b969f9606643f5 Mon Sep 17 00:00:00 2001
From: Himangi Saraogi
Date: Sun, 27 Jul 2014 12:36:51 +0530
Subject: net/ipv4: Use IS_ERR_OR_NULL

This patch introduces the use of the macro IS_ERR_OR_NULL in place of
tests for NULL and IS_ERR.

The following Coccinelle semantic patch was used for making the change:

@@
expression e;
@@

- e == NULL || IS_ERR(e)
+ IS_ERR_OR_NULL(e)
 || ...

Signed-off-by: Himangi Saraogi <himangi774@gmail.com>
Acked-by: Julia Lawall <julia.lawall@lip6.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/gre_offload.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index f0bdd47bbbcb..6556263c8fa5 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -74,7 +74,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 	/* segment inner packet. */
 	enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
 	segs = skb_mac_gso_segment(skb, enc_features);
-	if (!segs || IS_ERR(segs)) {
+	if (IS_ERR_OR_NULL(segs)) {
 		skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);
 		goto out;
 	}
-- 
cgit v1.2.3


From 27446442a810f29d0fa97356bbc11f45e7ecfa6e Mon Sep 17 00:00:00 2001
From: Himangi Saraogi
Date: Sun, 27 Jul 2014 12:38:38 +0530
Subject: net/udp_offload: Use IS_ERR_OR_NULL

This patch introduces the use of the macro IS_ERR_OR_NULL in place of
tests for NULL and IS_ERR.

The following Coccinelle semantic patch was used for making the change:

@@
expression e;
@@

- e == NULL || IS_ERR(e)
+ IS_ERR_OR_NULL(e)
 || ...

Signed-off-by: Himangi Saraogi <himangi774@gmail.com>
Acked-by: Julia Lawall <julia.lawall@lip6.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp_offload.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 4807544d018b..59035bc3008d 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -79,7 +79,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
 	/* segment inner packet. */
 	enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
 	segs = skb_mac_gso_segment(skb, enc_features);
-	if (!segs || IS_ERR(segs)) {
+	if (IS_ERR_OR_NULL(segs)) {
 		skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
 				     mac_len);
 		goto out;
-- 
cgit v1.2.3


From c54a5e024777a2fd4b01239d58cf119901e8e5ed Mon Sep 17 00:00:00 2001
From: Karoly Kemeny
Date: Sun, 27 Jul 2014 12:29:07 +0200
Subject: ipv4: clean up cast warning in do_ip_getsockopt

Sparse warns because of implicit pointer cast.

v2: subject line correction, space between "void" and "*"

Signed-off-by: Karoly Kemeny <karoly.kemeny@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 64741b938632..5cb830c78990 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1319,7 +1319,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 		if (sk->sk_type != SOCK_STREAM)
 			return -ENOPROTOOPT;
 
-		msg.msg_control = optval;
+		msg.msg_control = (__force void *) optval;
 		msg.msg_controllen = len;
 		msg.msg_flags = flags;
 
-- 
cgit v1.2.3


From 95cb5745983c222867cc9ac593aebb2ad67d72c0 Mon Sep 17 00:00:00 2001
From: Dmitry Popov
Date: Tue, 29 Jul 2014 03:07:52 +0400
Subject: ip_tunnel(ipv4): fix tunnels with "local any remote $remote_ip"

Ipv4 tunnels created with "local any remote $ip" didn't work properly since
7d442fab0 (ipv4: Cache dst in tunnels). 99% of packets sent via those tunnels
had src addr = 0.0.0.0. That was because only dst_entry was cached, although
fl4.saddr has to be cached too. Every time ip_tunnel_xmit used cached dst_entry
(tunnel_rtable_get returned non-NULL), fl4.saddr was initialized with
tnl_params->saddr (= 0 in our case), and wasn't changed until iptunnel_xmit().

This patch adds saddr to ip_tunnel->dst_cache, fixing this issue.

Reported-by: Sergey Popov <pinkbyte@gentoo.org>
Signed-off-by: Dmitry Popov <ixaphire@qrator.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h |  1 +
 net/ipv4/ip_tunnel.c     | 29 ++++++++++++++++++-----------
 2 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index a4daf9eb8562..8dd8cab88b87 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -40,6 +40,7 @@ struct ip_tunnel_prl_entry {
 
 struct ip_tunnel_dst {
 	struct dst_entry __rcu 		*dst;
+	__be32				 saddr;
 };
 
 struct ip_tunnel {
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 6f9de61dce5f..45920d928341 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -69,23 +69,25 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
 }
 
 static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
-			     struct dst_entry *dst)
+			     struct dst_entry *dst, __be32 saddr)
 {
 	struct dst_entry *old_dst;
 
 	dst_clone(dst);
 	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
 	dst_release(old_dst);
+	idst->saddr = saddr;
 }
 
-static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
+static void tunnel_dst_set(struct ip_tunnel *t,
+			   struct dst_entry *dst, __be32 saddr)
 {
-	__tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst);
+	__tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst, saddr);
 }
 
 static void tunnel_dst_reset(struct ip_tunnel *t)
 {
-	tunnel_dst_set(t, NULL);
+	tunnel_dst_set(t, NULL, 0);
 }
 
 void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
@@ -93,20 +95,25 @@ void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
 	int i;
 
 	for_each_possible_cpu(i)
-		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
+		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
 }
 EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
 
-static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie)
+static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
+					u32 cookie, __be32 *saddr)
 {
+	struct ip_tunnel_dst *idst;
 	struct dst_entry *dst;
 
 	rcu_read_lock();
-	dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
+	idst = this_cpu_ptr(t->dst_cache);
+	dst = rcu_dereference(idst->dst);
 	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
 		dst = NULL;
 	if (dst) {
-		if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
+			*saddr = idst->saddr;
+		} else {
 			tunnel_dst_reset(t);
 			dst_release(dst);
 			dst = NULL;
@@ -367,7 +374,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 
 		if (!IS_ERR(rt)) {
 			tdev = rt->dst.dev;
-			tunnel_dst_set(tunnel, &rt->dst);
+			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
 			ip_rt_put(rt);
 		}
 		if (dev->type != ARPHRD_ETHER)
@@ -610,7 +617,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
 			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
 
-	rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL;
+	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
 
 	if (!rt) {
 		rt = ip_route_output_key(tunnel->net, &fl4);
@@ -620,7 +627,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 			goto tx_error;
 		}
 		if (connected)
-			tunnel_dst_set(tunnel, &rt->dst);
+			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
 	}
 
 	if (rt->dst.dev == dev) {
-- 
cgit v1.2.3


From 45a07695bc64b3ab5d6d2215f9677e5b8c05a7d0 Mon Sep 17 00:00:00 2001
From: Christoph Paasch
Date: Tue, 29 Jul 2014 12:07:27 +0200
Subject: tcp: Fix integer-overflows in TCP veno

In veno we do a multiplication of the cwnd and the rtt. This
may overflow and thus their result is stored in a u64. However, we first
need to cast the cwnd so that actually 64-bit arithmetic is done.

A first attempt at fixing 76f1017757aa0 ([TCP]: TCP Veno congestion
control) was made by 159131149c2 (tcp: Overflow bug in Vegas), but it
failed to add the required cast in tcp_veno_cong_avoid().

Fixes: 76f1017757aa0 ([TCP]: TCP Veno congestion control)
Signed-off-by: Christoph Paasch <christoph.paasch@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_veno.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 27b9825753d1..8276977d2c85 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -144,7 +144,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 
 		rtt = veno->minrtt;
 
-		target_cwnd = (tp->snd_cwnd * veno->basertt);
+		target_cwnd = (u64)tp->snd_cwnd * veno->basertt;
 		target_cwnd <<= V_PARAM_SHIFT;
 		do_div(target_cwnd, rtt);
 
-- 
cgit v1.2.3


From 1f74e613ded11517db90b2bd57e9464d9e0fb161 Mon Sep 17 00:00:00 2001
From: Christoph Paasch
Date: Tue, 29 Jul 2014 13:40:57 +0200
Subject: tcp: Fix integer-overflow in TCP vegas

In vegas we do a multiplication of the cwnd and the rtt. This
may overflow and thus their result is stored in a u64. However, we first
need to cast the cwnd so that actually 64-bit arithmetic is done.

Then, we need to do do_div to allow this to be used on 32-bit arches.

Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: David Laight <David.Laight@ACULAB.COM>
Cc: Doug Leith <doug.leith@nuim.ie>
Fixes: 8d3a564da34e (tcp: tcp_vegas cong avoid fix)
Signed-off-by: Christoph Paasch <christoph.paasch@uclouvain.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_vegas.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 9a5e05f27f4f..b40ad897f945 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -218,7 +218,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
 			 * This is:
 			 *     (actual rate in segments) * baseRTT
 			 */
-			target_cwnd = tp->snd_cwnd * vegas->baseRTT / rtt;
+			target_cwnd = (u64)tp->snd_cwnd * vegas->baseRTT;
+			do_div(target_cwnd, rtt);
 
 			/* Calculate the difference between the window we had,
 			 * and the window we would like to have. This quantity
-- 
cgit v1.2.3


From 388070faa1e1f6138c1ea0e82271c7ecb3f36477 Mon Sep 17 00:00:00 2001
From: Banerjee, Debabrata
Date: Wed, 30 Jul 2014 13:50:17 -0400
Subject: tcp: don't require root to read tcp_metrics

commit d23ff7016 (tcp: add generic netlink support for tcp_metrics) introduced
netlink support for the new tcp_metrics, however it restricted getting of
tcp_metrics to root user only. This is a change from how these values could
have been fetched when in the old route cache. Unless there's a legitimate
reason to restrict the reading of these values it would be better if normal
users could fetch them.

Cc: Julian Anastasov <ja@ssi.bg>
Cc: linux-kernel@vger.kernel.org

Signed-off-by: Debabrata Banerjee <dbanerje@akamai.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_metrics.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 4fe041805989..0d54e59b9ea8 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1093,7 +1093,6 @@ static const struct genl_ops tcp_metrics_nl_ops[] = {
 		.doit = tcp_metrics_nl_cmd_get,
 		.dumpit = tcp_metrics_nl_dump,
 		.policy = tcp_metrics_nl_policy,
-		.flags = GENL_ADMIN_PERM,
 	},
 	{
 		.cmd = TCP_METRICS_CMD_DEL,
-- 
cgit v1.2.3


From 7304fe4681634a8e0511a5922c972aa132ffb43d Mon Sep 17 00:00:00 2001
From: Duan Jiong
Date: Thu, 31 Jul 2014 17:54:32 +0800
Subject: net: fix the counter ICMP_MIB_INERRORS/ICMP6_MIB_INERRORS

When dealing with ICMPv[46] Error Message, function icmp_socket_deliver()
and icmpv6_notify() do some valid checks on packet's length, but then some
protocols check packet's length redaudantly. So remove those duplicated
statements, and increase counter ICMP_MIB_INERRORS/ICMP6_MIB_INERRORS in
function icmp_socket_deliver() and icmpv6_notify() respectively.

In addition, add missed counter in udp6/udplite6 when socket is NULL.

Signed-off-by: Duan Jiong <duanj.fnst@cn.fujitsu.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/icmp.c     |  4 +++-
 net/ipv4/tcp_ipv4.c |  5 -----
 net/ipv6/icmp.c     | 13 ++++++++-----
 net/ipv6/udp.c      |  8 ++++++--
 net/sctp/input.c    |  5 -----
 5 files changed, 17 insertions(+), 18 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 42b7bcf8045b..092400ef88d0 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -663,8 +663,10 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
 	/* Checkin full IP header plus 8 bytes of protocol to
 	 * avoid additional coding at protocol handlers.
 	 */
-	if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
+	if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) {
+		ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
 		return;
+	}
 
 	raw_icmp_error(skb, protocol, info);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1edc739b9da5..d0ba39537d5c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -344,11 +344,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 	int err;
 	struct net *net = dev_net(icmp_skb->dev);
 
-	if (icmp_skb->len < (iph->ihl << 2) + 8) {
-		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
-		return;
-	}
-
 	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 			iph->saddr, th->source, inet_iif(icmp_skb));
 	if (!sk) {
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index f6c84a6eb238..06ba3e58320b 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -626,9 +626,10 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
 	int inner_offset;
 	__be16 frag_off;
 	u8 nexthdr;
+	struct net *net = dev_net(skb->dev);
 
 	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
-		return;
+		goto out;
 
 	nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr;
 	if (ipv6_ext_hdr(nexthdr)) {
@@ -636,14 +637,14 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
 		inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
 						&nexthdr, &frag_off);
 		if (inner_offset<0)
-			return;
+			goto out;
 	} else {
 		inner_offset = sizeof(struct ipv6hdr);
 	}
 
 	/* Checkin header including 8 bytes of inner protocol header. */
 	if (!pskb_may_pull(skb, inner_offset+8))
-		return;
+		goto out;
 
 	/* BUGGG_FUTURE: we should try to parse exthdrs in this packet.
 	   Without this we will not able f.e. to make source routed
@@ -652,13 +653,15 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
 	   --ANK (980726)
 	 */
 
-	rcu_read_lock();
 	ipprot = rcu_dereference(inet6_protos[nexthdr]);
 	if (ipprot && ipprot->err_handler)
 		ipprot->err_handler(skb, NULL, type, code, inner_offset, info);
-	rcu_read_unlock();
 
 	raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info);
+	return;
+
+out:
+	ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
 }
 
 /*
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5b6091de5868..c6941a1ac977 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -533,11 +533,15 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	struct udphdr *uh = (struct udphdr*)(skb->data+offset);
 	struct sock *sk;
 	int err;
+	struct net *net = dev_net(skb->dev);
 
-	sk = __udp6_lib_lookup(dev_net(skb->dev), daddr, uh->dest,
+	sk = __udp6_lib_lookup(net, daddr, uh->dest,
 			       saddr, uh->source, inet6_iif(skb), udptable);
-	if (sk == NULL)
+	if (sk == NULL) {
+		ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
+				   ICMP6_MIB_INERRORS);
 		return;
+	}
 
 	if (type == ICMPV6_PKT_TOOBIG) {
 		if (!ip6_sk_accept_pmtu(sk))
diff --git a/net/sctp/input.c b/net/sctp/input.c
index f2e2cbd2d750..c1b991294516 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -575,11 +575,6 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
 	int err;
 	struct net *net = dev_net(skb->dev);
 
-	if (skb->len < ihlen + 8) {
-		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
-		return;
-	}
-
 	/* Fix up skb to look at the embedded net header. */
 	saveip = skb->network_header;
 	savesctp = skb->transport_header;
-- 
cgit v1.2.3


From 4330487acfff0cf1d7b14d238583a182e0a444bb Mon Sep 17 00:00:00 2001
From: Duan Jiong
Date: Fri, 1 Aug 2014 09:52:58 +0800
Subject: net: use inet6_iif instead of IP6CB()->iif

Signed-off-by: Duan Jiong <duanj.fnst@cn.fujitsu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ping.c     | 2 +-
 net/ipv6/raw.c      | 8 ++++----
 net/ipv6/udp.c      | 2 +-
 net/l2tp/l2tp_ip6.c | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 044a0ddf6a79..a3c59a077a5f 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -911,7 +911,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 				sin6->sin6_flowinfo = ip6_flowinfo(ip6);
 			sin6->sin6_scope_id =
 				ipv6_iface_scope_id(&sin6->sin6_addr,
-						    IP6CB(skb)->iif);
+						    inet6_iif(skb));
 			*addr_len = sizeof(*sin6);
 		}
 
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index dee80fb1aa86..39d44226e402 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -176,7 +176,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 		goto out;
 
 	net = dev_net(skb->dev);
-	sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr, IP6CB(skb)->iif);
+	sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr, inet6_iif(skb));
 
 	while (sk) {
 		int filtered;
@@ -220,7 +220,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
 			}
 		}
 		sk = __raw_v6_lookup(net, sk_next(sk), nexthdr, daddr, saddr,
-				     IP6CB(skb)->iif);
+				     inet6_iif(skb));
 	}
 out:
 	read_unlock(&raw_v6_hashinfo.lock);
@@ -375,7 +375,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
 		net = dev_net(skb->dev);
 
 		while ((sk = __raw_v6_lookup(net, sk, nexthdr, saddr, daddr,
-						IP6CB(skb)->iif))) {
+						inet6_iif(skb)))) {
 			rawv6_err(sk, skb, NULL, type, code,
 					inner_offset, info);
 			sk = sk_next(sk);
@@ -506,7 +506,7 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk,
 		sin6->sin6_addr = ipv6_hdr(skb)->saddr;
 		sin6->sin6_flowinfo = 0;
 		sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr,
-							  IP6CB(skb)->iif);
+							  inet6_iif(skb));
 		*addr_len = sizeof(*sin6);
 	}
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index c6941a1ac977..4836af8f582d 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -472,7 +472,7 @@ try_again:
 			sin6->sin6_addr = ipv6_hdr(skb)->saddr;
 			sin6->sin6_scope_id =
 				ipv6_iface_scope_id(&sin6->sin6_addr,
-						    IP6CB(skb)->iif);
+						    inet6_iif(skb));
 		}
 		*addr_len = sizeof(*sin6);
 	}
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index f3f98a156cee..0edb263cc002 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -687,7 +687,7 @@ static int l2tp_ip6_recvmsg(struct kiocb *iocb, struct sock *sk,
 		lsa->l2tp_scope_id = 0;
 		lsa->l2tp_conn_id = 0;
 		if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL)
-			lsa->l2tp_scope_id = IP6CB(skb)->iif;
+			lsa->l2tp_scope_id = inet6_iif(skb);
 	}
 
 	if (np->rxopt.all)
-- 
cgit v1.2.3


From 188b1210f3d633e27dc97309c97315ce484122ec Mon Sep 17 00:00:00 2001
From: Duan Jiong
Date: Fri, 1 Aug 2014 14:00:36 +0800
Subject: ipv4: remove nested rcu_read_lock/unlock

ip_local_deliver_finish() already have a rcu_read_lock/unlock, so
the rcu_read_lock/unlock is unnecessary.

See the stack below:
ip_local_deliver_finish
	|
	|
	->icmp_rcv
		|
		|
		->icmp_socket_deliver

Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Duan Jiong <duanj.fnst@cn.fujitsu.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/icmp.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 092400ef88d0..ea7d4afe8205 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -670,11 +670,9 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
 
 	raw_icmp_error(skb, protocol, info);
 
-	rcu_read_lock();
 	ipprot = rcu_dereference(inet_protos[protocol]);
 	if (ipprot && ipprot->err_handler)
 		ipprot->err_handler(skb, info);
-	rcu_read_unlock();
 }
 
 static bool icmp_tag_validation(int proto)
-- 
cgit v1.2.3


From 06aa8b8a0345c78f4d9a1fb3f852952b12a0e40c Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov
Date: Fri, 1 Aug 2014 12:29:44 +0200
Subject: inet: frags: rename last_in to flags

The last_in field has been used to store various flags different from
first/last frag in so give it a more descriptive name: flags.

Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h                 |  2 +-
 net/ieee802154/reassembly.c             | 14 +++++++-------
 net/ipv4/inet_fragment.c                | 14 +++++++-------
 net/ipv4/ip_fragment.c                  | 20 ++++++++++----------
 net/ipv6/netfilter/nf_conntrack_reasm.c | 12 ++++++------
 net/ipv6/reassembly.c                   | 18 +++++++++---------
 6 files changed, 40 insertions(+), 40 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 6f4930a0b660..5024d6c20407 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -25,7 +25,7 @@ struct inet_frag_queue {
 	ktime_t			stamp;
 	int			len;        /* total length of orig datagram */
 	int			meat;
-	__u8			last_in;    /* first/last segment arrived? */
+	__u8			flags;    /* first/last segment arrived? */
 
 #define INET_FRAG_EVICTED	8
 #define INET_FRAG_COMPLETE	4
diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c
index f13d4f32e207..5607accd2fee 100644
--- a/net/ieee802154/reassembly.c
+++ b/net/ieee802154/reassembly.c
@@ -99,7 +99,7 @@ static void lowpan_frag_expire(unsigned long data)
 
 	spin_lock(&fq->q.lock);
 
-	if (fq->q.last_in & INET_FRAG_COMPLETE)
+	if (fq->q.flags & INET_FRAG_COMPLETE)
 		goto out;
 
 	inet_frag_kill(&fq->q, &lowpan_frags);
@@ -142,7 +142,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq,
 	struct net_device *dev;
 	int end, offset;
 
-	if (fq->q.last_in & INET_FRAG_COMPLETE)
+	if (fq->q.flags & INET_FRAG_COMPLETE)
 		goto err;
 
 	offset = lowpan_cb(skb)->d_offset << 3;
@@ -154,14 +154,14 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq,
 		 * or have different end, the segment is corrupted.
 		 */
 		if (end < fq->q.len ||
-		    ((fq->q.last_in & INET_FRAG_LAST_IN) && end != fq->q.len))
+		    ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len))
 			goto err;
-		fq->q.last_in |= INET_FRAG_LAST_IN;
+		fq->q.flags |= INET_FRAG_LAST_IN;
 		fq->q.len = end;
 	} else {
 		if (end > fq->q.len) {
 			/* Some bits beyond end -> corruption. */
-			if (fq->q.last_in & INET_FRAG_LAST_IN)
+			if (fq->q.flags & INET_FRAG_LAST_IN)
 				goto err;
 			fq->q.len = end;
 		}
@@ -201,13 +201,13 @@ found:
 	if (frag_type == LOWPAN_DISPATCH_FRAG1) {
 		/* Calculate uncomp. 6lowpan header to estimate full size */
 		fq->q.meat += lowpan_uncompress_size(skb, NULL);
-		fq->q.last_in |= INET_FRAG_FIRST_IN;
+		fq->q.flags |= INET_FRAG_FIRST_IN;
 	} else {
 		fq->q.meat += skb->len;
 	}
 	add_frag_mem_limit(&fq->q, skb->truesize);
 
-	if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+	if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
 	    fq->q.meat == fq->q.len) {
 		int res;
 		unsigned long orefdst = skb->_skb_refdst;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 62b1f73749dc..e3ebc6608e5d 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -152,8 +152,8 @@ evict_again:
 		}
 
 		/* suppress xmit of (icmp) error packet */
-		fq->last_in &= ~INET_FRAG_FIRST_IN;
-		fq->last_in |= INET_FRAG_EVICTED;
+		fq->flags &= ~INET_FRAG_FIRST_IN;
+		fq->flags |= INET_FRAG_EVICTED;
 		hlist_del(&fq->list);
 		hlist_add_head(&fq->list, &expired);
 		++evicted;
@@ -289,16 +289,16 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
 	if (del_timer(&fq->timer))
 		atomic_dec(&fq->refcnt);
 
-	if (!(fq->last_in & INET_FRAG_COMPLETE)) {
+	if (!(fq->flags & INET_FRAG_COMPLETE)) {
 		fq_unlink(fq, f);
 		atomic_dec(&fq->refcnt);
-		fq->last_in |= INET_FRAG_COMPLETE;
+		fq->flags |= INET_FRAG_COMPLETE;
 	}
 }
 EXPORT_SYMBOL(inet_frag_kill);
 
 static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
-		struct sk_buff *skb)
+				  struct sk_buff *skb)
 {
 	if (f->skb_free)
 		f->skb_free(skb);
@@ -311,7 +311,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
 	struct netns_frags *nf;
 	unsigned int sum, sum_truesize = 0;
 
-	WARN_ON(!(q->last_in & INET_FRAG_COMPLETE));
+	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
 	WARN_ON(del_timer(&q->timer) != 0);
 
 	/* Release all fragment data. */
@@ -349,7 +349,7 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 		if (qp->net == nf && f->match(qp, arg)) {
 			atomic_inc(&qp->refcnt);
 			spin_unlock(&hb->chain_lock);
-			qp_in->last_in |= INET_FRAG_COMPLETE;
+			qp_in->flags |= INET_FRAG_COMPLETE;
 			inet_frag_put(qp_in, f);
 			return qp;
 		}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 634fc31aa243..6fce1ecc5bca 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -185,16 +185,16 @@ static void ip_expire(unsigned long arg)
 
 	spin_lock(&qp->q.lock);
 
-	if (qp->q.last_in & INET_FRAG_COMPLETE)
+	if (qp->q.flags & INET_FRAG_COMPLETE)
 		goto out;
 
 	ipq_kill(qp);
 
-	if (!(qp->q.last_in & INET_FRAG_EVICTED))
+	if (!(qp->q.flags & INET_FRAG_EVICTED))
 		IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
 
-	if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
+	if ((qp->q.flags & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
 		struct sk_buff *head = qp->q.fragments;
 		const struct iphdr *iph;
 		int err;
@@ -302,7 +302,7 @@ static int ip_frag_reinit(struct ipq *qp)
 	} while (fp);
 	sub_frag_mem_limit(&qp->q, sum_truesize);
 
-	qp->q.last_in = 0;
+	qp->q.flags = 0;
 	qp->q.len = 0;
 	qp->q.meat = 0;
 	qp->q.fragments = NULL;
@@ -323,7 +323,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	int err = -ENOENT;
 	u8 ecn;
 
-	if (qp->q.last_in & INET_FRAG_COMPLETE)
+	if (qp->q.flags & INET_FRAG_COMPLETE)
 		goto err;
 
 	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
@@ -350,9 +350,9 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 		 * or have different end, the segment is corrupted.
 		 */
 		if (end < qp->q.len ||
-		    ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len))
+		    ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
 			goto err;
-		qp->q.last_in |= INET_FRAG_LAST_IN;
+		qp->q.flags |= INET_FRAG_LAST_IN;
 		qp->q.len = end;
 	} else {
 		if (end&7) {
@@ -362,7 +362,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 		}
 		if (end > qp->q.len) {
 			/* Some bits beyond end -> corruption. */
-			if (qp->q.last_in & INET_FRAG_LAST_IN)
+			if (qp->q.flags & INET_FRAG_LAST_IN)
 				goto err;
 			qp->q.len = end;
 		}
@@ -471,13 +471,13 @@ found:
 	qp->ecn |= ecn;
 	add_frag_mem_limit(&qp->q, skb->truesize);
 	if (offset == 0)
-		qp->q.last_in |= INET_FRAG_FIRST_IN;
+		qp->q.flags |= INET_FRAG_FIRST_IN;
 
 	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
 	    skb->len + ihl > qp->q.max_size)
 		qp->q.max_size = skb->len + ihl;
 
-	if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+	if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
 	    qp->q.meat == qp->q.len) {
 		unsigned long orefdst = skb->_skb_refdst;
 
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 3d4bccf6d67d..cca686e42b97 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -222,7 +222,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 	int offset, end;
 	u8 ecn;
 
-	if (fq->q.last_in & INET_FRAG_COMPLETE) {
+	if (fq->q.flags & INET_FRAG_COMPLETE) {
 		pr_debug("Already completed\n");
 		goto err;
 	}
@@ -253,11 +253,11 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 		 * or have different end, the segment is corrupted.
 		 */
 		if (end < fq->q.len ||
-		    ((fq->q.last_in & INET_FRAG_LAST_IN) && end != fq->q.len)) {
+		    ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) {
 			pr_debug("already received last fragment\n");
 			goto err;
 		}
-		fq->q.last_in |= INET_FRAG_LAST_IN;
+		fq->q.flags |= INET_FRAG_LAST_IN;
 		fq->q.len = end;
 	} else {
 		/* Check if the fragment is rounded to 8 bytes.
@@ -272,7 +272,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 		}
 		if (end > fq->q.len) {
 			/* Some bits beyond end -> corruption. */
-			if (fq->q.last_in & INET_FRAG_LAST_IN) {
+			if (fq->q.flags & INET_FRAG_LAST_IN) {
 				pr_debug("last packet already reached.\n");
 				goto err;
 			}
@@ -354,7 +354,7 @@ found:
 	 */
 	if (offset == 0) {
 		fq->nhoffset = nhoff;
-		fq->q.last_in |= INET_FRAG_FIRST_IN;
+		fq->q.flags |= INET_FRAG_FIRST_IN;
 	}
 
 	return 0;
@@ -617,7 +617,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb, u32 user)
 		goto ret_orig;
 	}
 
-	if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+	if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
 	    fq->q.meat == fq->q.len) {
 		ret_skb = nf_ct_frag6_reasm(fq, dev);
 		if (ret_skb == NULL)
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 512ccc027ce3..b4baceed0d0d 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -131,7 +131,7 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
 
 	spin_lock(&fq->q.lock);
 
-	if (fq->q.last_in & INET_FRAG_COMPLETE)
+	if (fq->q.flags & INET_FRAG_COMPLETE)
 		goto out;
 
 	inet_frag_kill(&fq->q, frags);
@@ -141,13 +141,13 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
 	if (!dev)
 		goto out_rcu_unlock;
 
-	if (!(fq->q.last_in & INET_FRAG_EVICTED))
+	if (!(fq->q.flags & INET_FRAG_EVICTED))
 		IP6_INC_STATS_BH(net, __in6_dev_get(dev),
 				 IPSTATS_MIB_REASMTIMEOUT);
 	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
 
 	/* Don't send error if the first segment did not arrive. */
-	if (!(fq->q.last_in & INET_FRAG_FIRST_IN) || !fq->q.fragments)
+	if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !fq->q.fragments)
 		goto out_rcu_unlock;
 
 	/*
@@ -209,7 +209,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 	struct net *net = dev_net(skb_dst(skb)->dev);
 	u8 ecn;
 
-	if (fq->q.last_in & INET_FRAG_COMPLETE)
+	if (fq->q.flags & INET_FRAG_COMPLETE)
 		goto err;
 
 	offset = ntohs(fhdr->frag_off) & ~0x7;
@@ -240,9 +240,9 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 		 * or have different end, the segment is corrupted.
 		 */
 		if (end < fq->q.len ||
-		    ((fq->q.last_in & INET_FRAG_LAST_IN) && end != fq->q.len))
+		    ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len))
 			goto err;
-		fq->q.last_in |= INET_FRAG_LAST_IN;
+		fq->q.flags |= INET_FRAG_LAST_IN;
 		fq->q.len = end;
 	} else {
 		/* Check if the fragment is rounded to 8 bytes.
@@ -260,7 +260,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 		}
 		if (end > fq->q.len) {
 			/* Some bits beyond end -> corruption. */
-			if (fq->q.last_in & INET_FRAG_LAST_IN)
+			if (fq->q.flags & INET_FRAG_LAST_IN)
 				goto err;
 			fq->q.len = end;
 		}
@@ -335,10 +335,10 @@ found:
 	 */
 	if (offset == 0) {
 		fq->nhoffset = nhoff;
-		fq->q.last_in |= INET_FRAG_FIRST_IN;
+		fq->q.flags |= INET_FRAG_FIRST_IN;
 	}
 
-	if (fq->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+	if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
 	    fq->q.meat == fq->q.len) {
 		int res;
 		unsigned long orefdst = skb->_skb_refdst;
-- 
cgit v1.2.3


From f926e23660d52601089222cb4755aabc693ca390 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov
Date: Fri, 1 Aug 2014 12:29:46 +0200
Subject: inet: frags: fix function declaration alignments in inet_fragment

Fix a couple of functions' declaration alignments.

Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_fragment.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index e3ebc6608e5d..fa49916c23a0 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -334,8 +334,9 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
 EXPORT_SYMBOL(inet_frag_destroy);
 
 static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
-		struct inet_frag_queue *qp_in, struct inet_frags *f,
-		void *arg)
+						struct inet_frag_queue *qp_in,
+						struct inet_frags *f,
+						void *arg)
 {
 	struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
 	struct inet_frag_queue *qp;
@@ -368,7 +369,8 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 }
 
 static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
-		struct inet_frags *f, void *arg)
+					       struct inet_frags *f,
+					       void *arg)
 {
 	struct inet_frag_queue *q;
 
@@ -393,7 +395,8 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 }
 
 static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
-		struct inet_frags *f, void *arg)
+						struct inet_frags *f,
+						void *arg)
 {
 	struct inet_frag_queue *q;
 
@@ -405,7 +408,8 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
 }
 
 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
-		struct inet_frags *f, void *key, unsigned int hash)
+				       struct inet_frags *f, void *key,
+				       unsigned int hash)
 {
 	struct inet_frag_bucket *hb;
 	struct inet_frag_queue *q;
-- 
cgit v1.2.3


From 2e404f632f44979ddf0ce0808a438249a72d7015 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov
Date: Fri, 1 Aug 2014 12:29:47 +0200
Subject: inet: frags: use INET_FRAG_EVICTED to prevent icmp messages

Now that we have INET_FRAG_EVICTED we might as well use it to stop
sending icmp messages in the "frag_expire" functions instead of
stripping INET_FRAG_FIRST_IN from their flags when evicting.
Also fix the comment style in ip6_expire_frag_queue().

Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_fragment.c |  2 --
 net/ipv4/ip_fragment.c   | 14 +++++++-------
 net/ipv6/reassembly.c    | 15 ++++++++-------
 3 files changed, 15 insertions(+), 16 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index fa49916c23a0..4baa76c60398 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -151,8 +151,6 @@ evict_again:
 			goto evict_again;
 		}
 
-		/* suppress xmit of (icmp) error packet */
-		fq->flags &= ~INET_FRAG_FIRST_IN;
 		fq->flags |= INET_FRAG_EVICTED;
 		hlist_del(&fq->list);
 		hlist_add_head(&fq->list, &expired);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 6fce1ecc5bca..cb56bcc1eee2 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -189,16 +189,18 @@ static void ip_expire(unsigned long arg)
 		goto out;
 
 	ipq_kill(qp);
-
-	if (!(qp->q.flags & INET_FRAG_EVICTED))
-		IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
 	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
 
-	if ((qp->q.flags & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
+	if (!(qp->q.flags & INET_FRAG_EVICTED)) {
 		struct sk_buff *head = qp->q.fragments;
 		const struct iphdr *iph;
 		int err;
 
+		IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+
+		if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
+			goto out;
+
 		rcu_read_lock();
 		head->dev = dev_get_by_index_rcu(net, qp->iif);
 		if (!head->dev)
@@ -211,8 +213,7 @@ static void ip_expire(unsigned long arg)
 		if (err)
 			goto out_rcu_unlock;
 
-		/*
-		 * Only an end host needs to send an ICMP
+		/* Only an end host needs to send an ICMP
 		 * "Fragment Reassembly Timeout" message, per RFC792.
 		 */
 		if (qp->user == IP_DEFRAG_AF_PACKET ||
@@ -221,7 +222,6 @@ static void ip_expire(unsigned long arg)
 		     (skb_rtable(head)->rt_type != RTN_LOCAL)))
 			goto out_rcu_unlock;
 
-
 		/* Send an ICMP "Fragment Reassembly Timeout" message. */
 		icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
 out_rcu_unlock:
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index b4baceed0d0d..beb6872a8fa5 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -141,19 +141,20 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
 	if (!dev)
 		goto out_rcu_unlock;
 
-	if (!(fq->q.flags & INET_FRAG_EVICTED))
-		IP6_INC_STATS_BH(net, __in6_dev_get(dev),
-				 IPSTATS_MIB_REASMTIMEOUT);
 	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
 
+	if (fq->q.flags & INET_FRAG_EVICTED)
+		goto out_rcu_unlock;
+
+	IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
+
 	/* Don't send error if the first segment did not arrive. */
 	if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !fq->q.fragments)
 		goto out_rcu_unlock;
 
-	/*
-	   But use as source device on which LAST ARRIVED
-	   segment was received. And do not use fq->dev
-	   pointer directly, device might already disappeared.
+	/* But use as source device on which LAST ARRIVED
+	 * segment was received. And do not use fq->dev
+	 * pointer directly, device might already disappeared.
 	 */
 	fq->q.fragments->dev = dev;
 	icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
-- 
cgit v1.2.3


From d4ad4d22e7ac6b8711b35d7e86eb29f03f8ac153 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov
Date: Fri, 1 Aug 2014 12:29:48 +0200
Subject: inet: frags: use kmem_cache for inet_frag_queue

Use kmem_cache to allocate/free inet_frag_queue objects since they're
all the same size per inet_frags user and are alloced/freed in high volumes
thus making it a perfect case for kmem_cache.

Signed-off-by: Nikolay Aleksandrov <nikolay@redhat.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_frag.h                 |  4 +++-
 net/ieee802154/reassembly.c             |  7 ++++++-
 net/ipv4/inet_fragment.c                | 13 ++++++++++---
 net/ipv4/ip_fragment.c                  |  5 ++++-
 net/ipv6/netfilter/nf_conntrack_reasm.c |  8 ++++++--
 net/ipv6/reassembly.c                   |  7 ++++++-
 6 files changed, 35 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 90015c47b447..65a8855e99fe 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -101,9 +101,11 @@ struct inet_frags {
 	void			(*destructor)(struct inet_frag_queue *);
 	void			(*skb_free)(struct sk_buff *);
 	void			(*frag_expire)(unsigned long data);
+	struct kmem_cache	*frags_cachep;
+	const char		*frags_cache_name;
 };
 
-void inet_frags_init(struct inet_frags *);
+int inet_frags_init(struct inet_frags *);
 void inet_frags_fini(struct inet_frags *);
 
 void inet_frags_init_net(struct netns_frags *nf);
diff --git a/net/ieee802154/reassembly.c b/net/ieee802154/reassembly.c
index 5607accd2fee..ffec6ce51005 100644
--- a/net/ieee802154/reassembly.c
+++ b/net/ieee802154/reassembly.c
@@ -30,6 +30,8 @@
 
 #include "reassembly.h"
 
+static const char lowpan_frags_cache_name[] = "lowpan-frags";
+
 struct lowpan_frag_info {
 	__be16 d_tag;
 	u16 d_size;
@@ -571,7 +573,10 @@ int __init lowpan_net_frag_init(void)
 	lowpan_frags.qsize = sizeof(struct frag_queue);
 	lowpan_frags.match = lowpan_frag_match;
 	lowpan_frags.frag_expire = lowpan_frag_expire;
-	inet_frags_init(&lowpan_frags);
+	lowpan_frags.frags_cache_name = lowpan_frags_cache_name;
+	ret = inet_frags_init(&lowpan_frags);
+	if (ret)
+		goto err_pernet;
 
 	return ret;
 err_pernet:
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 4baa76c60398..9eb89f3f0ee4 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -198,7 +198,7 @@ static void inet_frag_schedule_worker(struct inet_frags *f)
 		schedule_work(&f->frags_work);
 }
 
-void inet_frags_init(struct inet_frags *f)
+int inet_frags_init(struct inet_frags *f)
 {
 	int i;
 
@@ -213,6 +213,12 @@ void inet_frags_init(struct inet_frags *f)
 
 	seqlock_init(&f->rnd_seqlock);
 	f->last_rebuild_jiffies = 0;
+	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
+					    NULL);
+	if (!f->frags_cachep)
+		return -ENOMEM;
+
+	return 0;
 }
 EXPORT_SYMBOL(inet_frags_init);
 
@@ -225,6 +231,7 @@ EXPORT_SYMBOL(inet_frags_init_net);
 void inet_frags_fini(struct inet_frags *f)
 {
 	cancel_work_sync(&f->frags_work);
+	kmem_cache_destroy(f->frags_cachep);
 }
 EXPORT_SYMBOL(inet_frags_fini);
 
@@ -327,7 +334,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
 
 	if (f->destructor)
 		f->destructor(q);
-	kfree(q);
+	kmem_cache_free(f->frags_cachep, q);
 }
 EXPORT_SYMBOL(inet_frag_destroy);
 
@@ -377,7 +384,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 		return NULL;
 	}
 
-	q = kzalloc(f->qsize, GFP_ATOMIC);
+	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
 	if (q == NULL)
 		return NULL;
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index cb56bcc1eee2..15f0e2bad7ad 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -55,6 +55,7 @@
  */
 
 static int sysctl_ipfrag_max_dist __read_mostly = 64;
+static const char ip_frag_cache_name[] = "ip4-frags";
 
 struct ipfrag_skb_cb
 {
@@ -860,5 +861,7 @@ void __init ipfrag_init(void)
 	ip4_frags.qsize = sizeof(struct ipq);
 	ip4_frags.match = ip4_frag_match;
 	ip4_frags.frag_expire = ip_expire;
-	inet_frags_init(&ip4_frags);
+	ip4_frags.frags_cache_name = ip_frag_cache_name;
+	if (inet_frags_init(&ip4_frags))
+		panic("IP: failed to allocate ip4_frags cache\n");
 }
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index cca686e42b97..6f187c8d8a1b 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -50,6 +50,7 @@
 #include <linux/module.h>
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 
+static const char nf_frags_cache_name[] = "nf-frags";
 
 struct nf_ct_frag6_skb_cb
 {
@@ -677,12 +678,15 @@ int nf_ct_frag6_init(void)
 	nf_frags.qsize = sizeof(struct frag_queue);
 	nf_frags.match = ip6_frag_match;
 	nf_frags.frag_expire = nf_ct_frag6_expire;
-	inet_frags_init(&nf_frags);
-
+	nf_frags.frags_cache_name = nf_frags_cache_name;
+	ret = inet_frags_init(&nf_frags);
+	if (ret)
+		goto out;
 	ret = register_pernet_subsys(&nf_ct_net_ops);
 	if (ret)
 		inet_frags_fini(&nf_frags);
 
+out:
 	return ret;
 }
 
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index beb6872a8fa5..c6557d9f7808 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -60,6 +60,8 @@
 #include <net/inet_frag.h>
 #include <net/inet_ecn.h>
 
+static const char ip6_frag_cache_name[] = "ip6-frags";
+
 struct ip6frag_skb_cb
 {
 	struct inet6_skb_parm	h;
@@ -748,7 +750,10 @@ int __init ipv6_frag_init(void)
 	ip6_frags.qsize = sizeof(struct frag_queue);
 	ip6_frags.match = ip6_frag_match;
 	ip6_frags.frag_expire = ip6_frag_expire;
-	inet_frags_init(&ip6_frags);
+	ip6_frags.frags_cache_name = ip6_frag_cache_name;
+	ret = inet_frags_init(&ip6_frags);
+	if (ret)
+		goto err_pernet;
 out:
 	return ret;
 
-- 
cgit v1.2.3


From 64a124edcc94011ce459f0b9bdf51e1783146712 Mon Sep 17 00:00:00 2001
From: Dmitry Popov
Date: Sun, 3 Aug 2014 22:45:19 +0400
Subject: tcp: md5: remove unneeded check in tcp_v4_parse_md5_keys

tcpm_key is an array inside struct tcp_md5sig, there is no need to check it
against NULL.

Signed-off-by: Dmitry Popov <ixaphire@qrator.net>
Acked-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d0ba39537d5c..992a1f926009 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1049,7 +1049,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 	if (sin->sin_family != AF_INET)
 		return -EINVAL;
 
-	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
+	if (!cmd.tcpm_keylen)
 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
 				      AF_INET);
 
-- 
cgit v1.2.3


From 5ae344c949e79b8545a11db149f0a85a6e59e1f3 Mon Sep 17 00:00:00 2001
From: Neal Cardwell
Date: Mon, 4 Aug 2014 19:12:29 -0400
Subject: tcp: reduce spurious retransmits due to transient SACK reneging

This commit reduces spurious retransmits due to apparent SACK reneging
by only reacting to SACK reneging that persists for a short delay.

When a sequence space hole at snd_una is filled, some TCP receivers
send a series of ACKs as they apparently scan their out-of-order queue
and cumulatively ACK all the packets that have now been consecutiveyly
received. This is essentially misbehavior B in "Misbehaviors in TCP
SACK generation" ACM SIGCOMM Computer Communication Review, April
2011, so we suspect that this is from several common OSes (Windows
2000, Windows Server 2003, Windows XP). However, this issue has also
been seen in other cases, e.g. the netdev thread "TCP being hoodwinked
into spurious retransmissions by lack of timestamps?" from March 2014,
where the receiver was thought to be a BSD box.

Since snd_una would temporarily be adjacent to a previously SACKed
range in these scenarios, this receiver behavior triggered the Linux
SACK reneging code path in the sender. This led the sender to clear
the SACK scoreboard, enter CA_Loss, and spuriously retransmit
(potentially) every packet from the entire write queue at line rate
just a few milliseconds before the ACK for each packet arrives at the
sender.

To avoid such situations, now when a sender sees apparent reneging it
does not yet retransmit, but rather adjusts the RTO timer to give the
receiver a little time (max(RTT/2, 10ms)) to send us some more ACKs
that will restore sanity to the SACK scoreboard. If the reneging
persists until this RTO then, as before, we clear the SACK scoreboard
and enter CA_Loss.

A 10ms delay tolerates a receiver sending such a stream of ACKs at
56Kbit/sec. And to allow for receivers with slower or more congested
paths, we wait for at least RTT/2.

We validated the resulting max(RTT/2, 10ms) delay formula with a mix
of North American and South American Google web server traffic, and
found that for ACKs displaying transient reneging:

 (1) 90% of inter-ACK delays were less than 10ms
 (2) 99% of inter-ACK delays were less than RTT/2

In tests on Google web servers this commit reduced reneging events by
75%-90% (as measured by the TcpExtTCPSACKReneging counter), without
any measurable impact on latency for user HTTP and SPDY requests.

Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    |  2 +-
 net/ipv4/tcp_input.c | 29 ++++++++++++++++++-----------
 net/ipv4/tcp_timer.c |  4 ++--
 3 files changed, 21 insertions(+), 14 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0aeb2eb749dc..dafa1cbc149b 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -411,7 +411,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 			   bool fastopen);
 int tcp_child_process(struct sock *parent, struct sock *child,
 		      struct sk_buff *skb);
-void tcp_enter_loss(struct sock *sk, int how);
+void tcp_enter_loss(struct sock *sk);
 void tcp_clear_retrans(struct tcp_sock *tp);
 void tcp_update_metrics(struct sock *sk);
 void tcp_init_metrics(struct sock *sk);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7832d941dbcd..6a2984507755 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1904,16 +1904,17 @@ void tcp_clear_retrans(struct tcp_sock *tp)
 	tp->sacked_out = 0;
 }
 
-/* Enter Loss state. If "how" is not zero, forget all SACK information
+/* Enter Loss state. If we detect SACK reneging, forget all SACK information
  * and reset tags completely, otherwise preserve SACKs. If receiver
  * dropped its ofo queue, we will know this due to reneging detection.
  */
-void tcp_enter_loss(struct sock *sk, int how)
+void tcp_enter_loss(struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	bool new_recovery = false;
+	bool is_reneg;			/* is receiver reneging on SACKs? */
 
 	/* Reduce ssthresh if it has not yet been made inside this window. */
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1934,7 +1935,11 @@ void tcp_enter_loss(struct sock *sk, int how)
 		tcp_reset_reno_sack(tp);
 
 	tp->undo_marker = tp->snd_una;
-	if (how) {
+
+	skb = tcp_write_queue_head(sk);
+	is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
+	if (is_reneg) {
+		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
 		tp->sacked_out = 0;
 		tp->fackets_out = 0;
 	}
@@ -1948,7 +1953,7 @@ void tcp_enter_loss(struct sock *sk, int how)
 			tp->undo_marker = 0;
 
 		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
-		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
+		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
 			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 			tp->lost_out += tcp_skb_pcount(skb);
@@ -1981,19 +1986,21 @@ void tcp_enter_loss(struct sock *sk, int how)
  * remembered SACKs do not reflect real state of receiver i.e.
  * receiver _host_ is heavily congested (or buggy).
  *
- * Do processing similar to RTO timeout.
+ * To avoid big spurious retransmission bursts due to transient SACK
+ * scoreboard oddities that look like reneging, we give the receiver a
+ * little time (max(RTT/2, 10ms)) to send us some more ACKs that will
+ * restore sanity to the SACK scoreboard. If the apparent reneging
+ * persists until this RTO then we'll clear the SACK scoreboard.
  */
 static bool tcp_check_sack_reneging(struct sock *sk, int flag)
 {
 	if (flag & FLAG_SACK_RENEGING) {
-		struct inet_connection_sock *icsk = inet_csk(sk);
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+		struct tcp_sock *tp = tcp_sk(sk);
+		unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
+					  msecs_to_jiffies(10));
 
-		tcp_enter_loss(sk, 1);
-		icsk->icsk_retransmits++;
-		tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
 		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-					  icsk->icsk_rto, TCP_RTO_MAX);
+					  delay, TCP_RTO_MAX);
 		return true;
 	}
 	return false;
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 286227abed10..df90cd1ce37f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -391,7 +391,7 @@ void tcp_retransmit_timer(struct sock *sk)
 			tcp_write_err(sk);
 			goto out;
 		}
-		tcp_enter_loss(sk, 0);
+		tcp_enter_loss(sk);
 		tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
 		__sk_dst_reset(sk);
 		goto out_reset_timer;
@@ -422,7 +422,7 @@ void tcp_retransmit_timer(struct sock *sk)
 		NET_INC_STATS_BH(sock_net(sk), mib_idx);
 	}
 
-	tcp_enter_loss(sk, 0);
+	tcp_enter_loss(sk);
 
 	if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
 		/* Retransmission failed because of local congestion,
-- 
cgit v1.2.3


From 09c2d251b70723650ba47e83571ff49281320f7c Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Mon, 4 Aug 2014 22:11:47 -0400
Subject: net-timestamp: add key to disambiguate concurrent datagrams

Datagrams timestamped on transmission can coexist in the kernel stack
and be reordered in packet scheduling. When reading looped datagrams
from the socket error queue it is not always possible to unique
correlate looped data with original send() call (for application
level retransmits). Even if possible, it may be expensive and complex,
requiring packet inspection.

Introduce a data-independent ID mechanism to associate timestamps with
send calls. Pass an ID alongside the timestamp in field ee_data of
sock_extended_err.

The ID is a simple 32 bit unsigned int that is associated with the
socket and incremented on each send() call for which software tx
timestamp generation is enabled.

The feature is enabled only if SOF_TIMESTAMPING_OPT_ID is set, to
avoid changing ee_data for existing applications that expect it 0.
The counter is reset each time the flag is reenabled. Reenabling
does not change the ID of already submitted data. It is possible
to receive out of order IDs if the timestamp stream is not quiesced
first.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h          | 1 +
 include/net/sock.h              | 2 ++
 include/uapi/linux/net_tstamp.h | 8 +++++---
 net/core/skbuff.c               | 2 ++
 net/core/sock.c                 | 3 +++
 net/ipv4/ip_output.c            | 6 ++++++
 net/ipv6/ip6_output.c           | 9 ++++++++-
 7 files changed, 27 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 477f0f60db45..0e35b3af7317 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -278,6 +278,7 @@ struct skb_shared_info {
 	unsigned short  gso_type;
 	struct sk_buff	*frag_list;
 	struct skb_shared_hwtstamps hwtstamps;
+	u32		tskey;
 	__be32          ip6_frag_id;
 
 	/*
diff --git a/include/net/sock.h b/include/net/sock.h
index a21129716aae..52fe0bc5598a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -280,6 +280,7 @@ struct cg_proto;
   *	@sk_timer: sock cleanup timer
   *	@sk_stamp: time stamp of last packet received
   *	@sk_tsflags: SO_TIMESTAMPING socket options
+  *	@sk_tskey: counter to disambiguate concurrent tstamp requests
   *	@sk_socket: Identd and reporting IO signals
   *	@sk_user_data: RPC layer private data
   *	@sk_frag: cached page frag
@@ -414,6 +415,7 @@ struct sock {
 	struct timer_list	sk_timer;
 	ktime_t			sk_stamp;
 	u16			sk_tsflags;
+	u32			sk_tskey;
 	struct socket		*sk_socket;
 	void			*sk_user_data;
 	struct page_frag	sk_frag;
diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index f53879c0f590..1e861d2e1a31 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -20,9 +20,11 @@ enum {
 	SOF_TIMESTAMPING_SOFTWARE = (1<<4),
 	SOF_TIMESTAMPING_SYS_HARDWARE = (1<<5),
 	SOF_TIMESTAMPING_RAW_HARDWARE = (1<<6),
-	SOF_TIMESTAMPING_MASK =
-	(SOF_TIMESTAMPING_RAW_HARDWARE - 1) |
-	SOF_TIMESTAMPING_RAW_HARDWARE
+	SOF_TIMESTAMPING_OPT_ID = (1<<7),
+
+	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_ID,
+	SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) |
+				 SOF_TIMESTAMPING_LAST
 };
 
 /**
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c9f68802e992..0df4f1d14c5a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3522,6 +3522,8 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
 	serr->ee.ee_errno = ENOMSG;
 	serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
 	serr->ee.ee_info = SCM_TSTAMP_SND;
+	if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
+		serr->ee.ee_data = skb_shinfo(skb)->tskey;
 
 	err = sock_queue_err_skb(sk, skb);
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 47c9377e14b9..1e0f1c63ad6b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -848,6 +848,9 @@ set_rcvbuf:
 			ret = -EINVAL;
 			break;
 		}
+		if (val & SOF_TIMESTAMPING_OPT_ID &&
+		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID))
+			sk->sk_tskey = 0;
 		sk->sk_tsflags = val;
 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 			sock_enable_timestamp(sk,
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index b16556836d66..215af2b155cb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -855,11 +855,15 @@ static int __ip_append_data(struct sock *sk,
 	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
 	int csummode = CHECKSUM_NONE;
 	struct rtable *rt = (struct rtable *)cork->dst;
+	u32 tskey = 0;
 
 	skb = skb_peek_tail(queue);
 
 	exthdrlen = !skb ? rt->dst.header_len : 0;
 	mtu = cork->fragsize;
+	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
+	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
+		tskey = sk->sk_tskey++;
 
 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
 
@@ -976,6 +980,8 @@ alloc_new_skb:
 			/* only the initial fragment is time stamped */
 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
 			cork->tx_flags = 0;
+			skb_shinfo(skb)->tskey = tskey;
+			tskey = 0;
 
 			/*
 			 *	Find where to start putting bytes.
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index f5dafe609f8b..315a55d66079 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1157,6 +1157,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 	int err;
 	int offset = 0;
 	__u8 tx_flags = 0;
+	u32 tskey = 0;
 
 	if (flags&MSG_PROBE)
 		return 0;
@@ -1272,8 +1273,12 @@ emsgsize:
 		}
 	}
 
-	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW)
+	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
 		sock_tx_timestamp(sk, &tx_flags);
+		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
+		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
+			tskey = sk->sk_tskey++;
+	}
 
 	/*
 	 * Let's try using as much space as possible.
@@ -1397,6 +1402,8 @@ alloc_new_skb:
 			/* Only the initial fragment is time stamped */
 			skb_shinfo(skb)->tx_flags = tx_flags;
 			tx_flags = 0;
+			skb_shinfo(skb)->tskey = tskey;
+			tskey = 0;
 
 			/*
 			 *	Find where to start putting bytes
-- 
cgit v1.2.3


From 4ed2d765dfaccff5ebdac68e2064b59125033a3b Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Mon, 4 Aug 2014 22:11:49 -0400
Subject: net-timestamp: TCP timestamping

TCP timestamping extends SO_TIMESTAMPING to bytestreams.

Bytestreams do not have a 1:1 relationship between send() buffers and
network packets. The feature interprets a send call on a bytestream as
a request for a timestamp for the last byte in that send() buffer.

The choice corresponds to a request for a timestamp when all bytes in
the buffer have been sent. That assumption depends on in-order kernel
transmission. This is the common case. That said, it is possible to
construct a traffic shaping tree that would result in reordering.
The guarantee is strong, then, but not ironclad.

This implementation supports send and sendpages (splice). GSO replaces
one large packet with multiple smaller packets. This patch also copies
the option into the correct smaller packet.

This patch does not yet support timestamping on data in an initial TCP
Fast Open SYN, because that takes a very different data path.

If ID generation in ee_data is enabled, bytestream timestamps return a
byte offset, instead of the packet counter for datagrams.

The implementation supports a single timestamp per packet. It silenty
replaces requests for previous timestamps. To avoid missing tstamps,
flush the tcp queue by disabling Nagle, cork and autocork. Missing
tstamps can be detected by offset when the ee_data ID is enabled.

Implementation details:

- On GSO, the timestamping code can be included in the main loop. I
moved it into its own loop to reduce the impact on the common case
to a single branch.

- To avoid leaking the absolute seqno to userspace, the offset
returned in ee_data must always be relative. It is an offset between
an skb and sk field. The first is always set (also for GSO & ACK).
The second must also never be uninitialized. Only allow the ID
option on sockets in the ESTABLISHED state, for which the seqno
is available. Never reset it to zero (instead, move it to the
current seqno when reenabling the option).

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c      |  5 ++++-
 net/core/sock.c        | 13 +++++++++++--
 net/ipv4/tcp.c         | 22 +++++++++++++++++++---
 net/ipv4/tcp_offload.c | 18 ++++++++++++++++++
 4 files changed, 52 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 9705c0732aab..3dec0293a7c5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3522,8 +3522,11 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
 	serr->ee.ee_errno = ENOMSG;
 	serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
 	serr->ee.ee_info = tstype;
-	if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
+	if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
 		serr->ee.ee_data = skb_shinfo(skb)->tskey;
+		if (sk->sk_protocol == IPPROTO_TCP)
+			serr->ee.ee_data -= sk->sk_tskey;
+	}
 
 	err = sock_queue_err_skb(sk, skb);
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 1e0f1c63ad6b..2714811afbd8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -849,8 +849,17 @@ set_rcvbuf:
 			break;
 		}
 		if (val & SOF_TIMESTAMPING_OPT_ID &&
-		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID))
-			sk->sk_tskey = 0;
+		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
+			if (sk->sk_protocol == IPPROTO_TCP) {
+				if (sk->sk_state != TCP_ESTABLISHED) {
+					ret = -EINVAL;
+					break;
+				}
+				sk->sk_tskey = tcp_sk(sk)->snd_una;
+			} else {
+				sk->sk_tskey = 0;
+			}
+		}
 		sk->sk_tsflags = val;
 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 			sock_enable_timestamp(sk,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9d2118e5fbc7..744af67a5989 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -426,6 +426,15 @@ void tcp_init_sock(struct sock *sk)
 }
 EXPORT_SYMBOL(tcp_init_sock);
 
+void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	sock_tx_timestamp(sk, &shinfo->tx_flags);
+	if (shinfo->tx_flags & SKBTX_ANY_SW_TSTAMP)
+		shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
+}
+
 /*
  *	Wait for a TCP event.
  *
@@ -523,7 +532,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 	}
 	/* This barrier is coupled with smp_wmb() in tcp_reset() */
 	smp_rmb();
-	if (sk->sk_err)
+	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
 		mask |= POLLERR;
 
 	return mask;
@@ -959,8 +968,10 @@ new_segment:
 
 		copied += copy;
 		offset += copy;
-		if (!(size -= copy))
+		if (!(size -= copy)) {
+			tcp_tx_timestamp(sk, skb);
 			goto out;
+		}
 
 		if (skb->len < size_goal || (flags & MSG_OOB))
 			continue;
@@ -1252,8 +1263,10 @@ new_segment:
 
 			from += copy;
 			copied += copy;
-			if ((seglen -= copy) == 0 && iovlen == 0)
+			if ((seglen -= copy) == 0 && iovlen == 0) {
+				tcp_tx_timestamp(sk, skb);
 				goto out;
+			}
 
 			if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
 				continue;
@@ -1617,6 +1630,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	struct sk_buff *skb;
 	u32 urg_hole = 0;
 
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return ip_recv_error(sk, msg, len, addr_len);
+
 	if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
 	    (sk->sk_state == TCP_ESTABLISHED))
 		sk_busy_loop(sk, nonblock);
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 55046ecd083e..f597119fc4e7 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -14,6 +14,21 @@
 #include <net/tcp.h>
 #include <net/protocol.h>
 
+void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, unsigned int seq,
+		    unsigned int mss)
+{
+	while (skb) {
+		if (ts_seq < (__u64) seq + mss) {
+			skb_shinfo(skb)->tx_flags = SKBTX_SW_TSTAMP;
+			skb_shinfo(skb)->tskey = ts_seq;
+			return;
+		}
+
+		skb = skb->next;
+		seq += mss;
+	}
+}
+
 struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 				netdev_features_t features)
 {
@@ -91,6 +106,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 	th = tcp_hdr(skb);
 	seq = ntohl(th->seq);
 
+	if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP))
+		tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss);
+
 	newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
 					       (__force u32)delta));
 
-- 
cgit v1.2.3


From e1c8a607b28190cd09a271508aa3025d3c2f312e Mon Sep 17 00:00:00 2001
From: Willem de Bruijn
Date: Mon, 4 Aug 2014 22:11:50 -0400
Subject: net-timestamp: ACK timestamp for bytestreams

Add SOF_TIMESTAMPING_TX_ACK, a request for a tstamp when the last byte
in the send() call is acknowledged. It implements the feature for TCP.

The timestamp is generated when the TCP socket cumulative ACK is moved
beyond the tracked seqno for the first time. The feature ignores SACK
and FACK, because those acknowledge the specific byte, but not
necessarily the entire contents of the buffer up to that byte.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h          | 7 ++++++-
 include/uapi/linux/errqueue.h   | 1 +
 include/uapi/linux/net_tstamp.h | 3 ++-
 net/ipv4/tcp_input.c            | 6 ++++++
 net/socket.c                    | 2 ++
 5 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 50e1e9b3a5a5..11c270551d25 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -250,9 +250,14 @@ enum {
 
 	/* generate software time stamp when entering packet scheduling */
 	SKBTX_SCHED_TSTAMP = 1 << 6,
+
+	/* generate software timestamp on peer data acknowledgment */
+	SKBTX_ACK_TSTAMP = 1 << 7,
 };
 
-#define SKBTX_ANY_SW_TSTAMP	(SKBTX_SW_TSTAMP | SKBTX_SCHED_TSTAMP)
+#define SKBTX_ANY_SW_TSTAMP	(SKBTX_SW_TSTAMP    | \
+				 SKBTX_SCHED_TSTAMP | \
+				 SKBTX_ACK_TSTAMP)
 #define SKBTX_ANY_TSTAMP	(SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP)
 
 /*
diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h
index 17437cf297b7..07bdce1f444a 100644
--- a/include/uapi/linux/errqueue.h
+++ b/include/uapi/linux/errqueue.h
@@ -40,6 +40,7 @@ struct scm_timestamping {
 enum {
 	SCM_TSTAMP_SND,		/* driver passed skb to NIC, or HW */
 	SCM_TSTAMP_SCHED,	/* data entered the packet scheduler */
+	SCM_TSTAMP_ACK,		/* data acknowledged by peer */
 };
 
 #endif /* _UAPI_LINUX_ERRQUEUE_H */
diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index 60733845fcdd..ff354021bb69 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -22,8 +22,9 @@ enum {
 	SOF_TIMESTAMPING_RAW_HARDWARE = (1<<6),
 	SOF_TIMESTAMPING_OPT_ID = (1<<7),
 	SOF_TIMESTAMPING_TX_SCHED = (1<<8),
+	SOF_TIMESTAMPING_TX_ACK = (1<<9),
 
-	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_TX_SCHED,
+	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_TX_ACK,
 	SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) |
 				 SOF_TIMESTAMPING_LAST
 };
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6a2984507755..a3d47af01906 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -74,6 +74,7 @@
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
 #include <net/netdma.h>
+#include <linux/errqueue.h>
 
 int sysctl_tcp_timestamps __read_mostly = 1;
 int sysctl_tcp_window_scaling __read_mostly = 1;
@@ -3106,6 +3107,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 			tp->retrans_stamp = 0;
 		}
 
+		if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_ACK_TSTAMP) &&
+		    between(skb_shinfo(skb)->tskey, prior_snd_una,
+			    tp->snd_una + 1))
+			__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+
 		if (!fully_acked)
 			break;
 
diff --git a/net/socket.c b/net/socket.c
index 3a2778d71631..ae89569a2db5 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -619,6 +619,8 @@ void sock_tx_timestamp(struct sock *sk, __u8 *tx_flags)
 		*tx_flags |= SKBTX_SW_TSTAMP;
 	if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)
 		*tx_flags |= SKBTX_SCHED_TSTAMP;
+	if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)
+		*tx_flags |= SKBTX_ACK_TSTAMP;
 
 	if (sock_flag(sk, SOCK_WIFI_STATUS))
 		*tx_flags |= SKBTX_WIFI_STATUS;
-- 
cgit v1.2.3