From 628e341f319f1a64a4639088faba952e4ec8f0a8 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa
Date: Wed, 14 Aug 2013 13:05:23 +0200
Subject: xfrm: make local error reporting more robust

In xfrm4 and xfrm6 we need to take care about sockets of the other
address family. This could happen because a 6in4 or 4in6 tunnel could
get protected by ipsec.

Because we don't want to have a run-time dependency on ipv6 when only
using ipv4 xfrm we have to embed a pointer to the correct local_error
function in xfrm_state_afinet and look it up when returning an error
depending on the socket address family.

Thanks to vi0ss for the great bug report:
<https://bugzilla.kernel.org/show_bug.cgi?id=58691>

v2:
a) fix two more unsafe interpretations of skb->sk as ipv6 socket
   (xfrm6_local_dontfrag and __xfrm6_output)
v3:
a) add an EXPORT_SYMBOL_GPL(xfrm_local_error) to fix a link error when
   building ipv6 as a module (thanks to Steffen Klassert)

Reported-by: <vi0oss@gmail.com>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/xfrm6_output.c | 10 ++++++----
 net/ipv6/xfrm6_state.c  |  1 +
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 8755a3079d0f..b64fff30eb06 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -34,8 +34,10 @@ static int xfrm6_local_dontfrag(struct sk_buff *skb)
 	struct sock *sk = skb->sk;
 
 	if (sk) {
-		proto = sk->sk_protocol;
+		if (sk->sk_family != AF_INET6)
+			return 0;
 
+		proto = sk->sk_protocol;
 		if (proto == IPPROTO_UDP || proto == IPPROTO_RAW)
 			return inet6_sk(sk)->dontfrag;
 	}
@@ -54,7 +56,7 @@ static void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu)
 	ipv6_local_rxpmtu(sk, &fl6, mtu);
 }
 
-static void xfrm6_local_error(struct sk_buff *skb, u32 mtu)
+void xfrm6_local_error(struct sk_buff *skb, u32 mtu)
 {
 	struct flowi6 fl6;
 	struct sock *sk = skb->sk;
@@ -80,7 +82,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb)
 		if (xfrm6_local_dontfrag(skb))
 			xfrm6_local_rxpmtu(skb, mtu);
 		else if (skb->sk)
-			xfrm6_local_error(skb, mtu);
+			xfrm_local_error(skb, mtu);
 		else
 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 		ret = -EMSGSIZE;
@@ -142,7 +144,7 @@ static int __xfrm6_output(struct sk_buff *skb)
 		xfrm6_local_rxpmtu(skb, mtu);
 		return -EMSGSIZE;
 	} else if (!skb->local_df && skb->len > mtu && skb->sk) {
-		xfrm6_local_error(skb, mtu);
+		xfrm_local_error(skb, mtu);
 		return -EMSGSIZE;
 	}
 
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index d8c70b8efc24..3fc970135fc6 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -183,6 +183,7 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = {
 	.extract_input		= xfrm6_extract_input,
 	.extract_output		= xfrm6_extract_output,
 	.transport_finish	= xfrm6_transport_finish,
+	.local_error		= xfrm6_local_error,
 };
 
 int __init xfrm6_state_init(void)
-- 
cgit v1.2.3


From 0ea9d5e3e0e03a63b11392f5613378977dae7eca Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa
Date: Tue, 13 Aug 2013 04:35:58 +0200
Subject: xfrm: introduce helper for safe determination of mtu

skb->sk socket can be of AF_INET or AF_INET6 address family. Thus we
always have to make sure we a referring to the correct interpretation
of skb->sk.

We only depend on header defines to query the mtu, so we don't introduce
a new dependency to ipv6 by this change.

Cc: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/route.h     |  8 ++++++++
 include/net/xfrm.h      | 12 ++++++++++++
 net/ipv4/ip_output.c    |  8 --------
 net/ipv4/xfrm4_output.c |  4 +---
 net/ipv6/xfrm6_output.c |  5 ++++-
 5 files changed, 25 insertions(+), 12 deletions(-)

(limited to 'net/ipv6')

diff --git a/include/net/route.h b/include/net/route.h
index 2ea40c1b5e00..afdeeb5bec25 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -317,4 +317,12 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
 	return hoplimit;
 }
 
+static inline int ip_skb_dst_mtu(struct sk_buff *skb)
+{
+	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
+
+	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
+	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
+}
+
 #endif	/* _ROUTE_H */
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index e823786e7c66..b41d2d10ff0e 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -20,6 +20,7 @@
 #include <net/route.h>
 #include <net/ipv6.h>
 #include <net/ip6_fib.h>
+#include <net/ip6_route.h>
 #include <net/flow.h>
 
 #include <linux/interrupt.h>
@@ -1723,4 +1724,15 @@ static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m)
 	return ret;
 }
 
+static inline int xfrm_skb_dst_mtu(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+
+	if (sk && sk->sk_family == AF_INET6)
+		return ip6_skb_dst_mtu(skb);
+	else if (sk && sk->sk_family == AF_INET)
+		return ip_skb_dst_mtu(skb);
+	return dst_mtu(skb_dst(skb));
+}
+
 #endif	/* _NET_XFRM_H */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4bcabf3ab4ca..9ee17e3d11c3 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -211,14 +211,6 @@ static inline int ip_finish_output2(struct sk_buff *skb)
 	return -EINVAL;
 }
 
-static inline int ip_skb_dst_mtu(struct sk_buff *skb)
-{
-	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
-
-	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
-	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
-}
-
 static int ip_finish_output(struct sk_buff *skb)
 {
 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 7a5491ffa4de..80baf4a3b1b5 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -21,7 +21,6 @@
 static int xfrm4_tunnel_check_size(struct sk_buff *skb)
 {
 	int mtu, ret = 0;
-	struct dst_entry *dst;
 
 	if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
 		goto out;
@@ -29,8 +28,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
 	if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df)
 		goto out;
 
-	dst = skb_dst(skb);
-	mtu = dst_mtu(dst);
+	mtu = xfrm_skb_dst_mtu(skb);
 	if (skb->len > mtu) {
 		if (skb->sk)
 			xfrm_local_error(skb, mtu);
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index b64fff30eb06..3ac5ab264fed 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -138,7 +138,10 @@ static int __xfrm6_output(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
 	struct xfrm_state *x = dst->xfrm;
-	int mtu = ip6_skb_dst_mtu(skb);
+	int mtu = xfrm_skb_dst_mtu(skb);
+
+	if (mtu < IPV6_MIN_MTU)
+		mtu = IPV6_MIN_MTU;
 
 	if (skb->len > mtu && xfrm6_local_dontfrag(skb)) {
 		xfrm6_local_rxpmtu(skb, mtu);
-- 
cgit v1.2.3


From 3d483058c8c8b87a167155ca9ddd776dd730bc39 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa
Date: Sun, 18 Aug 2013 13:46:52 +0200
Subject: ipv6: wire up skb->encapsulation

When pushing a new header before current one call skb_reset_inner_headers
to record the position of the inner headers in the various ipv6 tunnel
protocols.

We later need this to correctly identify the addresses needed to send
back an error in the xfrm layer.

This change is safe, because skb->protocol is always checked before
dereferencing data from the inner protocol.

Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Cc: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/ip6_gre.c    | 5 +++++
 net/ipv6/ip6_tunnel.c | 6 ++++++
 net/ipv6/sit.c        | 5 +++++
 3 files changed, 16 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index ecd60733e5e2..90747f1973fe 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -724,6 +724,11 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
 		ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL);
 	}
 
+	if (likely(!skb->encapsulation)) {
+		skb_reset_inner_headers(skb);
+		skb->encapsulation = 1;
+	}
+
 	skb_push(skb, gre_hlen);
 	skb_reset_network_header(skb);
 	skb_set_transport_header(skb, sizeof(*ipv6h));
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 1e55866cead7..46ba243605a3 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1027,6 +1027,12 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
 		init_tel_txopt(&opt, encap_limit);
 		ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL);
 	}
+
+	if (likely(!skb->encapsulation)) {
+		skb_reset_inner_headers(skb);
+		skb->encapsulation = 1;
+	}
+
 	skb_push(skb, sizeof(struct ipv6hdr));
 	skb_reset_network_header(skb);
 	ipv6h = ipv6_hdr(skb);
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index a3437a4cd07e..fbfc5a83867f 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -888,6 +888,11 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
 		ttl = iph6->hop_limit;
 	tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6));
 
+	if (likely(!skb->encapsulation)) {
+		skb_reset_inner_headers(skb);
+		skb->encapsulation = 1;
+	}
+
 	err = iptunnel_xmit(dev_net(dev), rt, skb, fl4.saddr, fl4.daddr,
 			    IPPROTO_IPV6, tos, ttl, df);
 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
-- 
cgit v1.2.3


From 5d0ff542d0264f61dc4bdb34eba39ffb4ea3bc23 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa
Date: Sun, 18 Aug 2013 13:46:57 +0200
Subject: ipv6: xfrm: dereference inner ipv6 header if encapsulated

In xfrm6_local_error use inner_header if the packet was encapsulated.

Cc: Steffen Klassert <steffen.klassert@secunet.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/xfrm6_output.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 3ac5ab264fed..e092e306882d 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -59,10 +59,12 @@ static void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu)
 void xfrm6_local_error(struct sk_buff *skb, u32 mtu)
 {
 	struct flowi6 fl6;
+	const struct ipv6hdr *hdr;
 	struct sock *sk = skb->sk;
 
+	hdr = skb->encapsulation ? inner_ipv6_hdr(skb) : ipv6_hdr(skb);
 	fl6.fl6_dport = inet_sk(sk)->inet_dport;
-	fl6.daddr = ipv6_hdr(skb)->daddr;
+	fl6.daddr = hdr->daddr;
 
 	ipv6_local_error(sk, EMSGSIZE, &fl6, mtu);
 }
-- 
cgit v1.2.3


From 5a25cf1e310888eb333f9e034be84a8117111d30 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa
Date: Mon, 26 Aug 2013 12:31:19 +0200
Subject: xfrm: revert ipv4 mtu determination to dst_mtu

In commit 0ea9d5e3e0e03a63b11392f5613378977dae7eca ("xfrm: introduce
helper for safe determination of mtu") I switched the determination of
ipv4 mtus from dst_mtu to ip_skb_dst_mtu. This was an error because in
case of IP_PMTUDISC_PROBE we fall back to the interface mtu, which is
never correct for ipv4 ipsec.

This patch partly reverts 0ea9d5e3e0e03a63b11392f5613378977dae7eca
("xfrm: introduce helper for safe determination of mtu").

Cc: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h      | 12 ------------
 net/ipv4/xfrm4_output.c |  2 +-
 net/ipv6/xfrm6_output.c |  8 +++++---
 3 files changed, 6 insertions(+), 16 deletions(-)

(limited to 'net/ipv6')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index ac5b02515355..e823786e7c66 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -20,7 +20,6 @@
 #include <net/route.h>
 #include <net/ipv6.h>
 #include <net/ip6_fib.h>
-#include <net/ip6_route.h>
 #include <net/flow.h>
 
 #include <linux/interrupt.h>
@@ -1724,15 +1723,4 @@ static inline int xfrm_mark_put(struct sk_buff *skb, const struct xfrm_mark *m)
 	return ret;
 }
 
-static inline int xfrm_skb_dst_mtu(struct sk_buff *skb)
-{
-	struct sock *sk = skb->sk;
-
-	if (sk && skb->protocol == htons(ETH_P_IPV6))
-		return ip6_skb_dst_mtu(skb);
-	else if (sk && skb->protocol == htons(ETH_P_IP))
-		return ip_skb_dst_mtu(skb);
-	return dst_mtu(skb_dst(skb));
-}
-
 #endif	/* _NET_XFRM_H */
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 80baf4a3b1b5..baa0f63731fd 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -28,7 +28,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
 	if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->local_df)
 		goto out;
 
-	mtu = xfrm_skb_dst_mtu(skb);
+	mtu = dst_mtu(skb_dst(skb));
 	if (skb->len > mtu) {
 		if (skb->sk)
 			xfrm_local_error(skb, mtu);
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index e092e306882d..6cd625e37706 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -140,10 +140,12 @@ static int __xfrm6_output(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
 	struct xfrm_state *x = dst->xfrm;
-	int mtu = xfrm_skb_dst_mtu(skb);
+	int mtu;
 
-	if (mtu < IPV6_MIN_MTU)
-		mtu = IPV6_MIN_MTU;
+	if (skb->protocol == htons(ETH_P_IPV6))
+		mtu = ip6_skb_dst_mtu(skb);
+	else
+		mtu = dst_mtu(skb_dst(skb));
 
 	if (skb->len > mtu && xfrm6_local_dontfrag(skb)) {
 		xfrm6_local_rxpmtu(skb, mtu);
-- 
cgit v1.2.3


From 9c9c9ad5fae7e9ef56a38acb508a01919b225e9a Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa
Date: Mon, 26 Aug 2013 12:31:23 +0200
Subject: ipv6: set skb->protocol on tcp, raw and ip6_append_data genereated
 skbs

Currently we don't initialize skb->protocol when transmitting data via
tcp, raw(with and without inclhdr) or udp+ufo or appending data directly
to the socket transmit queue (via ip6_append_data). This needs to be
done so that we can get the correct mtu in the xfrm layer.

Setting of skb->protocol happens only in functions where we also have
a transmitting socket and a new skb, so we don't overwrite old values.

Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/ip6_output.c | 3 +++
 net/ipv6/raw.c        | 1 +
 2 files changed, 4 insertions(+)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 6e3ddf806ec2..e7ceb6c871d1 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -238,6 +238,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 	hdr->saddr = fl6->saddr;
 	hdr->daddr = *first_hop;
 
+	skb->protocol = htons(ETH_P_IPV6);
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
 
@@ -1057,6 +1058,7 @@ static inline int ip6_ufo_append_data(struct sock *sk,
 		/* initialize protocol header pointer */
 		skb->transport_header = skb->network_header + fragheaderlen;
 
+		skb->protocol = htons(ETH_P_IPV6);
 		skb->ip_summed = CHECKSUM_PARTIAL;
 		skb->csum = 0;
 	}
@@ -1359,6 +1361,7 @@ alloc_new_skb:
 			/*
 			 *	Fill in the control structures
 			 */
+			skb->protocol = htons(ETH_P_IPV6);
 			skb->ip_summed = CHECKSUM_NONE;
 			skb->csum = 0;
 			/* reserve for fragmentation and ipsec header */
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index c45f7a5c36e9..cdaed47ba932 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -628,6 +628,7 @@ static int rawv6_send_hdrinc(struct sock *sk, void *from, int length,
 		goto error;
 	skb_reserve(skb, hlen);
 
+	skb->protocol = htons(ETH_P_IPV6);
 	skb->priority = sk->sk_priority;
 	skb->mark = sk->sk_mark;
 	skb_dst_set(skb, &rt->dst);
-- 
cgit v1.2.3


From 1f324e38870cc09659cf23bc626f1b8869e201f2 Mon Sep 17 00:00:00 2001
From: Thomas Graf
Date: Wed, 28 Aug 2013 01:07:25 +0200
Subject: ipv6: Don't depend on per socket memory for neighbour discovery
 messages

Allocating skbs when sending out neighbour discovery messages
currently uses sock_alloc_send_skb() based on a per net namespace
socket and thus share a socket wmem buffer space.

If a netdevice is temporarily unable to transmit due to carrier
loss or for other reasons, the queued up ndisc messages will cosnume
all of the wmem space and will thus prevent from any more skbs to
be allocated even for netdevices that are able to transmit packets.

The number of neighbour discovery messages sent is very limited,
simply use alloc_skb() and don't depend on any socket wmem space any
longer.

This patch has orginally been posted by Eric Dumazet in a modified
form.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ndisc.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 04d31c2fbef1..5cb98df966c2 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -370,16 +370,12 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev,
 {
 	int hlen = LL_RESERVED_SPACE(dev);
 	int tlen = dev->needed_tailroom;
-	struct sock *sk = dev_net(dev)->ipv6.ndisc_sk;
 	struct sk_buff *skb;
-	int err;
 
-	skb = sock_alloc_send_skb(sk,
-				  hlen + sizeof(struct ipv6hdr) + len + tlen,
-				  1, &err);
+	skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC);
 	if (!skb) {
-		ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb, err=%d\n",
-			  __func__, err);
+		ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n",
+			  __func__);
 		return NULL;
 	}
 
-- 
cgit v1.2.3


From 737e828bdbdaf2f9d7de07f20a0308ac46ce5178 Mon Sep 17 00:00:00 2001
From: Li Hongjun
Date: Wed, 28 Aug 2013 11:54:50 +0200
Subject: ipv4 tunnels: fix an oops when using ipip/sit with IPsec

Since commit 3d7b46cd20e3 (ip_tunnel: push generic protocol handling to
ip_tunnel module.), an Oops is triggered when an xfrm policy is configured on
an IPv4 over IPv4 tunnel.

xfrm4_policy_check() calls __xfrm_policy_check2(), which uses skb_dst(skb). But
this field is NULL because iptunnel_pull_header() calls skb_dst_drop(skb).

Signed-off-by: Li Hongjun <hongjun.li@6wind.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipip.c | 5 ++---
 net/ipv6/sit.c  | 6 ++----
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 51fc2a1dcdd3..b3ac3c3f6219 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -190,15 +190,14 @@ static int ipip_rcv(struct sk_buff *skb)
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph;
 
-	if (iptunnel_pull_header(skb, 0, tpi.proto))
-		goto drop;
-
 	iph = ip_hdr(skb);
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 			iph->saddr, iph->daddr, 0);
 	if (tunnel) {
 		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
 			goto drop;
+		if (iptunnel_pull_header(skb, 0, tpi.proto))
+			goto drop;
 		return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
 	}
 
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index fbfc5a83867f..21b25dd8466b 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -645,11 +645,7 @@ static int ipip_rcv(struct sk_buff *skb)
 	const struct iphdr *iph;
 	struct ip_tunnel *tunnel;
 
-	if (iptunnel_pull_header(skb, 0, tpi.proto))
-		goto drop;
-
 	iph = ip_hdr(skb);
-
 	tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
 				     iph->saddr, iph->daddr);
 	if (tunnel != NULL) {
@@ -659,6 +655,8 @@ static int ipip_rcv(struct sk_buff *skb)
 
 		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
 			goto drop;
+		if (iptunnel_pull_header(skb, 0, tpi.proto))
+			goto drop;
 		return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
 	}
 
-- 
cgit v1.2.3


From 25ad6117e73656071b38fd19fa67ae325471c758 Mon Sep 17 00:00:00 2001
From: David S. Miller
Date: Fri, 30 Aug 2013 17:39:33 -0400
Subject: Revert "ipv6: Don't depend on per socket memory for neighbour
 discovery messages"

This reverts commit 1f324e38870cc09659cf23bc626f1b8869e201f2.

It seems to cause regressions, and in particular the output path
really depends upon there being a socket attached to skb->sk for
checks such as sk_mc_loop(skb->sk) for example.  See ip6_output_finish2().

Reported-by: Stephen Warren <swarren@wwwdotorg.org>
Reported-by: Fabio Estevam <festevam@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ndisc.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 5cb98df966c2..04d31c2fbef1 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -370,12 +370,16 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev,
 {
 	int hlen = LL_RESERVED_SPACE(dev);
 	int tlen = dev->needed_tailroom;
+	struct sock *sk = dev_net(dev)->ipv6.ndisc_sk;
 	struct sk_buff *skb;
+	int err;
 
-	skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC);
+	skb = sock_alloc_send_skb(sk,
+				  hlen + sizeof(struct ipv6hdr) + len + tlen,
+				  1, &err);
 	if (!skb) {
-		ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n",
-			  __func__);
+		ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb, err=%d\n",
+			  __func__, err);
 		return NULL;
 	}
 
-- 
cgit v1.2.3


From 61e76b178dbe7145e8d6afa84bb4ccea71918994 Mon Sep 17 00:00:00 2001
From: Jiri Bohac
Date: Fri, 30 Aug 2013 11:18:45 +0200
Subject: ICMPv6: treat dest unreachable codes 5 and 6 as EACCES, not EPROTO

RFC 4443 has defined two additional codes for ICMPv6 type 1 (destination
unreachable) messages:
        5 - Source address failed ingress/egress policy
	6 - Reject route to destination

Now they are treated as protocol error and icmpv6_err_convert() converts them
to EPROTO.

RFC 4443 says:
	"Codes 5 and 6 are more informative subsets of code 1."

Treat codes 5 and 6 as code 1 (EACCES)

Btw, connect() returning -EPROTO confuses firefox, so that fallback to
other/IPv4 addresses does not work:
https://bugzilla.mozilla.org/show_bug.cgi?id=910773

Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/icmpv6.h |  2 ++
 net/ipv6/icmp.c             | 10 +++++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h
index e0133c73c304..590beda78ea0 100644
--- a/include/uapi/linux/icmpv6.h
+++ b/include/uapi/linux/icmpv6.h
@@ -115,6 +115,8 @@ struct icmp6hdr {
 #define ICMPV6_NOT_NEIGHBOUR		2
 #define ICMPV6_ADDR_UNREACH		3
 #define ICMPV6_PORT_UNREACH		4
+#define ICMPV6_POLICY_FAIL		5
+#define ICMPV6_REJECT_ROUTE		6
 
 /*
  *	Codes for Time Exceeded
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 7cfc8d284870..67ae4e0d40bf 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -940,6 +940,14 @@ static const struct icmp6_err {
 		.err	= ECONNREFUSED,
 		.fatal	= 1,
 	},
+	{	/* POLICY_FAIL */
+		.err	= EACCES,
+		.fatal	= 1,
+	},
+	{	/* REJECT_ROUTE	*/
+		.err	= EACCES,
+		.fatal	= 1,
+	},
 };
 
 int icmpv6_err_convert(u8 type, u8 code, int *err)
@@ -951,7 +959,7 @@ int icmpv6_err_convert(u8 type, u8 code, int *err)
 	switch (type) {
 	case ICMPV6_DEST_UNREACH:
 		fatal = 1;
-		if (code <= ICMPV6_PORT_UNREACH) {
+		if (code < ARRAY_SIZE(tab_unreach)) {
 			*err  = tab_unreach[code].err;
 			fatal = tab_unreach[code].fatal;
 		}
-- 
cgit v1.2.3


From 13c7bf0871509723a2b3c054d3d6e9e506464e71 Mon Sep 17 00:00:00 2001
From: Petr Holasek
Date: Fri, 30 Aug 2013 17:02:38 +0200
Subject: ipv6: ipv6_create_tempaddr cleanup

This two-liner removes max_addresses variable which is now unecessary related
to patch [ipv6: remove max_addresses check from ipv6_create_tempaddr].

Signed-off-by: Petr Holasek <pholasek@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 498ea99194af..05c83dd54a4d 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1054,7 +1054,6 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *i
 	unsigned long regen_advance;
 	int tmp_plen;
 	int ret = 0;
-	int max_addresses;
 	u32 addr_flags;
 	unsigned long now = jiffies;
 
@@ -1100,7 +1099,6 @@ retry:
 				 idev->cnf.temp_prefered_lft + age -
 				 idev->cnf.max_desync_factor);
 	tmp_plen = ifp->prefix_len;
-	max_addresses = idev->cnf.max_addresses;
 	tmp_tstamp = ifp->tstamp;
 	spin_unlock_bh(&ifp->lock);
 
-- 
cgit v1.2.3


From 639739b5e609a5074839bb22fc061b37baa06269 Mon Sep 17 00:00:00 2001
From: Hannes Frederic Sowa
Date: Tue, 3 Sep 2013 02:13:31 +0200
Subject: ipv6: fix null pointer dereference in __ip6addrlbl_add

Commit b67bfe0d42cac56c512dd5da4b1b347a23f4b70a ("hlist: drop
the node parameter from iterators") changed the behavior of
hlist_for_each_entry_safe to leave the p argument NULL.

Fix this up by tracking the last argument.

Reported-by: Michele Baldessari <michele@acksyn.org>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Tested-by: Michele Baldessari <michele@acksyn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrlabel.c | 48 +++++++++++++++++++++++-------------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index f083a583a05c..b30ad3741b46 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -251,38 +251,36 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net,
 /* add a label */
 static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace)
 {
+	struct hlist_node *n;
+	struct ip6addrlbl_entry *last = NULL, *p = NULL;
 	int ret = 0;
 
-	ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n",
-			__func__,
-			newp, replace);
+	ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp,
+		  replace);
 
-	if (hlist_empty(&ip6addrlbl_table.head)) {
-		hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head);
-	} else {
-		struct hlist_node *n;
-		struct ip6addrlbl_entry *p = NULL;
-		hlist_for_each_entry_safe(p, n,
-					  &ip6addrlbl_table.head, list) {
-			if (p->prefixlen == newp->prefixlen &&
-			    net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) &&
-			    p->ifindex == newp->ifindex &&
-			    ipv6_addr_equal(&p->prefix, &newp->prefix)) {
-				if (!replace) {
-					ret = -EEXIST;
-					goto out;
-				}
-				hlist_replace_rcu(&p->list, &newp->list);
-				ip6addrlbl_put(p);
-				goto out;
-			} else if ((p->prefixlen == newp->prefixlen && !p->ifindex) ||
-				   (p->prefixlen < newp->prefixlen)) {
-				hlist_add_before_rcu(&newp->list, &p->list);
+	hlist_for_each_entry_safe(p, n,	&ip6addrlbl_table.head, list) {
+		if (p->prefixlen == newp->prefixlen &&
+		    net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) &&
+		    p->ifindex == newp->ifindex &&
+		    ipv6_addr_equal(&p->prefix, &newp->prefix)) {
+			if (!replace) {
+				ret = -EEXIST;
 				goto out;
 			}
+			hlist_replace_rcu(&p->list, &newp->list);
+			ip6addrlbl_put(p);
+			goto out;
+		} else if ((p->prefixlen == newp->prefixlen && !p->ifindex) ||
+			   (p->prefixlen < newp->prefixlen)) {
+			hlist_add_before_rcu(&newp->list, &p->list);
+			goto out;
 		}
-		hlist_add_after_rcu(&p->list, &newp->list);
+		last = p;
 	}
+	if (last)
+		hlist_add_after_rcu(&last->list, &newp->list);
+	else
+		hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head);
 out:
 	if (!ret)
 		ip6addrlbl_table.seq++;
-- 
cgit v1.2.3


From 25a6e6b84fba601eff7c28d30da8ad7cfbef0d43 Mon Sep 17 00:00:00 2001
From: Thomas Graf
Date: Tue, 3 Sep 2013 13:37:01 +0200
Subject: ipv6: Don't depend on per socket memory for neighbour discovery
 messages

Allocating skbs when sending out neighbour discovery messages
currently uses sock_alloc_send_skb() based on a per net namespace
socket and thus share a socket wmem buffer space.

If a netdevice is temporarily unable to transmit due to carrier
loss or for other reasons, the queued up ndisc messages will cosnume
all of the wmem space and will thus prevent from any more skbs to
be allocated even for netdevices that are able to transmit packets.

The number of neighbour discovery messages sent is very limited,
use of alloc_skb() bypasses the socket wmem buffer size enforcement
while the manual call to skb_set_owner_w() maintains the socket
reference needed for the IPv6 output path.

This patch has orginally been posted by Eric Dumazet in a modified
form.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Stephen Warren <swarren@wwwdotorg.org>
Cc: Fabio Estevam <festevam@gmail.com>
Tested-by: Fabio Estevam <fabio.estevam@freescale.com>
Tested-by: Stephen Warren <swarren@nvidia.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ndisc.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 04d31c2fbef1..78141fa46cf8 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -372,14 +372,11 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev,
 	int tlen = dev->needed_tailroom;
 	struct sock *sk = dev_net(dev)->ipv6.ndisc_sk;
 	struct sk_buff *skb;
-	int err;
 
-	skb = sock_alloc_send_skb(sk,
-				  hlen + sizeof(struct ipv6hdr) + len + tlen,
-				  1, &err);
+	skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC);
 	if (!skb) {
-		ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb, err=%d\n",
-			  __func__, err);
+		ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n",
+			  __func__);
 		return NULL;
 	}
 
@@ -389,6 +386,11 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev,
 	skb_reserve(skb, hlen + sizeof(struct ipv6hdr));
 	skb_reset_transport_header(skb);
 
+	/* Manually assign socket ownership as we avoid calling
+	 * sock_alloc_send_pskb() to bypass wmem buffer limits
+	 */
+	skb_set_owner_w(skb, sk);
+
 	return skb;
 }
 
-- 
cgit v1.2.3


From 3a1c756590633c0e86df606e5c618c190926a0df Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Tue, 3 Sep 2013 19:29:12 +0200
Subject: net: ipv6: tcp: fix potential use after free in tcp_v6_do_rcv

In tcp_v6_do_rcv() code, when processing pkt options, we soley work
on our skb clone opt_skb that we've created earlier before entering
tcp_rcv_established() on our way. However, only in condition ...

  if (np->rxopt.bits.rxtclass)
    np->rcv_tclass = ipv6_get_dsfield(ipv6_hdr(skb));

... we work on skb itself. As we extract every other information out
of opt_skb in ipv6_pktoptions path, this seems wrong, since skb can
already be released by tcp_rcv_established() earlier on. When we try
to access it in ipv6_hdr(), we will dereference freed skb.

[ Bug added by commit 4c507d2897bd9b ("net: implement IP_RECVTOS for
  IP_PKTOPTIONS") ]

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv6')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 6e1649d58533..eeb4cb0c57c1 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1427,7 +1427,7 @@ ipv6_pktoptions:
 		if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
 			np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit;
 		if (np->rxopt.bits.rxtclass)
-			np->rcv_tclass = ipv6_get_dsfield(ipv6_hdr(skb));
+			np->rcv_tclass = ipv6_get_dsfield(ipv6_hdr(opt_skb));
 		if (ipv6_opt_accepted(sk, opt_skb)) {
 			skb_set_owner_r(opt_skb, sk);
 			opt_skb = xchg(&np->pktoptions, opt_skb);
-- 
cgit v1.2.3