From 675716400da6f15b9d3db04ef74ee74ca9a00af3 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Thu, 4 Jul 2019 17:25:03 +0300 Subject: xdp: fix possible cq entry leak Completion queue address reservation could not be undone. In case of bad 'queue_id' or skb allocation failure, reserved entry will be leaked reducing the total capacity of completion queue. Fix that by moving reservation to the point where failure is not possible. Additionally, 'queue_id' checking moved out from the loop since there is no point to check it there. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Ilya Maximets Acked-by: Björn Töpel Tested-by: William Tu Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index d4d6f10aa936..b994c32a664a 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -240,6 +240,9 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, mutex_lock(&xs->mutex); + if (xs->queue_id >= xs->dev->real_num_tx_queues) + goto out; + while (xskq_peek_desc(xs->tx, &desc)) { char *buffer; u64 addr; @@ -250,12 +253,6 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, goto out; } - if (xskq_reserve_addr(xs->umem->cq)) - goto out; - - if (xs->queue_id >= xs->dev->real_num_tx_queues) - goto out; - len = desc.len; skb = sock_alloc_send_skb(sk, len, 1, &err); if (unlikely(!skb)) { @@ -267,7 +264,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, addr = desc.addr; buffer = xdp_umem_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); - if (unlikely(err)) { + if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) { kfree_skb(skb); goto out; } -- cgit v1.2.3 From 5464c3a0e9a037b63d5229cdea08dddc01a98aac Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Mon, 8 Jul 2019 14:03:44 +0300 Subject: xdp: fix potential deadlock on socket mutex There are 2 call chains: a) xsk_bind --> xdp_umem_assign_dev b) unregister_netdevice_queue --> xsk_notifier with the following locking order: a) xs->mutex --> rtnl_lock b) rtnl_lock --> xdp.lock --> xs->mutex Different order of taking 'xs->mutex' and 'rtnl_lock' could produce a deadlock here. Fix that by moving the 'rtnl_lock' before 'xs->lock' in the bind call chain (a). Reported-by: syzbot+bf64ec93de836d7f4c2c@syzkaller.appspotmail.com Fixes: 455302d1c9ae ("xdp: fix hang while unregistering device bound to xdp socket") Signed-off-by: Ilya Maximets Acked-by: Jonathan Lemon Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 16 ++++++---------- net/xdp/xsk.c | 2 ++ 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 20c91f02d3d8..83de74ca729a 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -87,21 +87,20 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, struct netdev_bpf bpf; int err = 0; + ASSERT_RTNL(); + force_zc = flags & XDP_ZEROCOPY; force_copy = flags & XDP_COPY; if (force_zc && force_copy) return -EINVAL; - rtnl_lock(); - if (xdp_get_umem_from_qid(dev, queue_id)) { - err = -EBUSY; - goto out_rtnl_unlock; - } + if (xdp_get_umem_from_qid(dev, queue_id)) + return -EBUSY; err = xdp_reg_umem_at_qid(dev, umem, queue_id); if (err) - goto out_rtnl_unlock; + return err; umem->dev = dev; umem->queue_id = queue_id; @@ -110,7 +109,7 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, if (force_copy) /* For copy-mode, we are done. 
*/ - goto out_rtnl_unlock; + return 0; if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit) { @@ -125,7 +124,6 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, err = dev->netdev_ops->ndo_bpf(dev, &bpf); if (err) goto err_unreg_umem; - rtnl_unlock(); umem->zc = true; return 0; @@ -135,8 +133,6 @@ err_unreg_umem: err = 0; /* fallback to copy mode */ if (err) xdp_clear_umem_at_qid(dev, queue_id); -out_rtnl_unlock: - rtnl_unlock(); return err; } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b994c32a664a..59b57d708697 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -430,6 +430,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY)) return -EINVAL; + rtnl_lock(); mutex_lock(&xs->mutex); if (xs->state != XSK_READY) { err = -EBUSY; @@ -515,6 +516,7 @@ out_unlock: xs->state = XSK_BOUND; out_release: mutex_unlock(&xs->mutex); + rtnl_unlock(); return err; } -- cgit v1.2.3 From 6e3d1bbbba55265d327e44a2d14de70462232853 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Sun, 14 Jul 2019 21:31:22 +0800 Subject: sit: use dst_cache in ipip6_tunnel_xmit Same as other ip tunnel, use dst_cache in xmit action to avoid unnecessary fib lookups. Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv6/sit.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 80610899a323..b2ccbc473127 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -900,12 +900,17 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, RT_TOS(tos), RT_SCOPE_UNIVERSE, IPPROTO_IPV6, 0, dst, tiph->saddr, 0, 0, sock_net_uid(tunnel->net, NULL)); - rt = ip_route_output_flow(tunnel->net, &fl4, NULL); - if (IS_ERR(rt)) { - dev->stats.tx_carrier_errors++; - goto tx_error_icmp; + rt = dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr); + if (!rt) { + rt = ip_route_output_flow(tunnel->net, &fl4, NULL); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; + } + dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr); } + if (rt->rt_type != RTN_UNICAST) { ip_rt_put(rt); dev->stats.tx_carrier_errors++; -- cgit v1.2.3 From 071c37983d99da07797294ea78e9da1a6e287144 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sun, 14 Jul 2019 23:36:11 +0200 Subject: net: neigh: fix multiple neigh timer scheduling Neigh timer can be scheduled multiple times from userspace adding multiple neigh entries and forcing the neigh timer scheduling passing NTF_USE in the netlink requests. This will result in a refcount leak and in the following dump stack: [ 32.465295] NEIGH: BUG, double timer add, state is 8 [ 32.465308] CPU: 0 PID: 416 Comm: double_timer_ad Not tainted 5.2.0+ #65 [ 32.465311] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.12.0-2.fc30 04/01/2014 [ 32.465313] Call Trace: [ 32.465318] dump_stack+0x7c/0xc0 [ 32.465323] __neigh_event_send+0x20c/0x880 [ 32.465326] ? ___neigh_create+0x846/0xfb0 [ 32.465329] ? neigh_lookup+0x2a9/0x410 [ 32.465332] ? neightbl_fill_info.constprop.0+0x800/0x800 [ 32.465334] neigh_add+0x4f8/0x5e0 [ 32.465337] ? neigh_xmit+0x620/0x620 [ 32.465341] ? find_held_lock+0x85/0xa0 [ 32.465345] rtnetlink_rcv_msg+0x204/0x570 [ 32.465348] ? rtnl_dellink+0x450/0x450 [ 32.465351] ? mark_held_locks+0x90/0x90 [ 32.465354] ? match_held_lock+0x1b/0x230 [ 32.465357] netlink_rcv_skb+0xc4/0x1d0 [ 32.465360] ? rtnl_dellink+0x450/0x450 [ 32.465363] ? 
netlink_ack+0x420/0x420 [ 32.465366] ? netlink_deliver_tap+0x115/0x560 [ 32.465369] ? __alloc_skb+0xc9/0x2f0 [ 32.465372] netlink_unicast+0x270/0x330 [ 32.465375] ? netlink_attachskb+0x2f0/0x2f0 [ 32.465378] netlink_sendmsg+0x34f/0x5a0 [ 32.465381] ? netlink_unicast+0x330/0x330 [ 32.465385] ? move_addr_to_kernel.part.0+0x20/0x20 [ 32.465388] ? netlink_unicast+0x330/0x330 [ 32.465391] sock_sendmsg+0x91/0xa0 [ 32.465394] ___sys_sendmsg+0x407/0x480 [ 32.465397] ? copy_msghdr_from_user+0x200/0x200 [ 32.465401] ? _raw_spin_unlock_irqrestore+0x37/0x40 [ 32.465404] ? lockdep_hardirqs_on+0x17d/0x250 [ 32.465407] ? __wake_up_common_lock+0xcb/0x110 [ 32.465410] ? __wake_up_common+0x230/0x230 [ 32.465413] ? netlink_bind+0x3e1/0x490 [ 32.465416] ? netlink_setsockopt+0x540/0x540 [ 32.465420] ? __fget_light+0x9c/0xf0 [ 32.465423] ? sockfd_lookup_light+0x8c/0xb0 [ 32.465426] __sys_sendmsg+0xa5/0x110 [ 32.465429] ? __ia32_sys_shutdown+0x30/0x30 [ 32.465432] ? __fd_install+0xe1/0x2c0 [ 32.465435] ? lockdep_hardirqs_off+0xb5/0x100 [ 32.465438] ? mark_held_locks+0x24/0x90 [ 32.465441] ? do_syscall_64+0xf/0x270 [ 32.465444] do_syscall_64+0x63/0x270 [ 32.465448] entry_SYSCALL_64_after_hwframe+0x49/0xbe Fix the issue unscheduling neigh_timer if selected entry is in 'IN_TIMER' receiving a netlink request with NTF_USE flag set Reported-by: Marek Majkowski Fixes: 0c5c2d308906 ("neigh: Allow for user space users of the neighbour table") Signed-off-by: Lorenzo Bianconi Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/neighbour.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 742cea4ce72e..0dfc97bc8760 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1124,6 +1124,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); + neigh_del_timer(neigh); neigh->nud_state = NUD_INCOMPLETE; neigh->updated = now; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), @@ -1140,6 +1141,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) } } else if (neigh->nud_state & NUD_STALE) { neigh_dbg(2, "neigh %p is delayed\n", neigh); + neigh_del_timer(neigh); neigh->nud_state = NUD_DELAY; neigh->updated = jiffies; neigh_add_timer(neigh, jiffies + -- cgit v1.2.3 From b43995469e5804636a55372e9bbb17ccb22441c5 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 15 Jul 2019 09:39:52 -0700 Subject: bpf: rename bpf_ctx_wide_store_ok to bpf_ctx_wide_access_ok Rename bpf_ctx_wide_store_ok to bpf_ctx_wide_access_ok to indicate that it can be used for both loads and stores. 
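For illustration only (not part of this series): a minimal sketch of the kind of cgroup sock_addr program that ends up issuing such wide context accesses. The section name, program body, and the manual SEC() definition are assumptions; the wide stores were already accepted, and the wide reads are what the next patch in this series permits.

#include <linux/types.h>
#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

SEC("cgroup/connect6")
int rewrite_connect6(struct bpf_sock_addr *ctx)
{
	/* 8-byte (wide) reads of the IPv6 destination held in the
	 * context...
	 */
	__u64 hi = *(__u64 *)&ctx->user_ip6[0];
	__u64 lo = *(__u64 *)&ctx->user_ip6[2];

	/* ...and 8-byte (wide) writes back into it; both directions
	 * are what bpf_ctx_wide_access_ok() has to accept.
	 */
	*(__u64 *)&ctx->user_ip6[0] = hi;
	*(__u64 *)&ctx->user_ip6[2] = lo;

	return 1;	/* allow the connect() */
}
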
Cc: Yonghong Song Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 2 +- net/core/filter.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/filter.h b/include/linux/filter.h index 6d944369ca87..ff65d22cf336 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -747,7 +747,7 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) return size <= size_default && (size & (size - 1)) == 0; } -#define bpf_ctx_wide_store_ok(off, size, type, field) \ +#define bpf_ctx_wide_access_ok(off, size, type, field) \ (size == sizeof(__u64) && \ off >= offsetof(type, field) && \ off + sizeof(__u64) <= offsetofend(type, field) && \ diff --git a/net/core/filter.c b/net/core/filter.c index 47f6386fb17a..c5983ddb1a9f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6890,14 +6890,14 @@ static bool sock_addr_is_valid_access(int off, int size, if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { - if (bpf_ctx_wide_store_ok(off, size, - struct bpf_sock_addr, - user_ip6)) + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + user_ip6)) return true; - if (bpf_ctx_wide_store_ok(off, size, - struct bpf_sock_addr, - msg_src_ip6)) + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + msg_src_ip6)) return true; if (size != size_default) -- cgit v1.2.3 From d4ecfeb15494ec261fef2d25d96eecba66f0b182 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 15 Jul 2019 09:39:53 -0700 Subject: bpf: allow wide aligned loads for bpf_sock_addr user_ip6 and msg_src_ip6 Add explicit check for u64 loads of user_ip6 and msg_src_ip6 and update the comment. Cc: Yonghong Song Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 4 ++-- net/core/filter.c | 12 +++++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6f68438aa4ed..81be929b89fc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3248,7 +3248,7 @@ struct bpf_sock_addr { __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ - __u32 user_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. + __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ __u32 user_port; /* Allows 4-byte read and write. @@ -3260,7 +3260,7 @@ struct bpf_sock_addr { __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order. */ - __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read and 4,8-byte write. + __u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ __bpf_md_ptr(struct bpf_sock *, sk); diff --git a/net/core/filter.c b/net/core/filter.c index c5983ddb1a9f..0f6854ccf894 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6884,9 +6884,19 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): - /* Only narrow read access allowed for now. 
*/ if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); + + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + user_ip6)) + return true; + + if (bpf_ctx_wide_access_ok(off, size, + struct bpf_sock_addr, + msg_src_ip6)) + return true; + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { -- cgit v1.2.3 From db8051f30fbab7f579d691137f1e23f3bb1ac2eb Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 16 Jul 2019 11:43:05 -0400 Subject: skbuff: fix compilation warnings in skb_dump() The commit 6413139dfc64 ("skbuff: increase verbosity when dumping skb data") introduced a few compilation warnings. net/core/skbuff.c:766:32: warning: format specifies type 'unsigned short' but the argument has type 'unsigned int' [-Wformat] level, sk->sk_family, sk->sk_type, sk->sk_protocol); ^~~~~~~~~~~ net/core/skbuff.c:766:45: warning: format specifies type 'unsigned short' but the argument has type 'unsigned int' [-Wformat] level, sk->sk_family, sk->sk_type, sk->sk_protocol); ^~~~~~~~~~~~~~~ Fix them by using the proper types. Fixes: 6413139dfc64 ("skbuff: increase verbosity when dumping skb data") Signed-off-by: Qian Cai Reviewed-by: Nathan Chancellor Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6f1e31f674a3..0338820ee0ec 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -762,7 +762,7 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt) printk("%sdev name=%s feat=0x%pNF\n", level, dev->name, &dev->features); if (sk) - printk("%ssk family=%hu type=%hu proto=%hu\n", + printk("%ssk family=%hu type=%u proto=%u\n", level, sk->sk_family, sk->sk_type, sk->sk_protocol); if (full_pkt && headroom) -- cgit v1.2.3 From a5b647007e9d794956dbed9339a3354a9fc4d5c3 Mon Sep 17 00:00:00 2001 From: Vedang Patel Date: Tue, 16 Jul 2019 12:52:18 -0700 Subject: fix: taprio: Change type of txtime-delay parameter to u32 During the review of the iproute2 patches for txtime-assist mode, it was pointed out that it does not make sense for the txtime-delay parameter to be negative. So, change the type of the parameter from s32 to u32. Fixes: 4cfd5779bd6e ("taprio: Add support for txtime-assist mode") Reported-by: Stephen Hemminger Signed-off-by: Vedang Patel Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_sched.h | 2 +- net/sched/sch_taprio.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 1f623252abe8..18f185299f47 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1174,7 +1174,7 @@ enum { TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */ TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */ TCA_TAPRIO_ATTR_FLAGS, /* u32 */ - TCA_TAPRIO_ATTR_TXTIME_DELAY, /* s32 */ + TCA_TAPRIO_ATTR_TXTIME_DELAY, /* u32 */ __TCA_TAPRIO_ATTR_MAX, }; diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 388750ddc57a..c39db507ba3f 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -75,7 +75,7 @@ struct taprio_sched { struct sched_gate_list __rcu *admin_sched; struct hrtimer advance_timer; struct list_head taprio_list; - int txtime_delay; + u32 txtime_delay; }; static ktime_t sched_base_time(const struct sched_gate_list *sched) @@ -1113,7 +1113,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, goto unlock; } - q->txtime_delay = nla_get_s32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]); + q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]); } if (!TXTIME_ASSIST_IS_ENABLED(taprio_flags) && @@ -1430,7 +1430,7 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) goto options_error; if (q->txtime_delay && - nla_put_s32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) + nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) goto options_error; if (oper && dump_schedule(skb, oper)) -- cgit v1.2.3 From 86fda90ab5888d983f7307442ba62978e3504bd3 Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Tue, 16 Jul 2019 07:50:02 +0530 Subject: net: sctp: fix warning "NULL check before some freeing functions is not needed" This patch removes NULL checks before calling kfree. fixes below issues reported by coccicheck net/sctp/sm_make_chunk.c:2586:3-8: WARNING: NULL check before some freeing functions is not needed. net/sctp/sm_make_chunk.c:2652:3-8: WARNING: NULL check before some freeing functions is not needed. net/sctp/sm_make_chunk.c:2667:3-8: WARNING: NULL check before some freeing functions is not needed. net/sctp/sm_make_chunk.c:2684:3-8: WARNING: NULL check before some freeing functions is not needed. Signed-off-by: Hariprasad Kelam Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. 
Miller --- net/sctp/sm_make_chunk.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index ed39396b9bba..36bd8a6e82df 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2582,8 +2582,7 @@ do_addr_param: case SCTP_PARAM_STATE_COOKIE: asoc->peer.cookie_len = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); - if (asoc->peer.cookie) - kfree(asoc->peer.cookie); + kfree(asoc->peer.cookie); asoc->peer.cookie = kmemdup(param.cookie->body, asoc->peer.cookie_len, gfp); if (!asoc->peer.cookie) retval = 0; @@ -2648,8 +2647,7 @@ do_addr_param: goto fall_through; /* Save peer's random parameter */ - if (asoc->peer.peer_random) - kfree(asoc->peer.peer_random); + kfree(asoc->peer.peer_random); asoc->peer.peer_random = kmemdup(param.p, ntohs(param.p->length), gfp); if (!asoc->peer.peer_random) { @@ -2663,8 +2661,7 @@ do_addr_param: goto fall_through; /* Save peer's HMAC list */ - if (asoc->peer.peer_hmacs) - kfree(asoc->peer.peer_hmacs); + kfree(asoc->peer.peer_hmacs); asoc->peer.peer_hmacs = kmemdup(param.p, ntohs(param.p->length), gfp); if (!asoc->peer.peer_hmacs) { @@ -2680,8 +2677,7 @@ do_addr_param: if (!ep->auth_enable) goto fall_through; - if (asoc->peer.peer_chunks) - kfree(asoc->peer.peer_chunks); + kfree(asoc->peer.peer_chunks); asoc->peer.peer_chunks = kmemdup(param.p, ntohs(param.p->length), gfp); if (!asoc->peer.peer_chunks) -- cgit v1.2.3 From f11fe1dae1c46762a9a7b0dd142b6bfebe7783ff Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 16 Jul 2019 15:16:02 +0800 Subject: net/sched: Make NET_ACT_CT depends on NF_NAT If NF_NAT is m and NET_ACT_CT is y, build fails: net/sched/act_ct.o: In function `tcf_ct_act': act_ct.c:(.text+0x21ac): undefined reference to `nf_ct_nat_ext_add' act_ct.c:(.text+0x229a): undefined reference to `nf_nat_icmp_reply_translation' act_ct.c:(.text+0x233a): undefined reference to `nf_nat_setup_info' act_ct.c:(.text+0x234a): undefined reference to `nf_nat_alloc_null_binding' act_ct.c:(.text+0x237c): undefined reference to `nf_nat_packet' Reported-by: Hulk Robot Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct") Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/sched/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index dd55b9ac3a66..afd2ba157a13 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -942,7 +942,7 @@ config NET_ACT_TUNNEL_KEY config NET_ACT_CT tristate "connection tracking tc action" - depends on NET_CLS_ACT && NF_CONNTRACK + depends on NET_CLS_ACT && NF_CONNTRACK && NF_NAT help Say Y here to allow sending the packets to conntrack module. -- cgit v1.2.3 From 2c7da8e6b041a8df2661def81ac90c9c0c719909 Mon Sep 17 00:00:00 2001 From: Gerd Rausch Date: Tue, 16 Jul 2019 15:28:51 -0700 Subject: net/rds: Give fr_state a chance to transition to FRMR_IS_FREE In the context of FRMR (ib_frmr.c): Memory regions make it onto the "clean_list" via "rds_ib_flush_mr_pool", after the memory region has been posted for invalidation via "rds_ib_post_inv". At that point in time, "fr_state" may still be in state "FRMR_IS_INUSE", since the only place where "fr_state" transitions to "FRMR_IS_FREE" is in "rds_ib_mr_cqe_handler", which is triggered by a tasklet. So in case we notice that "fr_state != FRMR_IS_FREE" (see below), we wait for "fr_inv_done" to trigger with a maximum of 10msec. 
Then we check again, and only put the memory region onto the drop_list (via "rds_ib_free_frmr") in case the situation remains unchanged. This avoids the problem of memory-regions bouncing between "clean_list" and "drop_list" before they even have a chance to be properly invalidated. Signed-off-by: Gerd Rausch Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_frmr.c | 27 ++++++++++++++++++++++++++- net/rds/ib_mr.h | 1 + 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 32ae26ed58a0..6038138d6e38 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -75,6 +75,7 @@ static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev, pool->max_items_soft = pool->max_items; frmr->fr_state = FRMR_IS_FREE; + init_waitqueue_head(&frmr->fr_inv_done); return ibmr; out_no_cigar: @@ -285,6 +286,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) if (frmr->fr_inv) { frmr->fr_state = FRMR_IS_FREE; frmr->fr_inv = false; + wake_up(&frmr->fr_inv_done); } atomic_inc(&ic->i_fastreg_wrs); @@ -345,8 +347,31 @@ struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev, } do { - if (ibmr) + if (ibmr) { + /* Memory regions make it onto the "clean_list" via + * "rds_ib_flush_mr_pool", after the memory region has + * been posted for invalidation via "rds_ib_post_inv". + * + * At that point in time, "fr_state" may still be + * in state "FRMR_IS_INUSE", since the only place where + * "fr_state" transitions to "FRMR_IS_FREE" is in + * is in "rds_ib_mr_cqe_handler", which is + * triggered by a tasklet. + * + * So we wait for "fr_inv_done" to trigger + * and only put memory regions onto the drop_list + * that failed (i.e. not marked "FRMR_IS_FREE"). + * + * This avoids the problem of memory-regions bouncing + * between "clean_list" and "drop_list" before they + * even have a chance to be properly invalidated. + */ + frmr = &ibmr->u.frmr; + wait_event(frmr->fr_inv_done, frmr->fr_state != FRMR_IS_INUSE); + if (frmr->fr_state == FRMR_IS_FREE) + break; rds_ib_free_frmr(ibmr, true); + } ibmr = rds_ib_alloc_frmr(rds_ibdev, nents); if (IS_ERR(ibmr)) return ibmr; diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index 5da12c248431..42daccb7b5eb 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -57,6 +57,7 @@ struct rds_ib_frmr { struct ib_mr *mr; enum rds_ib_fr_state fr_state; bool fr_inv; + wait_queue_head_t fr_inv_done; struct ib_send_wr fr_wr; unsigned int dma_npages; unsigned int sg_byte_len; -- cgit v1.2.3 From c9467447fc50ec3715d8ec98f4da874fce539235 Mon Sep 17 00:00:00 2001 From: Gerd Rausch Date: Tue, 16 Jul 2019 15:28:57 -0700 Subject: net/rds: Get rid of "wait_clean_list_grace" and add locking Waiting for activity on the "clean_list" to quiesce is no substitute for proper locking. We can have multiple threads competing for "llist_del_first" via "rds_ib_reuse_mr", and a single thread competing for "llist_del_all" and "llist_del_first" via "rds_ib_flush_mr_pool". Since "llist_del_first" depends on "list->first->next" not to change in the midst of the operation, simply waiting for all current calls to "rds_ib_reuse_mr" to quiesce across all CPUs is woefully inadequate: By the time "wait_clean_list_grace" is done iterating over all CPUs to see that there is no concurrent caller to "rds_ib_reuse_mr", a new caller may have just shown up on the first CPU. 
Furthermore, <linux/llist.h> explicitly calls out the need for locking: * Cases where locking is needed: * If we have multiple consumers with llist_del_first used in one consumer, * and llist_del_first or llist_del_all used in other consumers, * then a lock is needed. Also, while at it, drop the unused "pool" parameter from "list_to_llist_nodes". Signed-off-by: Gerd Rausch Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_mr.h | 1 + net/rds/ib_rdma.c | 56 ++++++++++++++++++------------------------------------- 2 files changed, 19 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index 42daccb7b5eb..ab26c20ed66f 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -98,6 +98,7 @@ struct rds_ib_mr_pool { struct llist_head free_list; /* unused MRs */ struct llist_head clean_list; /* unused & unmapped MRs */ wait_queue_head_t flush_wait; + spinlock_t clean_lock; /* "clean_list" concurrency */ atomic_t free_pinned; /* memory pinned by free MRs */ unsigned long max_items; diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 0b347f46b2f4..6b047e63a769 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -40,9 +40,6 @@ struct workqueue_struct *rds_ib_mr_wq; -static DEFINE_PER_CPU(unsigned long, clean_list_grace); -#define CLEAN_LIST_BUSY_BIT 0 - static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) { struct rds_ib_device *rds_ibdev; @@ -195,12 +192,11 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; struct llist_node *ret; - unsigned long *flag; + unsigned long flags; - preempt_disable(); - flag = this_cpu_ptr(&clean_list_grace); - set_bit(CLEAN_LIST_BUSY_BIT, flag); + spin_lock_irqsave(&pool->clean_lock, flags); ret = llist_del_first(&pool->clean_list); + spin_unlock_irqrestore(&pool->clean_lock, flags); if (ret) { ibmr = llist_entry(ret, struct rds_ib_mr, llnode); if (pool->pool_type == RDS_IB_MR_8K_POOL) @@ -209,23 +205,9 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) rds_ib_stats_inc(s_ib_rdma_mr_1m_reused); } - clear_bit(CLEAN_LIST_BUSY_BIT, flag); - preempt_enable(); return ibmr; } -static inline void wait_clean_list_grace(void) -{ - int cpu; - unsigned long *flag; - - for_each_online_cpu(cpu) { - flag = &per_cpu(clean_list_grace, cpu); - while (test_bit(CLEAN_LIST_BUSY_BIT, flag)) - cpu_relax(); - } -} - void rds_ib_sync_mr(void *trans_private, int direction) { struct rds_ib_mr *ibmr = trans_private; @@ -324,8 +306,7 @@ static unsigned int llist_append_to_list(struct llist_head *llist, * of clusters. Each cluster has linked llist nodes of * MR_CLUSTER_SIZE mrs that are ready for reuse. 
*/ -static void list_to_llist_nodes(struct rds_ib_mr_pool *pool, - struct list_head *list, +static void list_to_llist_nodes(struct list_head *list, struct llist_node **nodes_head, struct llist_node **nodes_tail) { @@ -402,8 +383,13 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, */ dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list); dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list); - if (free_all) + if (free_all) { + unsigned long flags; + + spin_lock_irqsave(&pool->clean_lock, flags); llist_append_to_list(&pool->clean_list, &unmap_list); + spin_unlock_irqrestore(&pool->clean_lock, flags); + } free_goal = rds_ib_flush_goal(pool, free_all); @@ -416,27 +402,20 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal); if (!list_empty(&unmap_list)) { - /* we have to make sure that none of the things we're about - * to put on the clean list would race with other cpus trying - * to pull items off. The llist would explode if we managed to - * remove something from the clean list and then add it back again - * while another CPU was spinning on that same item in llist_del_first. - * - * This is pretty unlikely, but just in case wait for an llist grace period - * here before adding anything back into the clean list. - */ - wait_clean_list_grace(); - - list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail); + unsigned long flags; + + list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail); if (ibmr_ret) { *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode); clean_nodes = clean_nodes->next; } /* more than one entry in llist nodes */ - if (clean_nodes) + if (clean_nodes) { + spin_lock_irqsave(&pool->clean_lock, flags); llist_add_batch(clean_nodes, clean_tail, &pool->clean_list); - + spin_unlock_irqrestore(&pool->clean_lock, flags); + } } atomic_sub(unpinned, &pool->free_pinned); @@ -610,6 +589,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, init_llist_head(&pool->free_list); init_llist_head(&pool->drop_list); init_llist_head(&pool->clean_list); + spin_lock_init(&pool->clean_lock); mutex_init(&pool->flush_lock); init_waitqueue_head(&pool->flush_wait); INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); -- cgit v1.2.3 From 5f33141d2fc05a2d2134ba0e7b47ce4aa88340f0 Mon Sep 17 00:00:00 2001 From: Gerd Rausch Date: Tue, 16 Jul 2019 15:29:02 -0700 Subject: net/rds: Wait for the FRMR_IS_FREE (or FRMR_IS_STALE) transition after posting IB_WR_LOCAL_INV In order to: 1) avoid a silly bouncing between "clean_list" and "drop_list" triggered by function "rds_ib_reg_frmr" as it is releases frmr regions whose state is not "FRMR_IS_FREE" right away. 2) prevent an invalid access error in a race from a pending "IB_WR_LOCAL_INV" operation with a teardown ("dma_unmap_sg", "put_page") and de-registration ("ib_dereg_mr") of the corresponding memory region. Signed-off-by: Gerd Rausch Acked-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/ib_frmr.c | 65 ++++++++++++++++++++++++++++++++----------------------- net/rds/ib_mr.h | 2 ++ 2 files changed, 40 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 6038138d6e38..708c553c3da5 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -76,6 +76,7 @@ static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev, frmr->fr_state = FRMR_IS_FREE; init_waitqueue_head(&frmr->fr_inv_done); + init_waitqueue_head(&frmr->fr_reg_done); return ibmr; out_no_cigar: @@ -124,6 +125,7 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) */ ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++); frmr->fr_state = FRMR_IS_INUSE; + frmr->fr_reg = true; memset(®_wr, 0, sizeof(reg_wr)); reg_wr.wr.wr_id = (unsigned long)(void *)ibmr; @@ -144,7 +146,17 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) if (printk_ratelimit()) pr_warn("RDS/IB: %s returned error(%d)\n", __func__, ret); + goto out; } + + /* Wait for the registration to complete in order to prevent an invalid + * access error resulting from a race between the memory region already + * being accessed while registration is still pending. + */ + wait_event(frmr->fr_reg_done, !frmr->fr_reg); + +out: + return ret; } @@ -262,6 +274,19 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr) pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret); goto out; } + + /* Wait for the FRMR_IS_FREE (or FRMR_IS_STALE) transition in order to + * 1) avoid a silly bouncing between "clean_list" and "drop_list" + * triggered by function "rds_ib_reg_frmr" as it is releases frmr + * regions whose state is not "FRMR_IS_FREE" right away. + * 2) prevents an invalid access error in a race + * from a pending "IB_WR_LOCAL_INV" operation + * with a teardown ("dma_unmap_sg", "put_page") + * and de-registration ("ib_dereg_mr") of the corresponding + * memory region. + */ + wait_event(frmr->fr_inv_done, frmr->fr_state != FRMR_IS_INUSE); + out: return ret; } @@ -289,6 +314,11 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) wake_up(&frmr->fr_inv_done); } + if (frmr->fr_reg) { + frmr->fr_reg = false; + wake_up(&frmr->fr_reg_done); + } + atomic_inc(&ic->i_fastreg_wrs); } @@ -297,14 +327,18 @@ void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed, { struct rds_ib_mr *ibmr, *next; struct rds_ib_frmr *frmr; - int ret = 0; + int ret = 0, ret2; unsigned int freed = *nfreed; /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ list_for_each_entry(ibmr, list, unmap_list) { - if (ibmr->sg_dma_len) - ret |= rds_ib_post_inv(ibmr); + if (ibmr->sg_dma_len) { + ret2 = rds_ib_post_inv(ibmr); + if (ret2 && !ret) + ret = ret2; + } } + if (ret) pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, ret); @@ -347,31 +381,8 @@ struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev, } do { - if (ibmr) { - /* Memory regions make it onto the "clean_list" via - * "rds_ib_flush_mr_pool", after the memory region has - * been posted for invalidation via "rds_ib_post_inv". - * - * At that point in time, "fr_state" may still be - * in state "FRMR_IS_INUSE", since the only place where - * "fr_state" transitions to "FRMR_IS_FREE" is in - * is in "rds_ib_mr_cqe_handler", which is - * triggered by a tasklet. - * - * So we wait for "fr_inv_done" to trigger - * and only put memory regions onto the drop_list - * that failed (i.e. not marked "FRMR_IS_FREE"). 
- * - * This avoids the problem of memory-regions bouncing - * between "clean_list" and "drop_list" before they - * even have a chance to be properly invalidated. - */ - frmr = &ibmr->u.frmr; - wait_event(frmr->fr_inv_done, frmr->fr_state != FRMR_IS_INUSE); - if (frmr->fr_state == FRMR_IS_FREE) - break; + if (ibmr) rds_ib_free_frmr(ibmr, true); - } ibmr = rds_ib_alloc_frmr(rds_ibdev, nents); if (IS_ERR(ibmr)) return ibmr; diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index ab26c20ed66f..9045a8c0edff 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -58,6 +58,8 @@ struct rds_ib_frmr { enum rds_ib_fr_state fr_state; bool fr_inv; wait_queue_head_t fr_inv_done; + bool fr_reg; + wait_queue_head_t fr_reg_done; struct ib_send_wr fr_wr; unsigned int dma_npages; unsigned int sg_byte_len; -- cgit v1.2.3 From aea01a2234d26ffa9d9ee01e43705824c0c7b08a Mon Sep 17 00:00:00 2001 From: Gerd Rausch Date: Tue, 16 Jul 2019 15:29:07 -0700 Subject: net/rds: Fix NULL/ERR_PTR inconsistency Make function "rds_ib_try_reuse_ibmr" return NULL in case memory region could not be allocated, since callers simply check if the return value is not NULL. Signed-off-by: Gerd Rausch Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 6b047e63a769..c8c1e3ae8d84 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -450,7 +450,7 @@ struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted); else rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted); - return ERR_PTR(-EAGAIN); + break; } /* We do have some empty MRs. Flush them out. */ @@ -464,7 +464,7 @@ struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool) return ibmr; } - return ibmr; + return NULL; } static void rds_ib_mr_pool_flush_worker(struct work_struct *work) -- cgit v1.2.3 From 9547dff1085d5935d6070377023096821033e30c Mon Sep 17 00:00:00 2001 From: Gerd Rausch Date: Tue, 16 Jul 2019 15:29:12 -0700 Subject: net/rds: Set fr_state only to FRMR_IS_FREE if IB_WR_LOCAL_INV had been successful Fix a bug where fr_state first goes to FRMR_IS_STALE, because of a failure of operation IB_WR_LOCAL_INV, but then gets set back to "FRMR_IS_FREE" uncoditionally, even though the operation failed. Signed-off-by: Gerd Rausch Acked-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/ib_frmr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 708c553c3da5..adaa8e99e5a9 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -309,7 +309,8 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) } if (frmr->fr_inv) { - frmr->fr_state = FRMR_IS_FREE; + if (frmr->fr_state == FRMR_IS_INUSE) + frmr->fr_state = FRMR_IS_FREE; frmr->fr_inv = false; wake_up(&frmr->fr_inv_done); } -- cgit v1.2.3 From 3a2886cca703fde5ee21baea9fedf8b1389c59d7 Mon Sep 17 00:00:00 2001 From: Gerd Rausch Date: Tue, 16 Jul 2019 15:29:17 -0700 Subject: net/rds: Keep track of and wait for FRWR segments in use upon shutdown Since "rds_ib_free_frmr" and "rds_ib_free_frmr_list" simply put the FRMR memory segments on the "drop_list" or "free_list", and it is the job of "rds_ib_flush_mr_pool" to reap those entries by ultimately issuing a "IB_WR_LOCAL_INV" work-request, we need to trigger and then wait for all those memory segments attached to a particular connection to be fully released before we can move on to release the QP, CQ, etc. So we make "rds_ib_conn_path_shutdown" wait for one more atomic_t called "i_fastreg_inuse_count" that keeps track of how many FRWR memory segments are out there marked "FRMR_IS_INUSE" (and also wake_up rds_ib_ring_empty_wait, as they go away). Signed-off-by: Gerd Rausch Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib.h | 1 + net/rds/ib_cm.c | 7 +++++++ net/rds/ib_frmr.c | 43 +++++++++++++++++++++++++++++++++++++------ 3 files changed, 45 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index 66c03c7665b2..303c6ee8bdb7 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -156,6 +156,7 @@ struct rds_ib_connection { /* To control the number of wrs from fastreg */ atomic_t i_fastreg_wrs; + atomic_t i_fastreg_inuse_count; /* interrupt handling */ struct tasklet_struct i_send_tasklet; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 8891822eba4f..1b6fd6c8b12b 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -40,6 +40,7 @@ #include "rds_single_path.h" #include "rds.h" #include "ib.h" +#include "ib_mr.h" /* * Set the selected protocol version @@ -993,6 +994,11 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) ic->i_cm_id, err); } + /* kick off "flush_worker" for all pools in order to reap + * all FRMR registrations that are still marked "FRMR_IS_INUSE" + */ + rds_ib_flush_mrs(); + /* * We want to wait for tx and rx completion to finish * before we tear down the connection, but we have to be @@ -1005,6 +1011,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp) wait_event(rds_ib_ring_empty_wait, rds_ib_ring_empty(&ic->i_recv_ring) && (atomic_read(&ic->i_signaled_sends) == 0) && + (atomic_read(&ic->i_fastreg_inuse_count) == 0) && (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR)); tasklet_kill(&ic->i_send_tasklet); tasklet_kill(&ic->i_recv_tasklet); diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index adaa8e99e5a9..06ecf9d2d4bf 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -32,6 +32,24 @@ #include "ib_mr.h" +static inline void +rds_transition_frwr_state(struct rds_ib_mr *ibmr, + enum rds_ib_fr_state old_state, + enum rds_ib_fr_state new_state) +{ + if (cmpxchg(&ibmr->u.frmr.fr_state, + old_state, new_state) == old_state && + old_state == FRMR_IS_INUSE) { + /* enforce order of ibmr->u.frmr.fr_state update + * before decrementing i_fastreg_inuse_count + */ + 
smp_mb__before_atomic(); + atomic_dec(&ibmr->ic->i_fastreg_inuse_count); + if (waitqueue_active(&rds_ib_ring_empty_wait)) + wake_up(&rds_ib_ring_empty_wait); + } +} + static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev, int npages) { @@ -118,13 +136,18 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) if (unlikely(ret != ibmr->sg_len)) return ret < 0 ? ret : -EINVAL; + if (cmpxchg(&frmr->fr_state, + FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE) + return -EBUSY; + + atomic_inc(&ibmr->ic->i_fastreg_inuse_count); + /* Perform a WR for the fast_reg_mr. Each individual page * in the sg list is added to the fast reg page list and placed * inside the fast_reg_mr WR. The key used is a rolling 8bit * counter, which should guarantee uniqueness. */ ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++); - frmr->fr_state = FRMR_IS_INUSE; frmr->fr_reg = true; memset(®_wr, 0, sizeof(reg_wr)); @@ -141,7 +164,8 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) ret = ib_post_send(ibmr->ic->i_cm_id->qp, ®_wr.wr, NULL); if (unlikely(ret)) { /* Failure here can be because of -ENOMEM as well */ - frmr->fr_state = FRMR_IS_STALE; + rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE); + atomic_inc(&ibmr->ic->i_fastreg_wrs); if (printk_ratelimit()) pr_warn("RDS/IB: %s returned error(%d)\n", @@ -268,8 +292,12 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr) ret = ib_post_send(i_cm_id->qp, s_wr, NULL); if (unlikely(ret)) { - frmr->fr_state = FRMR_IS_STALE; + rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE); frmr->fr_inv = false; + /* enforce order of frmr->fr_inv update + * before incrementing i_fastreg_wrs + */ + smp_mb__before_atomic(); atomic_inc(&ibmr->ic->i_fastreg_wrs); pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret); goto out; @@ -297,7 +325,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) struct rds_ib_frmr *frmr = &ibmr->u.frmr; if (wc->status != IB_WC_SUCCESS) { - frmr->fr_state = FRMR_IS_STALE; + rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE); if (rds_conn_up(ic->conn)) rds_ib_conn_error(ic->conn, "frmr completion <%pI4,%pI4> status %u(%s), vendor_err 0x%x, disconnecting and reconnecting\n", @@ -309,8 +337,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) } if (frmr->fr_inv) { - if (frmr->fr_state == FRMR_IS_INUSE) - frmr->fr_state = FRMR_IS_FREE; + rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_FREE); frmr->fr_inv = false; wake_up(&frmr->fr_inv_done); } @@ -320,6 +347,10 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) wake_up(&frmr->fr_reg_done); } + /* enforce order of frmr->{fr_reg,fr_inv} update + * before incrementing i_fastreg_wrs + */ + smp_mb__before_atomic(); atomic_inc(&ic->i_fastreg_wrs); } -- cgit v1.2.3 From aa4948937b7d5b4a8e6553f4938f8431b2fd783a Mon Sep 17 00:00:00 2001 From: Gerd Rausch Date: Tue, 16 Jul 2019 15:29:23 -0700 Subject: net/rds: Initialize ic->i_fastreg_wrs upon allocation Otherwise, if an IB connection is torn down before "rds_ib_setup_qp" is called, the value of "ic->i_fastreg_wrs" is still at zero (as it wasn't initialized by "rds_ib_setup_qp"). Consequently "rds_ib_conn_path_shutdown" will spin forever, waiting for it to go back to "RDS_IB_DEFAULT_FR_WR", which of course will never happen as there are no outstanding work requests. Signed-off-by: Gerd Rausch Acked-by: Santosh Shilimkar Signed-off-by: David S. 
Miller --- net/rds/ib_cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 1b6fd6c8b12b..4de0214da63c 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -527,7 +527,6 @@ static int rds_ib_setup_qp(struct rds_connection *conn) attr.qp_type = IB_QPT_RC; attr.send_cq = ic->i_send_cq; attr.recv_cq = ic->i_recv_cq; - atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR); /* * XXX this can fail if max_*_wr is too large? Are we supposed @@ -1139,6 +1138,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) spin_lock_init(&ic->i_ack_lock); #endif atomic_set(&ic->i_signaled_sends, 0); + atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR); /* * rds_ib_conn_shutdown() waits for these to be emptied so they -- cgit v1.2.3 From 3f05e6886a595c9a29a309c52f45326be917823c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 16 Jul 2019 13:57:30 -0700 Subject: net_sched: unset TCQ_F_CAN_BYPASS when adding filters For qdisc's that support TC filters and set TCQ_F_CAN_BYPASS, notably fq_codel, it makes no sense to let packets bypass the TC filters we setup in any scenario, otherwise our packets steering policy could not be enforced. This can be reproduced easily with the following script: ip li add dev dummy0 type dummy ifconfig dummy0 up tc qd add dev dummy0 root fq_codel tc filter add dev dummy0 parent 8001: protocol arp basic action mirred egress redirect dev lo tc filter add dev dummy0 parent 8001: protocol ip basic action mirred egress redirect dev lo ping -I dummy0 192.168.112.1 Without this patch, packets are sent directly to dummy0 without hitting any of the filters. With this patch, packets are redirected to loopback as expected. This fix is not perfect, it only unsets the flag but does not set it back because we have to save the information somewhere in the qdisc if we really want that. Note, both fq_codel and sfq clear this flag in their ->bind_tcf() but this is clearly not sufficient when we don't use any class ID. Fixes: 23624935e0c4 ("net_sched: TCQ_F_CAN_BYPASS generalization") Cc: Eric Dumazet Signed-off-by: Cong Wang Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/sched/cls_api.c | 1 + net/sched/sch_fq_codel.c | 2 -- net/sched/sch_sfq.c | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 278014e26aec..d144233423c5 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -2152,6 +2152,7 @@ replay: tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, false, rtnl_held); tfilter_put(tp, fh); + q->flags &= ~TCQ_F_CAN_BYPASS; } errout: diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index e2faf33d282b..d59fbcc745d1 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -596,8 +596,6 @@ static unsigned long fq_codel_find(struct Qdisc *sch, u32 classid) static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { - /* we cannot bypass queue discipline anymore */ - sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 420bd8411677..68404a9d2ce4 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -824,8 +824,6 @@ static unsigned long sfq_find(struct Qdisc *sch, u32 classid) static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { - /* we cannot bypass queue discipline anymore */ - sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } -- cgit v1.2.3 From 66f8209547cc11d8e139d45cb7c937c1bbcce182 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 17 Jul 2019 14:41:58 -0700 Subject: fib: relax source validation check for loopback packets In a rare case where we redirect local packets from veth to lo, these packets fail to pass the source validation when rp_filter is turned on, as the tracing shows: <...>-311708 [040] ..s1 7951180.957825: fib_table_lookup: table 254 oif 0 iif 1 src 10.53.180.130 dst 10.53.180.130 tos 0 scope 0 flags 0 <...>-311708 [040] ..s1 7951180.957826: fib_table_lookup_nh: nexthop dev eth0 oif 4 src 10.53.180.130 So, the fib table lookup returns eth0 as the nexthop even though the packets are local and should be routed to loopback nonetheless, but they can't pass the dev match check in fib_info_nh_uses_dev() without this patch. It should be safe to relax this check for this special case, as normally packets coming out of loopback device still have skb_dst so they won't even hit this slow path. Cc: Julian Anastasov Cc: David Ahern Signed-off-by: Cong Wang Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 317339cd7f03..e8bc939b56dd 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -388,6 +388,11 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fib_combine_itag(itag, &res); dev_match = fib_info_nh_uses_dev(res.fi, dev); + /* This is not common, loopback packets retain skb_dst so normally they + * would not even hit this slow path. + */ + dev_match = dev_match || (res.type == RTN_LOCAL && + dev == net->loopback_dev); if (dev_match) { ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; return ret; -- cgit v1.2.3 From 866e5fd8a7123444d865340ff21c1673f74cdecd Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 17 Jul 2019 23:43:44 +0200 Subject: tipc: initialize 'validated' field of received packets The tipc_msg_validate() function leaves a boolean flag 'validated' in the validated buffer's control block, to avoid performing this action more than once. 
However, at reception of new packets, the position of this field may already have been set by lower layer protocols, so that the packet is erroneously perceived as already validated by TIPC. We fix this by initializing the said field to 'false' before performing the initial validation. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/node.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 324a1f91b394..3a5be1d7e572 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1807,6 +1807,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) __skb_queue_head_init(&xmitq); /* Ensure message is well-formed before touching the header */ + TIPC_SKB_CB(skb)->validated = false; if (unlikely(!tipc_msg_validate(&skb))) goto discard; hdr = buf_msg(skb); -- cgit v1.2.3 From 49d05fe2c9d1b4a27761c9807fec39b8155bef9e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 17 Jul 2019 15:08:43 -0700 Subject: ipv6: rt6_check should return NULL if 'from' is NULL Paul reported that l2tp sessions were broken after the commit referenced in the Fixes tag. Prior to this commit rt6_check returned NULL if the rt6_info 'from' was NULL - ie., the dst_entry was disconnected from a FIB entry. Restore that behavior. Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes") Reported-by: Paul Donohue Tested-by: Paul Donohue Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4d2e6b31a8d6..6fe3097b9ab7 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2563,7 +2563,7 @@ static struct dst_entry *rt6_check(struct rt6_info *rt, { u32 rt_cookie = 0; - if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || + if (!from || !fib6_get_cookie_safe(from, &rt_cookie) || rt_cookie != cookie) return NULL; -- cgit v1.2.3 From 666a3d6e1e6b78df34f59e6c0b8907aa3c8dbb2e Mon Sep 17 00:00:00 2001 From: Su Yanjun Date: Thu, 18 Jul 2019 10:19:23 +0800 Subject: udp: Fix typo in net/ipv4/udp.c Signed-off-by: Su Yanjun Signed-off-by: David S. Miller --- net/ipv4/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c21862ba9c02..d88821c794fb 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2170,7 +2170,7 @@ start_lookup: /* Initialize UDP checksum. If exited with zero value (success), * CHECKSUM_UNNECESSARY means, that no more checks are required. - * Otherwise, csum completion requires chacksumming packet body, + * Otherwise, csum completion requires checksumming packet body, * including udp header and folding it to skb->csum. */ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, -- cgit v1.2.3 From 54851aa90cf27041d64b12f65ac72e9f97bd90fd Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 17 Jul 2019 23:39:33 +0300 Subject: ipv6: Unlink sibling route in case of failure When a route needs to be appended to an existing multipath route, fib6_add_rt2node() first appends it to the siblings list and increments the number of sibling routes on each sibling. Later, the function notifies the route via call_fib6_entry_notifiers(). In case the notification is vetoed, the route is not unlinked from the siblings list, which can result in a use-after-free. Fix this by unlinking the route from the siblings list before returning an error. 
Audited the rest of the call sites from which the FIB notification chain is called and could not find more problems. Fixes: 2233000cba40 ("net/ipv6: Move call_fib6_entry_notifiers up for route adds") Signed-off-by: Ido Schimmel Reported-by: Alexander Petrovskiy Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 49884f96232b..87f47bc55c5e 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1151,8 +1151,24 @@ add: err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, rt, extack); - if (err) + if (err) { + struct fib6_info *sibling, *next_sibling; + + /* If the route has siblings, then it first + * needs to be unlinked from them. + */ + if (!rt->fib6_nsiblings) + return err; + + list_for_each_entry_safe(sibling, next_sibling, + &rt->fib6_siblings, + fib6_siblings) + sibling->fib6_nsiblings--; + rt->fib6_nsiblings = 0; + list_del_init(&rt->fib6_siblings); + rt6_multipath_rebalance(next_sibling); return err; + } } rcu_assign_pointer(rt->fib6_next, iter); -- cgit v1.2.3 From 008cfbaa3f9f84efead76d2cea12b4dd05cce67d Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 17 Jul 2019 06:29:56 +0000 Subject: net: dsa: sja1105: Fix missing unlock on error in sk_buff() Add the missing unlock before return from function sk_buff() in the error handling case. Fixes: f3097be21bf1 ("net: dsa: sja1105: Add a state machine for RX timestamping") Signed-off-by: Wei Yongjun Reviewed-by: Vladimir Oltean Reviewed-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/tag_sja1105.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 1d96c9d4a8e9..26363d72d25b 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -216,6 +216,7 @@ static struct sk_buff if (!skb) { dev_err_ratelimited(dp->ds->dev, "Failed to copy stampable skb\n"); + spin_unlock(&sp->data->meta_lock); return NULL; } sja1105_transfer_meta(skb, meta); -- cgit v1.2.3 From 8d650cdedaabb33e85e9b7c517c0c71fcecc1de9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Jul 2019 19:28:14 -0700 Subject: tcp: fix tcp_set_congestion_control() use from bpf hook Neal reported incorrect use of ns_capable() from bpf hook. bpf_setsockopt(...TCP_CONGESTION...) -> tcp_set_congestion_control() -> ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) -> ns_capable_common() -> current_cred() -> rcu_dereference_protected(current->cred, 1) Accessing 'current' in bpf context makes no sense, since packets are processed from softirq context. As Neal stated : The capability check in tcp_set_congestion_control() was written assuming a system call context, and then was reused from a BPF call site. The fix is to add a new parameter to tcp_set_congestion_control(), so that the ns_capable() call is only performed under the right context. Fixes: 91b5b21c7c16 ("bpf: Add support for changing congestion control") Signed-off-by: Eric Dumazet Cc: Lawrence Brakmo Reported-by: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Lawrence Brakmo Signed-off-by: David S. 
Miller --- include/net/tcp.h | 3 ++- net/core/filter.c | 2 +- net/ipv4/tcp.c | 4 +++- net/ipv4/tcp_cong.c | 6 +++--- 4 files changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index cca3c59b98bf..f42d300f0cfa 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1064,7 +1064,8 @@ void tcp_get_default_congestion_control(struct net *net, char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit); +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, + bool reinit, bool cap_net_admin); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); diff --git a/net/core/filter.c b/net/core/filter.c index 0f6854ccf894..4e2a79b2fd77 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4335,7 +4335,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; ret = tcp_set_congestion_control(sk, name, false, - reinit); + reinit, true); } else { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7846afacdf0b..776905899ac0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2785,7 +2785,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true, true); + err = tcp_set_congestion_control(sk, name, true, true, + ns_capable(sock_net(sk)->user_ns, + CAP_NET_ADMIN)); release_sock(sk); return err; } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index e1862b64a90f..c445a81d144e 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -333,7 +333,8 @@ out: * tcp_reinit_congestion_control (if the current congestion control was * already initialized. */ -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit) +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, + bool reinit, bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -369,8 +370,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo } else { err = -EBUSY; } - } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || - ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) { + } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) { err = -EPERM; } else if (!try_module_get(ca->owner)) { err = -EBUSY; -- cgit v1.2.3
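For context on the last fix above: a minimal sketch of the BPF path that reaches tcp_set_congestion_control() from softirq context, where evaluating ns_capable() against whatever task is current made no sense; with the fix, this caller simply passes cap_net_admin=true. Assumptions here: the section name, the chosen congestion control, and the selftests-style helper declaration.

#include <linux/types.h>
#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))
#define SOL_TCP		6	/* IPPROTO_TCP */
#define TCP_CONGESTION	13	/* from include/uapi/linux/tcp.h */

/* Helper declaration in the style of the selftests' bpf_helpers.h */
static int (*bpf_setsockopt)(void *ctx, int level, int optname,
			     void *optval, int optlen) =
	(void *)BPF_FUNC_setsockopt;

SEC("sockops")
int set_cc(struct bpf_sock_ops *skops)
{
	char cc[] = "reno";	/* assumed; any available algorithm */

	/* Invoked from packet processing (softirq) when the handshake
	 * completes, i.e. with no meaningful 'current' whose
	 * credentials could be checked.
	 */
	if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
		bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
			       cc, sizeof(cc));
	return 1;
}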