From 5133498f4ad1123a5ffd4c08df6431dab882cc32 Mon Sep 17 00:00:00 2001
From: Lorenz Bauer
Date: Fri, 13 Dec 2019 18:08:17 +0000
Subject: bpf: Clear skb->tstamp in bpf_redirect when necessary

Redirecting a packet from ingress to egress by using bpf_redirect breaks
if the egress interface has an fq qdisc installed. This is the same
problem as fixed in commit 8203e2d844d3 ("net: clear skb->tstamp in
forwarding paths").

Clear skb->tstamp when redirecting into the egress path.

Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit time.")
Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC")
Signed-off-by: Lorenz Bauer
Signed-off-by: Alexei Starovoitov
Reviewed-by: Eric Dumazet
Link: https://lore.kernel.org/bpf/20191213180817.2510-1-lmb@cloudflare.com
---
 net/core/filter.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/core/filter.c b/net/core/filter.c
index f1e703eed3d2..d914257763b5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2055,6 +2055,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
 	}
 
 	skb->dev = dev;
+	skb->tstamp = 0;
 
 	dev_xmit_recursion_inc();
 	ret = dev_queue_xmit(skb);
-- cgit v1.2.3

From a2ea07465c8d7984cc6b8b1f0b3324f9b138094a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Mon, 16 Dec 2019 17:49:00 +0100
Subject: bpf: Fix missing prog untrack in release_maps

Commit da765a2f5993 ("bpf: Add poke dependency tracking for prog array
maps") wrongly assumed that in case of prog load errors, we're cleaning
up all program tracking via bpf_free_used_maps(). However, it can happen
that we're still at the point where we didn't copy the map pointers into
the prog's aux section, such that env->prog->aux->used_maps is still zero;
the prog is then never untracked from the map, running into a
use-after-free (UAF). In such a case, the verifier has a similar
release_maps() helper that drops references to the used maps from its env.
Consolidate the release code into __bpf_free_used_maps() and call it from
both sides to fix it.
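For reference, the shape of the consolidation, condensed from the diff below (bodies abbreviated; both cleanup paths now funnel into the same helper):

  void __bpf_free_used_maps(struct bpf_prog_aux *aux,
  			  struct bpf_map **used_maps, u32 len)
  {
  	struct bpf_map *map;
  	u32 i;

  	bpf_free_cgroup_storage(aux);
  	for (i = 0; i < len; i++) {
  		map = used_maps[i];
  		if (map->ops->map_poke_untrack)
  			map->ops->map_poke_untrack(map, aux);
  		bpf_map_put(map);
  	}
  }

  /* prog free path (maps already copied into aux): */
  __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt);

  /* verifier reject path (maps still only referenced from env): */
  __bpf_free_used_maps(env->prog->aux, env->used_maps, env->used_map_cnt);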
Fixes: da765a2f5993 ("bpf: Add poke dependency tracking for prog array maps") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/1c2909484ca524ae9f55109b06f22b6213e76376.1576514756.git.daniel@iogearbox.net --- include/linux/bpf.h | 2 ++ kernel/bpf/core.c | 14 ++++++++++---- kernel/bpf/verifier.c | 14 ++------------ 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ac7de5291509..085a59afba85 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -818,6 +818,8 @@ struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); int __bpf_prog_charge(struct user_struct *user, u32 pages); void __bpf_prog_uncharge(struct user_struct *user, u32 pages); +void __bpf_free_used_maps(struct bpf_prog_aux *aux, + struct bpf_map **used_maps, u32 len); void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 49e32acad7d8..6231858df723 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2048,18 +2048,24 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) } } -static void bpf_free_used_maps(struct bpf_prog_aux *aux) +void __bpf_free_used_maps(struct bpf_prog_aux *aux, + struct bpf_map **used_maps, u32 len) { struct bpf_map *map; - int i; + u32 i; bpf_free_cgroup_storage(aux); - for (i = 0; i < aux->used_map_cnt; i++) { - map = aux->used_maps[i]; + for (i = 0; i < len; i++) { + map = used_maps[i]; if (map->ops->map_poke_untrack) map->ops->map_poke_untrack(map, aux); bpf_map_put(map); } +} + +static void bpf_free_used_maps(struct bpf_prog_aux *aux) +{ + __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt); kfree(aux->used_maps); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 034ef81f935b..a1acdce77070 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8298,18 +8298,8 @@ next_insn: /* drop refcnt of maps used by the rejected program */ static void release_maps(struct bpf_verifier_env *env) { - enum bpf_cgroup_storage_type stype; - int i; - - for_each_cgroup_storage_type(stype) { - if (!env->prog->aux->cgroup_storage[stype]) - continue; - bpf_cgroup_storage_release(env->prog, - env->prog->aux->cgroup_storage[stype]); - } - - for (i = 0; i < env->used_map_cnt; i++) - bpf_map_put(env->used_maps[i]); + __bpf_free_used_maps(env->prog->aux, env->used_maps, + env->used_map_cnt); } /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ -- cgit v1.2.3 From e47304232b373362228bf233f17bd12b11c9aafc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 17 Dec 2019 13:28:16 +0100 Subject: bpf: Fix cgroup local storage prog tracking Recently noticed that we're tracking programs related to local storage maps through their prog pointer. This is a wrong assumption since the prog pointer can still change throughout the verification process, for example, whenever bpf_patch_insn_single() is called. Therefore, the prog pointer that was assigned via bpf_cgroup_storage_assign() is not guaranteed to be the same as we pass in bpf_cgroup_storage_release() and the map would therefore remain in busy state forever. Fix this by using the prog's aux pointer which is stable throughout verification and beyond. 
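Why the prog pointer is unstable is easiest to see around instruction patching; a rough sketch of the verifier's patching step (simplified, and the variables off/patch/cnt are placeholders, not taken verbatim from the kernel):

  struct bpf_prog *new_prog;

  new_prog = bpf_patch_insn_single(env->prog, off, patch, cnt);
  if (!new_prog)
  	return -ENOMEM;
  env->prog = new_prog;
  /* The old struct bpf_prog allocation may be gone here, but
   * new_prog->aux is the same object as before, so keying the
   * cgroup storage map on the aux pointer survives patching.
   */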
Fixes: de9cbbaadba5 ("bpf: introduce cgroup storage maps") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Cc: Roman Gushchin Cc: Martin KaFai Lau Link: https://lore.kernel.org/bpf/1471c69eca3022218666f909bc927a92388fd09e.1576580332.git.daniel@iogearbox.net --- include/linux/bpf-cgroup.h | 8 ++++---- kernel/bpf/core.c | 3 +-- kernel/bpf/local_storage.c | 24 ++++++++++++------------ kernel/bpf/verifier.c | 2 +- 4 files changed, 18 insertions(+), 19 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 169fd25f6bc2..9be71c195d74 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -157,8 +157,8 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, struct cgroup *cgroup, enum bpf_attach_type type); void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); -int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map); -void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map); +int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map); +void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *map); int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, @@ -360,9 +360,9 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, static inline void bpf_cgroup_storage_set( struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {} -static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog, +static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } -static inline void bpf_cgroup_storage_release(struct bpf_prog *prog, +static inline void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *map) {} static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return NULL; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6231858df723..af6b738cf435 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2043,8 +2043,7 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) for_each_cgroup_storage_type(stype) { if (!aux->cgroup_storage[stype]) continue; - bpf_cgroup_storage_release(aux->prog, - aux->cgroup_storage[stype]); + bpf_cgroup_storage_release(aux, aux->cgroup_storage[stype]); } } diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 2ba750725cb2..6bf605dd4b94 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -20,7 +20,7 @@ struct bpf_cgroup_storage_map { struct bpf_map map; spinlock_t lock; - struct bpf_prog *prog; + struct bpf_prog_aux *aux; struct rb_root root; struct list_head list; }; @@ -420,7 +420,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = { .map_seq_show_elem = cgroup_storage_seq_show_elem, }; -int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) +int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); struct bpf_cgroup_storage_map *map = map_to_storage(_map); @@ -428,14 +428,14 @@ int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) spin_lock_bh(&map->lock); - if (map->prog && map->prog != prog) + if (map->aux && map->aux != aux) goto unlock; - if (prog->aux->cgroup_storage[stype] && - prog->aux->cgroup_storage[stype] != _map) + if (aux->cgroup_storage[stype] && + 
aux->cgroup_storage[stype] != _map) goto unlock; - map->prog = prog; - prog->aux->cgroup_storage[stype] = _map; + map->aux = aux; + aux->cgroup_storage[stype] = _map; ret = 0; unlock: spin_unlock_bh(&map->lock); @@ -443,16 +443,16 @@ unlock: return ret; } -void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) +void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *_map) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); struct bpf_cgroup_storage_map *map = map_to_storage(_map); spin_lock_bh(&map->lock); - if (map->prog == prog) { - WARN_ON(prog->aux->cgroup_storage[stype] != _map); - map->prog = NULL; - prog->aux->cgroup_storage[stype] = NULL; + if (map->aux == aux) { + WARN_ON(aux->cgroup_storage[stype] != _map); + map->aux = NULL; + aux->cgroup_storage[stype] = NULL; } spin_unlock_bh(&map->lock); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a1acdce77070..6ef71429d997 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8268,7 +8268,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) env->used_maps[env->used_map_cnt++] = map; if (bpf_map_is_cgroup_storage(map) && - bpf_cgroup_storage_assign(env->prog, map)) { + bpf_cgroup_storage_assign(env->prog->aux, map)) { verbose(env, "only one cgroup storage of each type is allowed\n"); fdput(f); return -EBUSY; -- cgit v1.2.3 From 06870682087b58398671e8cdc896cd62314c4399 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 17 Dec 2019 16:20:42 +0000 Subject: xsk: Add rcu_read_lock around the XSK wakeup The XSK wakeup callback in drivers makes some sanity checks before triggering NAPI. However, some configuration changes may occur during this function that affect the result of those checks. For example, the interface can go down, and all the resources will be destroyed after the checks in the wakeup function, but before it attempts to use these resources. Wrap this callback in rcu_read_lock to allow driver to synchronize_rcu before actually destroying the resources. xsk_wakeup is a new function that encapsulates calling ndo_xsk_wakeup wrapped into the RCU lock. After this commit, xsk_poll starts using xsk_wakeup and checks xs->zc instead of ndo_xsk_wakeup != NULL to decide ndo_xsk_wakeup should be called. It also fixes a bug introduced with the need_wakeup feature: a non-zero-copy socket may be used with a driver supporting zero-copy, and in this case ndo_xsk_wakeup should not be called, so the xs->zc check is the correct one. 
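The resulting contract between core and drivers, roughly (the xsk.c half is condensed from the diff below; the driver half is a generic sketch with hypothetical names, not code from any particular driver):

  /* net/xdp/xsk.c */
  static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
  {
  	struct net_device *dev = xs->dev;
  	int err;

  	rcu_read_lock();
  	err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
  	rcu_read_unlock();

  	return err;
  }

  /* driver teardown (sketch): */
  clear_bit(FOO_XSK_ACTIVE, &priv->state);  /* state ndo_xsk_wakeup checks */
  synchronize_rcu();                        /* wait for in-flight wakeups */
  foo_free_xsk_rings(priv);                 /* now safe to free resources */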
Fixes: 77cd0d7b3f25 ("xsk: add support for need_wakeup flag in AF_XDP rings") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191217162023.16011-2-maximmi@mellanox.com --- net/xdp/xsk.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 956793893c9d..328f661b83b2 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -334,12 +334,21 @@ out: } EXPORT_SYMBOL(xsk_umem_consume_tx); -static int xsk_zc_xmit(struct xdp_sock *xs) +static int xsk_wakeup(struct xdp_sock *xs, u8 flags) { struct net_device *dev = xs->dev; + int err; + + rcu_read_lock(); + err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); + rcu_read_unlock(); + + return err; +} - return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, - XDP_WAKEUP_TX); +static int xsk_zc_xmit(struct xdp_sock *xs) +{ + return xsk_wakeup(xs, XDP_WAKEUP_TX); } static void xsk_destruct_skb(struct sk_buff *skb) @@ -453,19 +462,16 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock, __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); - struct net_device *dev; struct xdp_umem *umem; if (unlikely(!xsk_is_bound(xs))) return mask; - dev = xs->dev; umem = xs->umem; if (umem->need_wakeup) { - if (dev->netdev_ops->ndo_xsk_wakeup) - dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, - umem->need_wakeup); + if (xs->zc) + xsk_wakeup(xs, umem->need_wakeup); else /* Poll needs to drive Tx also in copy mode */ __xsk_sendmsg(sk); -- cgit v1.2.3 From 9cf88808ad6a0f1e958e00abd9a081295fe6da0c Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 17 Dec 2019 16:20:44 +0000 Subject: net/mlx5e: Fix concurrency issues between config flow and XSK After disabling resources necessary for XSK (the XDP program, channels, XSK queues), use synchronize_rcu to wait until the XSK wakeup function finishes, before freeing the resources. Suspend XSK wakeups during switching channels. If the XDP program is being removed, synchronize_rcu before closing the old channels to allow XSK wakeup to complete. 
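Condensed from the mlx5e diff below, the separate XDP open/closed tracking collapses into a single "active" bit that the TX enable path sets and the wakeup path checks:

  static inline void mlx5e_xdp_tx_enable(struct mlx5e_priv *priv)
  {
  	set_bit(MLX5E_STATE_XDP_TX_ENABLED, &priv->state);

  	if (priv->channels.params.xdp_prog)
  		set_bit(MLX5E_STATE_XDP_ACTIVE, &priv->state);
  }

  /* in mlx5e_xsk_wakeup(): */
  if (unlikely(!mlx5e_xdp_is_active(priv)))
  	return -ENETDOWN;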
Signed-off-by: Maxim Mikityanskiy Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191217162023.16011-3-maximmi@mellanox.com --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h | 22 +++++++++------------- .../net/ethernet/mellanox/mlx5/core/en/xsk/setup.c | 1 + .../net/ethernet/mellanox/mlx5/core/en/xsk/tx.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 19 +------------------ 5 files changed, 13 insertions(+), 33 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 2c16add0b642..9c8427698238 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -760,7 +760,7 @@ enum { MLX5E_STATE_OPENED, MLX5E_STATE_DESTROYING, MLX5E_STATE_XDP_TX_ENABLED, - MLX5E_STATE_XDP_OPEN, + MLX5E_STATE_XDP_ACTIVE, }; struct mlx5e_rqt { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h index 36ac1e3816b9..d7587f40ecae 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.h @@ -75,12 +75,18 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, static inline void mlx5e_xdp_tx_enable(struct mlx5e_priv *priv) { set_bit(MLX5E_STATE_XDP_TX_ENABLED, &priv->state); + + if (priv->channels.params.xdp_prog) + set_bit(MLX5E_STATE_XDP_ACTIVE, &priv->state); } static inline void mlx5e_xdp_tx_disable(struct mlx5e_priv *priv) { + if (priv->channels.params.xdp_prog) + clear_bit(MLX5E_STATE_XDP_ACTIVE, &priv->state); + clear_bit(MLX5E_STATE_XDP_TX_ENABLED, &priv->state); - /* let other device's napi(s) see our new state */ + /* Let other device's napi(s) and XSK wakeups see our new state. */ synchronize_rcu(); } @@ -89,19 +95,9 @@ static inline bool mlx5e_xdp_tx_is_enabled(struct mlx5e_priv *priv) return test_bit(MLX5E_STATE_XDP_TX_ENABLED, &priv->state); } -static inline void mlx5e_xdp_set_open(struct mlx5e_priv *priv) -{ - set_bit(MLX5E_STATE_XDP_OPEN, &priv->state); -} - -static inline void mlx5e_xdp_set_closed(struct mlx5e_priv *priv) -{ - clear_bit(MLX5E_STATE_XDP_OPEN, &priv->state); -} - -static inline bool mlx5e_xdp_is_open(struct mlx5e_priv *priv) +static inline bool mlx5e_xdp_is_active(struct mlx5e_priv *priv) { - return test_bit(MLX5E_STATE_XDP_OPEN, &priv->state); + return test_bit(MLX5E_STATE_XDP_ACTIVE, &priv->state); } static inline void mlx5e_xmit_xdp_doorbell(struct mlx5e_xdpsq *sq) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c index 631af8dee517..c28cbae42331 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c @@ -144,6 +144,7 @@ void mlx5e_close_xsk(struct mlx5e_channel *c) { clear_bit(MLX5E_CHANNEL_STATE_XSK, c->state); napi_synchronize(&c->napi); + synchronize_rcu(); /* Sync with the XSK wakeup. 
*/ mlx5e_close_rq(&c->xskrq); mlx5e_close_cq(&c->xskrq.cq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c index 87827477d38c..fe2d596cb361 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c @@ -14,7 +14,7 @@ int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) struct mlx5e_channel *c; u16 ix; - if (unlikely(!mlx5e_xdp_is_open(priv))) + if (unlikely(!mlx5e_xdp_is_active(priv))) return -ENETDOWN; if (unlikely(!mlx5e_qid_get_ch_if_in_group(params, qid, MLX5E_RQ_GROUP_XSK, &ix))) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 4980e80a5e85..4997b8a51994 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3000,12 +3000,9 @@ void mlx5e_timestamp_init(struct mlx5e_priv *priv) int mlx5e_open_locked(struct net_device *netdev) { struct mlx5e_priv *priv = netdev_priv(netdev); - bool is_xdp = priv->channels.params.xdp_prog; int err; set_bit(MLX5E_STATE_OPENED, &priv->state); - if (is_xdp) - mlx5e_xdp_set_open(priv); err = mlx5e_open_channels(priv, &priv->channels); if (err) @@ -3020,8 +3017,6 @@ int mlx5e_open_locked(struct net_device *netdev) return 0; err_clear_state_opened_flag: - if (is_xdp) - mlx5e_xdp_set_closed(priv); clear_bit(MLX5E_STATE_OPENED, &priv->state); return err; } @@ -3053,8 +3048,6 @@ int mlx5e_close_locked(struct net_device *netdev) if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) return 0; - if (priv->channels.params.xdp_prog) - mlx5e_xdp_set_closed(priv); clear_bit(MLX5E_STATE_OPENED, &priv->state); netif_carrier_off(priv->netdev); @@ -4371,16 +4364,6 @@ static int mlx5e_xdp_allowed(struct mlx5e_priv *priv, struct bpf_prog *prog) return 0; } -static int mlx5e_xdp_update_state(struct mlx5e_priv *priv) -{ - if (priv->channels.params.xdp_prog) - mlx5e_xdp_set_open(priv); - else - mlx5e_xdp_set_closed(priv); - - return 0; -} - static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog) { struct mlx5e_priv *priv = netdev_priv(netdev); @@ -4415,7 +4398,7 @@ static int mlx5e_xdp_set(struct net_device *netdev, struct bpf_prog *prog) mlx5e_set_rq_type(priv->mdev, &new_channels.params); old_prog = priv->channels.params.xdp_prog; - err = mlx5e_safe_switch_channels(priv, &new_channels, mlx5e_xdp_update_state); + err = mlx5e_safe_switch_channels(priv, &new_channels, NULL); if (err) goto unlock; } else { -- cgit v1.2.3 From b3873a5be757b44d51af542a50a6f2a3b6f95284 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 17 Dec 2019 16:20:45 +0000 Subject: net/i40e: Fix concurrency issues between config flow and XSK Use synchronize_rcu to wait until the XSK wakeup function finishes before destroying the resources it uses: 1. i40e_down already calls synchronize_rcu. On i40e_down either __I40E_VSI_DOWN or __I40E_CONFIG_BUSY is set. Check the latter in i40e_xsk_wakeup (the former is already checked there). 2. After switching the XDP program, call synchronize_rcu to let i40e_xsk_wakeup exit before the XDP program is freed. 3. Changing the number of channels brings the interface down (see i40e_prep_for_reset and i40e_pf_quiesce_all_vsi). 4. Disabling UMEM sets __I40E_CONFIG_BUSY, too. 
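Condensed from the i40e diff below, the wakeup handler now bails out while either a config flow or a VSI down is in progress, before touching any ring state:

  int i40e_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
  {
  	struct i40e_netdev_priv *np = netdev_priv(dev);
  	struct i40e_vsi *vsi = np->vsi;
  	struct i40e_pf *pf = vsi->back;

  	if (test_bit(__I40E_CONFIG_BUSY, pf->state))
  		return -ENETDOWN;

  	if (test_bit(__I40E_VSI_DOWN, vsi->state))
  		return -ENETDOWN;
  	/* ... */
  }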
Signed-off-by: Maxim Mikityanskiy Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191217162023.16011-4-maximmi@mellanox.com --- drivers/net/ethernet/intel/i40e/i40e.h | 2 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 10 +++++++--- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 4 ++++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index cb6367334ca7..4833187bd259 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -1152,7 +1152,7 @@ void i40e_set_fec_in_flags(u8 fec_cfg, u32 *flags); static inline bool i40e_enabled_xdp_vsi(struct i40e_vsi *vsi) { - return !!vsi->xdp_prog; + return !!READ_ONCE(vsi->xdp_prog); } int i40e_create_queue_channel(struct i40e_vsi *vsi, struct i40e_channel *ch); diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 1ccabeafa44c..2c5af6d4a6b1 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -6823,8 +6823,8 @@ void i40e_down(struct i40e_vsi *vsi) for (i = 0; i < vsi->num_queue_pairs; i++) { i40e_clean_tx_ring(vsi->tx_rings[i]); if (i40e_enabled_xdp_vsi(vsi)) { - /* Make sure that in-progress ndo_xdp_xmit - * calls are completed. + /* Make sure that in-progress ndo_xdp_xmit and + * ndo_xsk_wakeup calls are completed. */ synchronize_rcu(); i40e_clean_tx_ring(vsi->xdp_rings[i]); @@ -12546,8 +12546,12 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, old_prog = xchg(&vsi->xdp_prog, prog); - if (need_reset) + if (need_reset) { + if (!prog) + /* Wait until ndo_xsk_wakeup completes. */ + synchronize_rcu(); i40e_reset_and_rebuild(pf, true, true); + } for (i = 0; i < vsi->num_queue_pairs; i++) WRITE_ONCE(vsi->rx_rings[i]->xdp_prog, vsi->xdp_prog); diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index d07e1a890428..f73cd917c44f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -787,8 +787,12 @@ int i40e_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) { struct i40e_netdev_priv *np = netdev_priv(dev); struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; struct i40e_ring *ring; + if (test_bit(__I40E_CONFIG_BUSY, pf->state)) + return -ENETDOWN; + if (test_bit(__I40E_VSI_DOWN, vsi->state)) return -ENETDOWN; -- cgit v1.2.3 From c0fdccfd226a1424683d3000d9e08384391210a2 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 17 Dec 2019 16:20:47 +0000 Subject: net/ixgbe: Fix concurrency issues between config flow and XSK Use synchronize_rcu to wait until the XSK wakeup function finishes before destroying the resources it uses: 1. ixgbe_down already calls synchronize_rcu after setting __IXGBE_DOWN. 2. After switching the XDP program, call synchronize_rcu to let ixgbe_xsk_wakeup exit before the XDP program is freed. 3. Changing the number of channels brings the interface down. 4. Disabling UMEM sets __IXGBE_TX_DISABLED before closing hardware resources and resetting xsk_umem. Check that bit in ixgbe_xsk_wakeup to avoid using the XDP ring when it's already destroyed. synchronize_rcu is called from ixgbe_txrx_ring_disable. 
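Condensed from the ixgbe diff below, the wakeup handler fetches the ring first and checks __IXGBE_TX_DISABLED before looking at its UMEM:

  ring = adapter->xdp_ring[qid];

  if (test_bit(__IXGBE_TX_DISABLED, &ring->state))
  	return -ENETDOWN;

  if (!ring->xsk_umem)
  	return -ENXIO;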
Signed-off-by: Maxim Mikityanskiy Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191217162023.16011-5-maximmi@mellanox.com --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 7 ++++++- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 8 ++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 25c097cd8100..82a30b597cf9 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -10261,7 +10261,12 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog) /* If transitioning XDP modes reconfigure rings */ if (need_reset) { - int err = ixgbe_setup_tc(dev, adapter->hw_tcs); + int err; + + if (!prog) + /* Wait until ndo_xsk_wakeup completes. */ + synchronize_rcu(); + err = ixgbe_setup_tc(dev, adapter->hw_tcs); if (err) { rcu_assign_pointer(adapter->xdp_prog, old_prog); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index d6feaacfbf89..b43be9f14105 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -709,10 +709,14 @@ int ixgbe_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) if (qid >= adapter->num_xdp_queues) return -ENXIO; - if (!adapter->xdp_ring[qid]->xsk_umem) + ring = adapter->xdp_ring[qid]; + + if (test_bit(__IXGBE_TX_DISABLED, &ring->state)) + return -ENETDOWN; + + if (!ring->xsk_umem) return -ENXIO; - ring = adapter->xdp_ring[qid]; if (!napi_if_scheduled_mark_missed(&ring->q_vector->napi)) { u64 eics = BIT_ULL(ring->q_vector->v_idx); -- cgit v1.2.3 From 1148f9adbe71415836a18a36c1b4ece999ab0973 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 18 Dec 2019 12:18:21 +0300 Subject: net, sysctl: Fix compiler warning when only cBPF is present proc_dointvec_minmax_bpf_restricted() has been firstly introduced in commit 2e4a30983b0f ("bpf: restrict access to core bpf sysctls") under CONFIG_HAVE_EBPF_JIT. Then, this ifdef has been removed in ede95a63b5e8 ("bpf: add bpf_jit_limit knob to restrict unpriv allocations"), because a new sysctl, bpf_jit_limit, made use of it. Finally, this parameter has become long instead of integer with fdadd04931c2 ("bpf: fix bpf_jit_limit knob for PAGE_SIZE >= 64K") and thus, a new proc_dolongvec_minmax_bpf_restricted() has been added. With this last change, we got back to that proc_dointvec_minmax_bpf_restricted() is used only under CONFIG_HAVE_EBPF_JIT, but the corresponding ifdef has not been brought back. So, in configurations like CONFIG_BPF_JIT=y && CONFIG_HAVE_EBPF_JIT=n since v4.20 we have: CC net/core/sysctl_net_core.o net/core/sysctl_net_core.c:292:1: warning: ‘proc_dointvec_minmax_bpf_restricted’ defined but not used [-Wunused-function] 292 | proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Suppress this by guarding it with CONFIG_HAVE_EBPF_JIT again. 
Fixes: fdadd04931c2 ("bpf: fix bpf_jit_limit knob for PAGE_SIZE >= 64K") Signed-off-by: Alexander Lobakin Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20191218091821.7080-1-alobakin@dlink.ru --- net/core/sysctl_net_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index eb29e5adc84d..9f9e00ba3ad7 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -288,6 +288,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, return ret; } +# ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -298,6 +299,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } +# endif /* CONFIG_HAVE_EBPF_JIT */ static int proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, -- cgit v1.2.3 From cc52d9140aa920d8d61c7f6de3fff5fea6692ea9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 19 Dec 2019 22:19:50 +0100 Subject: bpf: Fix record_func_key to perform backtracking on r3 While testing Cilium with /unreleased/ Linus' tree under BPF-based NodePort implementation, I noticed a strange BPF SNAT engine behavior from time to time. In some cases it would do the correct SNAT/DNAT service translation, but at a random point in time it would just stop and perform an unexpected translation after SYN, SYN/ACK and stack would send a RST back. While initially assuming that there is some sort of a race condition in BPF code, adding trace_printk()s for debugging purposes at some point seemed to have resolved the issue auto-magically. Digging deeper on this Heisenbug and reducing the trace_printk() calls to an absolute minimum, it turns out that a single call would suffice to trigger / not trigger the seen RST issue, even though the logic of the program itself remains unchanged. Turns out the single call changed verifier pruning behavior to get everything to work. Reconstructing a minimal test case, the incorrect JIT dump looked as follows: # bpftool p d j i 11346 0xffffffffc0cba96c: [...] 21: movzbq 0x30(%rdi),%rax 26: cmp $0xd,%rax 2a: je 0x000000000000003a 2c: xor %edx,%edx 2e: movabs $0xffff89cc74e85800,%rsi 38: jmp 0x0000000000000049 3a: mov $0x2,%edx 3f: movabs $0xffff89cc74e85800,%rsi 49: mov -0x224(%rbp),%eax 4f: cmp $0x20,%eax 52: ja 0x0000000000000062 54: add $0x1,%eax 57: mov %eax,-0x224(%rbp) 5d: jmpq 0xffffffffffff6911 62: mov $0x1,%eax [...] Hence, unexpectedly, JIT emitted a direct jump even though retpoline based one would have been needed since in line 2c and 3a we have different slot keys in BPF reg r3. Verifier log of the test case reveals what happened: 0: (b7) r0 = 14 1: (73) *(u8 *)(r1 +48) = r0 2: (71) r0 = *(u8 *)(r1 +48) 3: (15) if r0 == 0xd goto pc+4 R0_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff)) R1=ctx(id=0,off=0,imm=0) R10=fp0 4: (b7) r3 = 0 5: (18) r2 = 0xffff89cc74d54a00 7: (05) goto pc+3 11: (85) call bpf_tail_call#12 12: (b7) r0 = 1 13: (95) exit from 3 to 8: R0_w=inv13 R1=ctx(id=0,off=0,imm=0) R10=fp0 8: (b7) r3 = 2 9: (18) r2 = 0xffff89cc74d54a00 11: safe processed 13 insns (limit 1000000) [...] Second branch is pruned by verifier since considered safe, but issue is that record_func_key() couldn't have seen the index in line 3a and therefore decided that emitting a direct jump at this location was okay. 
Fix this by reusing our backtracking logic for precise scalar verification in order to prevent pruning on the slot key. This means verifier will track content of r3 all the way backwards and only prune if both scalars were unknown in state equivalence check and therefore poisoned in the first place in record_func_key(). The range is [x,x] in record_func_key() case since the slot always would have to be constant immediate. Correct verification after fix: 0: (b7) r0 = 14 1: (73) *(u8 *)(r1 +48) = r0 2: (71) r0 = *(u8 *)(r1 +48) 3: (15) if r0 == 0xd goto pc+4 R0_w=invP(id=0,umax_value=255,var_off=(0x0; 0xff)) R1=ctx(id=0,off=0,imm=0) R10=fp0 4: (b7) r3 = 0 5: (18) r2 = 0x0 7: (05) goto pc+3 11: (85) call bpf_tail_call#12 12: (b7) r0 = 1 13: (95) exit from 3 to 8: R0_w=invP13 R1=ctx(id=0,off=0,imm=0) R10=fp0 8: (b7) r3 = 2 9: (18) r2 = 0x0 11: (85) call bpf_tail_call#12 12: (b7) r0 = 1 13: (95) exit processed 15 insns (limit 1000000) [...] And correct corresponding JIT dump: # bpftool p d j i 11 0xffffffffc0dc34c4: [...] 21: movzbq 0x30(%rdi),%rax 26: cmp $0xd,%rax 2a: je 0x000000000000003a 2c: xor %edx,%edx 2e: movabs $0xffff9928b4c02200,%rsi 38: jmp 0x0000000000000049 3a: mov $0x2,%edx 3f: movabs $0xffff9928b4c02200,%rsi 49: cmp $0x4,%rdx 4d: jae 0x0000000000000093 4f: and $0x3,%edx 52: mov %edx,%edx 54: cmp %edx,0x24(%rsi) 57: jbe 0x0000000000000093 59: mov -0x224(%rbp),%eax 5f: cmp $0x20,%eax 62: ja 0x0000000000000093 64: add $0x1,%eax 67: mov %eax,-0x224(%rbp) 6d: mov 0x110(%rsi,%rdx,8),%rax 75: test %rax,%rax 78: je 0x0000000000000093 7a: mov 0x30(%rax),%rax 7e: add $0x19,%rax 82: callq 0x000000000000008e 87: pause 89: lfence 8c: jmp 0x0000000000000087 8e: mov %rax,(%rsp) 92: retq 93: mov $0x1,%eax [...] Also explicitly adding explicit env->allow_ptr_leaks to fixup_bpf_calls() since backtracking is enabled under former (direct jumps as well, but use different test). In case of only tracking different map pointers as in c93552c443eb ("bpf: properly enforce index mask to prevent out-of-bounds speculation"), pruning cannot make such short-cuts, neither if there are paths with scalar and non-scalar types as r3. mark_chain_precision() is only needed after we know that register_is_const(). If it was not the case, we already poison the key on first path and non-const key in later paths are not matching the scalar range in regsafe() either. Cilium NodePort testing passes fine as well now. Note, released kernels not affected. 
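Condensed from the verifier diff below, the fix has two parts: backtrack r3 so the tail-call key is marked precise in record_func_key(), and gate the direct-jump optimization in fixup_bpf_calls() on a privileged loader:

  /* record_func_key(): */
  err = mark_chain_precision(env, BPF_REG_3);
  if (err)
  	return err;

  val = reg->var_off.value;
  if (bpf_map_key_unseen(aux))
  	bpf_map_key_store(aux, val);

  /* fixup_bpf_calls(): */
  if (env->allow_ptr_leaks && !expect_blinding &&
      prog->jit_requested &&
      !bpf_map_key_poisoned(aux) &&
      !bpf_map_ptr_poisoned(aux) && !bpf_map_ptr_unpriv(aux)) {
  	/* ... emit the direct tail-call jump ... */
  }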
Fixes: d2e4c1e6c294 ("bpf: Constant map key tracking for prog array pokes") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/ac43ffdeb7386c5bd688761ed266f3722bb39823.1576789878.git.daniel@iogearbox.net --- kernel/bpf/verifier.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6ef71429d997..4983940cbdca 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4134,6 +4134,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, struct bpf_map *map = meta->map_ptr; struct tnum range; u64 val; + int err; if (func_id != BPF_FUNC_tail_call) return 0; @@ -4150,6 +4151,10 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return 0; } + err = mark_chain_precision(env, BPF_REG_3); + if (err) + return err; + val = reg->var_off.value; if (bpf_map_key_unseen(aux)) bpf_map_key_store(aux, val); @@ -9272,7 +9277,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (prog->jit_requested && !expect_blinding && + if (env->allow_ptr_leaks && !expect_blinding && + prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && !bpf_map_ptr_unpriv(aux)) { -- cgit v1.2.3 From 3123d8018d4686cf193806c4e27a9853550ed895 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 19 Dec 2019 22:19:51 +0100 Subject: bpf: Add further test_verifier cases for record_func_key Expand dummy prog generation such that we can easily check on return codes and add few more test cases to make sure we keep on tracking pruning behavior. # ./test_verifier [...] #1066/p XDP pkt read, pkt_data <= pkt_meta', bad access 1 OK #1067/p XDP pkt read, pkt_data <= pkt_meta', bad access 2 OK Summary: 1580 PASSED, 0 SKIPPED, 0 FAILED Also verified that JIT dump of added test cases looks good. 
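For orientation when reading the expected .retval values in the new tests, the prog array built by create_prog_array() in the diff below is populated as follows (slot keys 0, 1 and 2 are passed in from do_test_fixup()):

  /* key 0 -> dummy prog returning 42
   * key 1 -> prog that tail-calls back into the map, returning 41
   * key 2 -> dummy prog returning 24
   *
   * A test retval of 24 or 42 therefore identifies which slot the tail
   * call actually hit; retval 1 means the tail call fell through and the
   * test program's own "return 1" ran.
   */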
Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/df7200b6021444fd369376d227de917357285b65.1576789878.git.daniel@iogearbox.net --- tools/testing/selftests/bpf/test_verifier.c | 43 +++--- .../testing/selftests/bpf/verifier/ref_tracking.c | 6 +- tools/testing/selftests/bpf/verifier/runtime_jit.c | 151 +++++++++++++++++++++ 3 files changed, 176 insertions(+), 24 deletions(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index d27fd929abb9..87eaa49609a0 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -408,10 +408,10 @@ static void update_map(int fd, int index) assert(!bpf_map_update_elem(fd, &index, &value, 0)); } -static int create_prog_dummy1(enum bpf_prog_type prog_type) +static int create_prog_dummy_simple(enum bpf_prog_type prog_type, int ret) { struct bpf_insn prog[] = { - BPF_MOV64_IMM(BPF_REG_0, 42), + BPF_MOV64_IMM(BPF_REG_0, ret), BPF_EXIT_INSN(), }; @@ -419,14 +419,15 @@ static int create_prog_dummy1(enum bpf_prog_type prog_type) ARRAY_SIZE(prog), "GPL", 0, NULL, 0); } -static int create_prog_dummy2(enum bpf_prog_type prog_type, int mfd, int idx) +static int create_prog_dummy_loop(enum bpf_prog_type prog_type, int mfd, + int idx, int ret) { struct bpf_insn prog[] = { BPF_MOV64_IMM(BPF_REG_3, idx), BPF_LD_MAP_FD(BPF_REG_2, mfd), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call), - BPF_MOV64_IMM(BPF_REG_0, 41), + BPF_MOV64_IMM(BPF_REG_0, ret), BPF_EXIT_INSN(), }; @@ -435,10 +436,9 @@ static int create_prog_dummy2(enum bpf_prog_type prog_type, int mfd, int idx) } static int create_prog_array(enum bpf_prog_type prog_type, uint32_t max_elem, - int p1key) + int p1key, int p2key, int p3key) { - int p2key = 1; - int mfd, p1fd, p2fd; + int mfd, p1fd, p2fd, p3fd; mfd = bpf_create_map(BPF_MAP_TYPE_PROG_ARRAY, sizeof(int), sizeof(int), max_elem, 0); @@ -449,23 +449,24 @@ static int create_prog_array(enum bpf_prog_type prog_type, uint32_t max_elem, return -1; } - p1fd = create_prog_dummy1(prog_type); - p2fd = create_prog_dummy2(prog_type, mfd, p2key); - if (p1fd < 0 || p2fd < 0) - goto out; + p1fd = create_prog_dummy_simple(prog_type, 42); + p2fd = create_prog_dummy_loop(prog_type, mfd, p2key, 41); + p3fd = create_prog_dummy_simple(prog_type, 24); + if (p1fd < 0 || p2fd < 0 || p3fd < 0) + goto err; if (bpf_map_update_elem(mfd, &p1key, &p1fd, BPF_ANY) < 0) - goto out; + goto err; if (bpf_map_update_elem(mfd, &p2key, &p2fd, BPF_ANY) < 0) - goto out; + goto err; + if (bpf_map_update_elem(mfd, &p3key, &p3fd, BPF_ANY) < 0) { +err: + close(mfd); + mfd = -1; + } + close(p3fd); close(p2fd); close(p1fd); - return mfd; -out: - close(p2fd); - close(p1fd); - close(mfd); - return -1; } static int create_map_in_map(void) @@ -684,7 +685,7 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, } if (*fixup_prog1) { - map_fds[4] = create_prog_array(prog_type, 4, 0); + map_fds[4] = create_prog_array(prog_type, 4, 0, 1, 2); do { prog[*fixup_prog1].imm = map_fds[4]; fixup_prog1++; @@ -692,7 +693,7 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, } if (*fixup_prog2) { - map_fds[5] = create_prog_array(prog_type, 8, 7); + map_fds[5] = create_prog_array(prog_type, 8, 7, 1, 2); do { prog[*fixup_prog2].imm = map_fds[5]; fixup_prog2++; diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c index ebcbf154c460..604b46151736 
100644 --- a/tools/testing/selftests/bpf/verifier/ref_tracking.c +++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c @@ -455,7 +455,7 @@ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 7), /* bpf_tail_call() */ - BPF_MOV64_IMM(BPF_REG_3, 2), + BPF_MOV64_IMM(BPF_REG_3, 3), BPF_LD_MAP_FD(BPF_REG_2, 0), BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call), @@ -478,7 +478,7 @@ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), BPF_EMIT_CALL(BPF_FUNC_sk_release), /* bpf_tail_call() */ - BPF_MOV64_IMM(BPF_REG_3, 2), + BPF_MOV64_IMM(BPF_REG_3, 3), BPF_LD_MAP_FD(BPF_REG_2, 0), BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call), @@ -497,7 +497,7 @@ BPF_SK_LOOKUP(sk_lookup_tcp), /* bpf_tail_call() */ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), - BPF_MOV64_IMM(BPF_REG_3, 2), + BPF_MOV64_IMM(BPF_REG_3, 3), BPF_LD_MAP_FD(BPF_REG_2, 0), BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call), diff --git a/tools/testing/selftests/bpf/verifier/runtime_jit.c b/tools/testing/selftests/bpf/verifier/runtime_jit.c index a9a8f620e71c..94c399d1faca 100644 --- a/tools/testing/selftests/bpf/verifier/runtime_jit.c +++ b/tools/testing/selftests/bpf/verifier/runtime_jit.c @@ -27,6 +27,19 @@ { "runtime/jit: tail_call within bounds, no prog", .insns = { + BPF_MOV64_IMM(BPF_REG_3, 3), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_prog1 = { 1 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "runtime/jit: tail_call within bounds, key 2", + .insns = { BPF_MOV64_IMM(BPF_REG_3, 2), BPF_LD_MAP_FD(BPF_REG_2, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call), @@ -35,8 +48,146 @@ }, .fixup_prog1 = { 1 }, .result = ACCEPT, + .retval = 24, +}, +{ + "runtime/jit: tail_call within bounds, key 2 / key 2, first branch", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 13), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 13, 4), + BPF_MOV64_IMM(BPF_REG_3, 2), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, 3), + BPF_MOV64_IMM(BPF_REG_3, 2), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_prog1 = { 5, 9 }, + .result = ACCEPT, + .retval = 24, +}, +{ + "runtime/jit: tail_call within bounds, key 2 / key 2, second branch", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 14), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 13, 4), + BPF_MOV64_IMM(BPF_REG_3, 2), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, 3), + BPF_MOV64_IMM(BPF_REG_3, 2), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_prog1 = { 5, 9 }, + .result = ACCEPT, + .retval = 24, +}, +{ + "runtime/jit: tail_call within bounds, key 0 / key 2, first branch", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 13), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 
13, 4), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, 3), + BPF_MOV64_IMM(BPF_REG_3, 2), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_prog1 = { 5, 9 }, + .result = ACCEPT, + .retval = 24, +}, +{ + "runtime/jit: tail_call within bounds, key 0 / key 2, second branch", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 14), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 13, 4), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, 3), + BPF_MOV64_IMM(BPF_REG_3, 2), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_prog1 = { 5, 9 }, + .result = ACCEPT, + .retval = 42, +}, +{ + "runtime/jit: tail_call within bounds, different maps, first branch", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 13), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 13, 4), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, 3), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_prog1 = { 5 }, + .fixup_prog2 = { 9 }, + .result_unpriv = REJECT, + .errstr_unpriv = "tail_call abusing map_ptr", + .result = ACCEPT, .retval = 1, }, +{ + "runtime/jit: tail_call within bounds, different maps, second branch", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 14), + BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[0])), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 13, 4), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_JMP_IMM(BPF_JA, 0, 0, 3), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_LD_MAP_FD(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .fixup_prog1 = { 5 }, + .fixup_prog2 = { 9 }, + .result_unpriv = REJECT, + .errstr_unpriv = "tail_call abusing map_ptr", + .result = ACCEPT, + .retval = 42, +}, { "runtime/jit: tail_call out of bounds", .insns = { -- cgit v1.2.3