From 51de082570e5374d4578cb159738485ddb0fddfe Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 30 Nov 2017 09:02:29 -0800 Subject: samples/bpf: Convert magic numbers to names in multi-prog cgroup test case Attach flag 1 == BPF_F_ALLOW_OVERRIDE; attach flag 2 == BPF_F_ALLOW_MULTI. Update the calls to bpf_prog_attach() in test_cgrp2_attach2.c to use the names over the magic numbers. Fixes: 39323e788cb67 ("samples/bpf: add multi-prog cgroup test case") Signed-off-by: David Ahern Signed-off-by: Daniel Borkmann --- samples/bpf/test_cgrp2_attach2.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'samples') diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c index 3e8232cc04a8..1af412ec6007 100644 --- a/samples/bpf/test_cgrp2_attach2.c +++ b/samples/bpf/test_cgrp2_attach2.c @@ -78,7 +78,8 @@ static int test_foo_bar(void) if (join_cgroup(FOO)) goto err; - if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) { + if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { log_err("Attaching prog to /foo"); goto err; } @@ -97,7 +98,8 @@ static int test_foo_bar(void) printf("Attached DROP prog. This ping in cgroup /foo/bar should fail...\n"); assert(system(PING_CMD) != 0); - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { log_err("Attaching prog to /foo/bar"); goto err; } @@ -114,7 +116,8 @@ static int test_foo_bar(void) "This ping in cgroup /foo/bar should fail...\n"); assert(system(PING_CMD) != 0); - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { log_err("Attaching prog to /foo/bar"); goto err; } @@ -128,7 +131,8 @@ static int test_foo_bar(void) "This ping in cgroup /foo/bar should pass...\n"); assert(system(PING_CMD) == 0); - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { log_err("Attaching prog to /foo/bar"); goto err; } @@ -161,13 +165,15 @@ static int test_foo_bar(void) goto err; } - if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { errno = 0; log_err("Unexpected success attaching overridable prog to /foo/bar"); goto err; } - if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) { + if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { errno = 0; log_err("Unexpected success attaching overridable prog to /foo"); goto err; @@ -273,27 +279,33 @@ static int test_multiprog(void) if (join_cgroup("/cg1/cg2/cg3/cg4/cg5")) goto err; - if (bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, 2)) { + if (bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI)) { log_err("Attaching prog to cg1"); goto err; } - if (!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, 2)) { + if (!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI)) { log_err("Unexpected success attaching the same prog to cg1"); goto err; } - if (bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS, 2)) { + if (bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI)) { log_err("Attaching prog2 to cg1"); goto err; } - if 
(bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS, 1)) { + if (bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { log_err("Attaching prog to cg2"); goto err; } - if (bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS, 2)) { + if (bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI)) { log_err("Attaching prog to cg3"); goto err; } - if (bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS, 1)) { + if (bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { log_err("Attaching prog to cg4"); goto err; } -- cgit v1.2.3 From 56ddd30280d5730ad349c425de7811d596b19476 Mon Sep 17 00:00:00 2001 From: William Tu Date: Fri, 1 Dec 2017 15:26:10 -0800 Subject: samples/bpf: extend test_tunnel_bpf.sh with ip6gre Extend existing tests for vxlan, gre, geneve, ipip, erspan, to include ip6 gre and gretap tunnel. Signed-off-by: William Tu Cc: Alexei Starovoitov Signed-off-by: David S. Miller --- samples/bpf/tcbpf2_kern.c | 43 ++++++++++++++++++++++++++++ samples/bpf/test_tunnel_bpf.sh | 65 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) (limited to 'samples') diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c index 370b749f5ee6..15a469220e19 100644 --- a/samples/bpf/tcbpf2_kern.c +++ b/samples/bpf/tcbpf2_kern.c @@ -81,6 +81,49 @@ int _gre_get_tunnel(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("ip6gretap_set_tunnel") +int _ip6gretap_set_tunnel(struct __sk_buff *skb) +{ + struct bpf_tunnel_key key; + int ret; + + __builtin_memset(&key, 0x0, sizeof(key)); + key.remote_ipv6[3] = _htonl(0x11); /* ::11 */ + key.tunnel_id = 2; + key.tunnel_tos = 0; + key.tunnel_ttl = 64; + key.tunnel_label = 0xabcde; + + ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), + BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + return TC_ACT_OK; +} + +SEC("ip6gretap_get_tunnel") +int _ip6gretap_get_tunnel(struct __sk_buff *skb) +{ + char fmt[] = "key %d remote ip6 ::%x label %x\n"; + struct bpf_tunnel_key key; + int ret; + + ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), + BPF_F_TUNINFO_IPV6); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + bpf_trace_printk(fmt, sizeof(fmt), + key.tunnel_id, key.remote_ipv6[3], key.tunnel_label); + + return TC_ACT_OK; +} + SEC("erspan_set_tunnel") int _erspan_set_tunnel(struct __sk_buff *skb) { diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh index 312e1722a39f..226f45381b76 100755 --- a/samples/bpf/test_tunnel_bpf.sh +++ b/samples/bpf/test_tunnel_bpf.sh @@ -33,6 +33,30 @@ function add_gre_tunnel { ip addr add dev $DEV 10.1.1.200/24 } +function add_ip6gretap_tunnel { + + # assign ipv6 address + ip netns exec at_ns0 ip addr add ::11/96 dev veth0 + ip netns exec at_ns0 ip link set dev veth0 up + ip addr add dev veth1 ::22/96 + ip link set dev veth1 up + + # in namespace + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type $TYPE flowlabel 0xbcdef key 2 \ + local ::11 remote ::22 + + ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 + ip netns exec at_ns0 ip addr add dev $DEV_NS fc80::100/96 + ip netns exec at_ns0 ip link set dev $DEV_NS up + + # out of namespace + ip link add dev $DEV type $TYPE external + ip addr add dev $DEV 10.1.1.200/24 + ip addr add dev $DEV fc80::200/24 + ip link set dev $DEV up +} + function add_erspan_tunnel { # in namespace ip netns exec at_ns0 \ @@ -113,6 +137,41 @@ function test_gre { cleanup } +function 
test_ip6gre { + TYPE=ip6gre + DEV_NS=ip6gre00 + DEV=ip6gre11 + config_device + # reuse the ip6gretap function + add_ip6gretap_tunnel + attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel + # underlay + ping6 -c 4 ::11 + # overlay: ipv4 over ipv6 + ip netns exec at_ns0 ping -c 1 10.1.1.200 + ping -c 1 10.1.1.100 + # overlay: ipv6 over ipv6 + ip netns exec at_ns0 ping6 -c 1 fc80::200 + cleanup +} + +function test_ip6gretap { + TYPE=ip6gretap + DEV_NS=ip6gretap00 + DEV=ip6gretap11 + config_device + add_ip6gretap_tunnel + attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel + # underlay + ping6 -c 4 ::11 + # overlay: ipv4 over ipv6 + ip netns exec at_ns0 ping -i .2 -c 1 10.1.1.200 + ping -c 1 10.1.1.100 + # overlay: ipv6 over ipv6 + ip netns exec at_ns0 ping6 -c 1 fc80::200 + cleanup +} + function test_erspan { TYPE=erspan DEV_NS=erspan00 @@ -175,6 +234,8 @@ function cleanup { ip link del veth1 ip link del ipip11 ip link del gretap11 + ip link del ip6gre11 + ip link del ip6gretap11 ip link del vxlan11 ip link del geneve11 ip link del erspan11 @@ -187,6 +248,10 @@ trap cleanup 0 2 3 6 9 cleanup echo "Testing GRE tunnel..." test_gre +echo "Testing IP6GRE tunnel..." +test_ip6gre +echo "Testing IP6GRETAP tunnel..." +test_ip6gretap echo "Testing ERSPAN tunnel..." test_erspan echo "Testing VXLAN tunnel..." -- cgit v1.2.3 From d37e3bb774ee922a7b5b8c26d798d1575b225858 Mon Sep 17 00:00:00 2001 From: William Tu Date: Tue, 5 Dec 2017 15:15:45 -0800 Subject: samples/bpf: add ip6erspan sample code Extend the existing tests for ip6erspan. Signed-off-by: William Tu Signed-off-by: David S. Miller --- samples/bpf/tcbpf2_kern.c | 58 ++++++++++++++++++++++++++++++++++++++++++ samples/bpf/test_tunnel_bpf.sh | 37 +++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) (limited to 'samples') diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c index 15a469220e19..79ad061079dd 100644 --- a/samples/bpf/tcbpf2_kern.c +++ b/samples/bpf/tcbpf2_kern.c @@ -181,6 +181,64 @@ int _erspan_get_tunnel(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("ip4ip6erspan_set_tunnel") +int _ip4ip6erspan_set_tunnel(struct __sk_buff *skb) +{ + struct bpf_tunnel_key key; + struct erspan_metadata md; + int ret; + + __builtin_memset(&key, 0x0, sizeof(key)); + key.remote_ipv6[3] = _htonl(0x11); + key.tunnel_id = 2; + key.tunnel_tos = 0; + key.tunnel_ttl = 64; + + ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), + BPF_F_TUNINFO_IPV6); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + md.index = htonl(123); + ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md)); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + return TC_ACT_OK; +} + +SEC("ip4ip6erspan_get_tunnel") +int _ip4ip6erspan_get_tunnel(struct __sk_buff *skb) +{ + char fmt[] = "key %d remote ip6 ::%x erspan index 0x%x\n"; + struct bpf_tunnel_key key; + struct erspan_metadata md; + u32 index; + int ret; + + ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md)); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + index = bpf_ntohl(md.index); + bpf_trace_printk(fmt, sizeof(fmt), + key.tunnel_id, key.remote_ipv6[0], index); + + return TC_ACT_OK; +} + SEC("vxlan_set_tunnel") int _vxlan_set_tunnel(struct __sk_buff *skb) { diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh index 226f45381b76..f53efb62f699 100755 --- a/samples/bpf/test_tunnel_bpf.sh +++ 
b/samples/bpf/test_tunnel_bpf.sh @@ -70,6 +70,28 @@ function add_erspan_tunnel { ip addr add dev $DEV 10.1.1.200/24 } +function add_ip6erspan_tunnel { + + # assign ipv6 address + ip netns exec at_ns0 ip addr add ::11/96 dev veth0 + ip netns exec at_ns0 ip link set dev veth0 up + ip addr add dev veth1 ::22/96 + ip link set dev veth1 up + + # in namespace + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type $TYPE seq key 2 erspan 123 \ + local ::11 remote ::22 + + ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 + ip netns exec at_ns0 ip link set dev $DEV_NS up + + # out of namespace + ip link add dev $DEV type $TYPE external + ip addr add dev $DEV 10.1.1.200/24 + ip link set dev $DEV up +} + function add_vxlan_tunnel { # Set static ARP entry here because iptables set-mark works # on L3 packet, as a result not applying to ARP packets, @@ -184,6 +206,18 @@ function test_erspan { cleanup } +function test_ip6erspan { + TYPE=ip6erspan + DEV_NS=ip6erspan00 + DEV=ip6erspan11 + config_device + add_ip6erspan_tunnel + attach_bpf $DEV ip4ip6erspan_set_tunnel ip4ip6erspan_get_tunnel + ping6 -c 3 ::11 + ip netns exec at_ns0 ping -c 1 10.1.1.200 + cleanup +} + function test_vxlan { TYPE=vxlan DEV_NS=vxlan00 @@ -239,6 +273,7 @@ function cleanup { ip link del vxlan11 ip link del geneve11 ip link del erspan11 + ip link del ip6erspan11 pkill tcpdump pkill cat set -ex @@ -254,6 +289,8 @@ echo "Testing IP6GRETAP tunnel..." test_ip6gretap echo "Testing ERSPAN tunnel..." test_erspan +echo "Testing IP6ERSPAN tunnel..." +test_ip6erspan echo "Testing VXLAN tunnel..." test_vxlan echo "Testing GENEVE tunnel..." -- cgit v1.2.3 From 965de87e54b803223bff703ea6b2a76c056695ae Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 11 Dec 2017 11:36:49 -0500 Subject: samples/bpf: add a test for bpf_override_return This adds a basic test for bpf_override_return to verify it works. We override the main function for mounting a btrfs fs so it'll return -ENOMEM and then make sure that trying to mount a btrfs fs will fail. 
Acked-by: Alexei Starovoitov Acked-by: Ingo Molnar Signed-off-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- samples/bpf/Makefile | 4 ++++ samples/bpf/test_override_return.sh | 15 +++++++++++++++ samples/bpf/tracex7_kern.c | 16 ++++++++++++++++ samples/bpf/tracex7_user.c | 28 ++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 7 ++++++- tools/testing/selftests/bpf/bpf_helpers.h | 3 ++- 6 files changed, 71 insertions(+), 2 deletions(-) create mode 100755 samples/bpf/test_override_return.sh create mode 100644 samples/bpf/tracex7_kern.c create mode 100644 samples/bpf/tracex7_user.c (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index adeaa1302f34..4fb944a7ecf8 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -12,6 +12,7 @@ hostprogs-y += tracex3 hostprogs-y += tracex4 hostprogs-y += tracex5 hostprogs-y += tracex6 +hostprogs-y += tracex7 hostprogs-y += test_probe_write_user hostprogs-y += trace_output hostprogs-y += lathist @@ -58,6 +59,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o +tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o @@ -101,6 +103,7 @@ always += tracex3_kern.o always += tracex4_kern.o always += tracex5_kern.o always += tracex6_kern.o +always += tracex7_kern.o always += sock_flags_kern.o always += test_probe_write_user_kern.o always += trace_output_kern.o @@ -155,6 +158,7 @@ HOSTLOADLIBES_tracex3 += -lelf HOSTLOADLIBES_tracex4 += -lelf -lrt HOSTLOADLIBES_tracex5 += -lelf HOSTLOADLIBES_tracex6 += -lelf +HOSTLOADLIBES_tracex7 += -lelf HOSTLOADLIBES_test_cgrp2_sock2 += -lelf HOSTLOADLIBES_load_sock_ops += -lelf HOSTLOADLIBES_test_probe_write_user += -lelf diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh new file mode 100755 index 000000000000..e68b9ee6814b --- /dev/null +++ b/samples/bpf/test_override_return.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +rm -f testfile.img +dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 +DEVICE=$(losetup --show -f testfile.img) +mkfs.btrfs -f $DEVICE +mkdir tmpmnt +./tracex7 $DEVICE +if [ $? -eq 0 ] +then + echo "SUCCESS!" +else + echo "FAILED!" 
+fi +losetup -d $DEVICE diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c new file mode 100644 index 000000000000..1ab308a43e0f --- /dev/null +++ b/samples/bpf/tracex7_kern.c @@ -0,0 +1,16 @@ +#include +#include +#include +#include "bpf_helpers.h" + +SEC("kprobe/open_ctree") +int bpf_prog1(struct pt_regs *ctx) +{ + unsigned long rc = -12; + + bpf_override_return(ctx, rc); + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c new file mode 100644 index 000000000000..8a52ac492e8b --- /dev/null +++ b/samples/bpf/tracex7_user.c @@ -0,0 +1,28 @@ +#define _GNU_SOURCE + +#include +#include +#include +#include "libbpf.h" +#include "bpf_load.h" + +int main(int argc, char **argv) +{ + FILE *f; + char filename[256]; + char command[256]; + int ret; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + snprintf(command, 256, "mount %s tmpmnt/", argv[1]); + f = popen(command, "r"); + ret = pclose(f); + + return ret ? 0 : 1; +} diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4c223ab30293..cf446c25c0ec 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -677,6 +677,10 @@ union bpf_attr { * @buf: buf to fill * @buf_size: size of the buf * Return : 0 on success or negative error code + * + * int bpf_override_return(pt_regs, rc) + * @pt_regs: pointer to struct pt_regs + * @rc: the return value to set */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -736,7 +740,8 @@ union bpf_attr { FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ FN(perf_prog_read_value), \ - FN(getsockopt), + FN(getsockopt), \ + FN(override_return), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index fd9a17fa8a8b..33cb00e46c49 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -82,7 +82,8 @@ static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags, static int (*bpf_perf_prog_read_value)(void *ctx, void *buf, unsigned int buf_size) = (void *) BPF_FUNC_perf_prog_read_value; - +static int (*bpf_override_return)(void *ctx, unsigned long rc) = + (void *) BPF_FUNC_override_return; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions -- cgit v1.2.3 From ac80c2a165af02a0cca3d17d534a85d37fdc1271 Mon Sep 17 00:00:00 2001 From: William Tu Date: Wed, 13 Dec 2017 16:38:58 -0800 Subject: samples/bpf: add erspan v2 sample code Extend the existing tests for ipv4 ipv6 erspan version 2. Signed-off-by: William Tu Signed-off-by: David S. 
Miller --- samples/bpf/tcbpf2_kern.c | 77 +++++++++++++++++++++++++++++++++++++----- samples/bpf/test_tunnel_bpf.sh | 38 +++++++++++++++------ 2 files changed, 96 insertions(+), 19 deletions(-) (limited to 'samples') diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c index 79ad061079dd..f6bbf8f50da3 100644 --- a/samples/bpf/tcbpf2_kern.c +++ b/samples/bpf/tcbpf2_kern.c @@ -35,12 +35,22 @@ struct geneve_opt { u8 opt_data[8]; /* hard-coded to 8 byte */ }; +struct erspan_md2 { + __be32 timestamp; + __be16 sgt; + __be16 flags; +}; + struct vxlan_metadata { u32 gbp; }; struct erspan_metadata { - __be32 index; + union { + __be32 index; + struct erspan_md2 md2; + } u; + int version; }; SEC("gre_set_tunnel") @@ -143,7 +153,18 @@ int _erspan_set_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - md.index = htonl(123); + __builtin_memset(&md, 0, sizeof(md)); +#ifdef ERSPAN_V1 + md.version = 1; + md.u.index = htonl(123); +#else + u8 direction = 1; + u16 hwid = 7; + + md.version = 2; + md.u.md2.flags = htons((direction << 3) | (hwid << 4)); +#endif + ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md)); if (ret < 0) { ERROR(ret); @@ -156,7 +177,7 @@ int _erspan_set_tunnel(struct __sk_buff *skb) SEC("erspan_get_tunnel") int _erspan_get_tunnel(struct __sk_buff *skb) { - char fmt[] = "key %d remote ip 0x%x erspan index 0x%x\n"; + char fmt[] = "key %d remote ip 0x%x erspan version %d\n"; struct bpf_tunnel_key key; struct erspan_metadata md; u32 index; @@ -174,9 +195,22 @@ int _erspan_get_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - index = bpf_ntohl(md.index); bpf_trace_printk(fmt, sizeof(fmt), - key.tunnel_id, key.remote_ipv4, index); + key.tunnel_id, key.remote_ipv4, md.version); + +#ifdef ERSPAN_V1 + char fmt2[] = "\tindex %x\n"; + + index = bpf_ntohl(md.u.index); + bpf_trace_printk(fmt2, sizeof(fmt2), index); +#else + char fmt2[] = "\tdirection %d hwid %x timestamp %u\n"; + + bpf_trace_printk(fmt2, sizeof(fmt2), + (ntohs(md.u.md2.flags) >> 3) & 0x1, + (ntohs(md.u.md2.flags) >> 4) & 0x3f, + bpf_ntohl(md.u.md2.timestamp)); +#endif return TC_ACT_OK; } @@ -201,7 +235,19 @@ int _ip4ip6erspan_set_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - md.index = htonl(123); + __builtin_memset(&md, 0, sizeof(md)); + +#ifdef ERSPAN_V1 + md.u.index = htonl(123); + md.version = 1; +#else + u8 direction = 0; + u16 hwid = 17; + + md.version = 2; + md.u.md2.flags = htons((direction << 3) | (hwid << 4)); +#endif + ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md)); if (ret < 0) { ERROR(ret); @@ -214,7 +260,7 @@ int _ip4ip6erspan_set_tunnel(struct __sk_buff *skb) SEC("ip4ip6erspan_get_tunnel") int _ip4ip6erspan_get_tunnel(struct __sk_buff *skb) { - char fmt[] = "key %d remote ip6 ::%x erspan index 0x%x\n"; + char fmt[] = "ip6erspan get key %d remote ip6 ::%x erspan version %d\n"; struct bpf_tunnel_key key; struct erspan_metadata md; u32 index; @@ -232,9 +278,22 @@ int _ip4ip6erspan_get_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - index = bpf_ntohl(md.index); bpf_trace_printk(fmt, sizeof(fmt), - key.tunnel_id, key.remote_ipv6[0], index); + key.tunnel_id, key.remote_ipv4, md.version); + +#ifdef ERSPAN_V1 + char fmt2[] = "\tindex %x\n"; + + index = bpf_ntohl(md.u.index); + bpf_trace_printk(fmt2, sizeof(fmt2), index); +#else + char fmt2[] = "\tdirection %d hwid %x timestamp %u\n"; + + bpf_trace_printk(fmt2, sizeof(fmt2), + (ntohs(md.u.md2.flags) >> 3) & 0x1, + (ntohs(md.u.md2.flags) >> 4) & 0x3f, + bpf_ntohl(md.u.md2.timestamp)); +#endif return TC_ACT_OK; } diff --git 
a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh index f53efb62f699..ae7f7c38309b 100755 --- a/samples/bpf/test_tunnel_bpf.sh +++ b/samples/bpf/test_tunnel_bpf.sh @@ -59,8 +59,17 @@ function add_ip6gretap_tunnel { function add_erspan_tunnel { # in namespace - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE seq key 2 local 172.16.1.100 remote 172.16.1.200 erspan 123 + if [ "$1" == "v1" ]; then + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type $TYPE seq key 2 \ + local 172.16.1.100 remote 172.16.1.200 \ + erspan_ver 1 erspan 123 + else + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type $TYPE seq key 2 \ + local 172.16.1.100 remote 172.16.1.200 \ + erspan_ver 2 erspan_dir 1 erspan_hwid 3 + fi ip netns exec at_ns0 ip link set dev $DEV_NS up ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 @@ -79,10 +88,17 @@ function add_ip6erspan_tunnel { ip link set dev veth1 up # in namespace - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE seq key 2 erspan 123 \ - local ::11 remote ::22 - + if [ "$1" == "v1" ]; then + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type $TYPE seq key 2 \ + local ::11 remote ::22 \ + erspan_ver 1 erspan 123 + else + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type $TYPE seq key 2 \ + local ::11 remote ::22 \ + erspan_ver 2 erspan_dir 1 erspan_hwid 7 + fi ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 ip netns exec at_ns0 ip link set dev $DEV_NS up @@ -199,7 +215,7 @@ function test_erspan { DEV_NS=erspan00 DEV=erspan11 config_device - add_erspan_tunnel + add_erspan_tunnel $1 attach_bpf $DEV erspan_set_tunnel erspan_get_tunnel ping -c 1 10.1.1.100 ip netns exec at_ns0 ping -c 1 10.1.1.200 @@ -211,7 +227,7 @@ function test_ip6erspan { DEV_NS=ip6erspan00 DEV=ip6erspan11 config_device - add_ip6erspan_tunnel + add_ip6erspan_tunnel $1 attach_bpf $DEV ip4ip6erspan_set_tunnel ip4ip6erspan_get_tunnel ping6 -c 3 ::11 ip netns exec at_ns0 ping -c 1 10.1.1.200 @@ -288,9 +304,11 @@ test_ip6gre echo "Testing IP6GRETAP tunnel..." test_ip6gretap echo "Testing ERSPAN tunnel..." -test_erspan +test_erspan v1 +test_erspan v2 echo "Testing IP6ERSPAN tunnel..." -test_ip6erspan +test_ip6erspan v1 +test_ip6erspan v2 echo "Testing VXLAN tunnel..." test_vxlan echo "Testing GENEVE tunnel..." -- cgit v1.2.3 From 0fca931a6f21c11f675363b92b5a4fe86da59f30 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 3 Jan 2018 11:26:19 +0100 Subject: samples/bpf: program demonstrating access to xdp_rxq_info This sample program can be used for monitoring and reporting how many packets per sec (pps) are received per NIC RX queue index and which CPU processed the packet. In itself it is a useful tool for quickly identifying RSS imbalance issues, see below. The default XDP action is XDP_PASS in-order to provide a monitor mode. For benchmarking purposes it is possible to specify other XDP actions on the cmdline --action. Output below shows an imbalance RSS case where most RXQ's deliver to CPU-0 while CPU-2 only get packets from a single RXQ. Looking at things from a CPU level the two CPUs are processing approx the same amount, BUT looking at the rx_queue_index levels it is clear that RXQ-2 receive much better service, than other RXQs which all share CPU-0. 
Running XDP on dev:i40e1 (ifindex:3) action:XDP_PASS XDP stats CPU pps issue-pps XDP-RX CPU 0 900,473 0 XDP-RX CPU 2 906,921 0 XDP-RX CPU total 1,807,395 RXQ stats RXQ:CPU pps issue-pps rx_queue_index 0:0 180,098 0 rx_queue_index 0:sum 180,098 rx_queue_index 1:0 180,098 0 rx_queue_index 1:sum 180,098 rx_queue_index 2:2 906,921 0 rx_queue_index 2:sum 906,921 rx_queue_index 3:0 180,098 0 rx_queue_index 3:sum 180,098 rx_queue_index 4:0 180,082 0 rx_queue_index 4:sum 180,082 rx_queue_index 5:0 180,093 0 rx_queue_index 5:sum 180,093 Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- samples/bpf/Makefile | 4 + samples/bpf/xdp_rxq_info_kern.c | 96 ++++++++ samples/bpf/xdp_rxq_info_user.c | 531 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 631 insertions(+) create mode 100644 samples/bpf/xdp_rxq_info_kern.c create mode 100644 samples/bpf/xdp_rxq_info_user.c (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 4fb944a7ecf8..3ff7a05bea9a 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -41,6 +41,7 @@ hostprogs-y += xdp_redirect hostprogs-y += xdp_redirect_map hostprogs-y += xdp_redirect_cpu hostprogs-y += xdp_monitor +hostprogs-y += xdp_rxq_info hostprogs-y += syscall_tp # Libbpf dependencies @@ -90,6 +91,7 @@ xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o +xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o # Tell kbuild to always build the programs @@ -139,6 +141,7 @@ always += xdp_redirect_kern.o always += xdp_redirect_map_kern.o always += xdp_redirect_cpu_kern.o always += xdp_monitor_kern.o +always += xdp_rxq_info_kern.o always += syscall_tp_kern.o HOSTCFLAGS += -I$(objtree)/usr/include @@ -182,6 +185,7 @@ HOSTLOADLIBES_xdp_redirect += -lelf HOSTLOADLIBES_xdp_redirect_map += -lelf HOSTLOADLIBES_xdp_redirect_cpu += -lelf HOSTLOADLIBES_xdp_monitor += -lelf +HOSTLOADLIBES_xdp_rxq_info += -lelf HOSTLOADLIBES_syscall_tp += -lelf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: diff --git a/samples/bpf/xdp_rxq_info_kern.c b/samples/bpf/xdp_rxq_info_kern.c new file mode 100644 index 000000000000..3fd209291653 --- /dev/null +++ b/samples/bpf/xdp_rxq_info_kern.c @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. 
+ * + * Example howto extract XDP RX-queue info + */ +#include +#include "bpf_helpers.h" + +/* Config setup from with userspace + * + * User-side setup ifindex in config_map, to verify that + * ctx->ingress_ifindex is correct (against configured ifindex) + */ +struct config { + __u32 action; + int ifindex; +}; +struct bpf_map_def SEC("maps") config_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(struct config), + .max_entries = 1, +}; + +/* Common stats data record (shared with userspace) */ +struct datarec { + __u64 processed; + __u64 issue; +}; + +struct bpf_map_def SEC("maps") stats_global_map = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 1, +}; + +#define MAX_RXQs 64 + +/* Stats per rx_queue_index (per CPU) */ +struct bpf_map_def SEC("maps") rx_queue_index_map = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = MAX_RXQs + 1, +}; + +SEC("xdp_prog0") +int xdp_prognum0(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct datarec *rec, *rxq_rec; + int ingress_ifindex; + struct config *config; + u32 key = 0; + + /* Global stats record */ + rec = bpf_map_lookup_elem(&stats_global_map, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + /* Accessing ctx->ingress_ifindex, cause BPF to rewrite BPF + * instructions inside kernel to access xdp_rxq->dev->ifindex + */ + ingress_ifindex = ctx->ingress_ifindex; + + config = bpf_map_lookup_elem(&config_map, &key); + if (!config) + return XDP_ABORTED; + + /* Simple test: check ctx provided ifindex is as expected */ + if (ingress_ifindex != config->ifindex) { + /* count this error case */ + rec->issue++; + return XDP_ABORTED; + } + + /* Update stats per rx_queue_index. Handle if rx_queue_index + * is larger than stats map can contain info for. + */ + key = ctx->rx_queue_index; + if (key >= MAX_RXQs) + key = MAX_RXQs; + rxq_rec = bpf_map_lookup_elem(&rx_queue_index_map, &key); + if (!rxq_rec) + return XDP_ABORTED; + rxq_rec->processed++; + if (key == MAX_RXQs) + rxq_rec->issue++; + + return config->action; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c new file mode 100644 index 000000000000..32430e8b3a6a --- /dev/null +++ b/samples/bpf/xdp_rxq_info_user.c @@ -0,0 +1,531 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. 
+ */ +static const char *__doc__ = " XDP RX-queue info extract example\n\n" + "Monitor how many packets per sec (pps) are received\n" + "per NIC RX queue index and which CPU processed the packet\n" + ; + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "libbpf.h" +#include "bpf_load.h" +#include "bpf_util.h" + +static int ifindex = -1; +static char ifname_buf[IF_NAMESIZE]; +static char *ifname; + +static __u32 xdp_flags; + +/* Exit return codes */ +#define EXIT_OK 0 +#define EXIT_FAIL 1 +#define EXIT_FAIL_OPTION 2 +#define EXIT_FAIL_XDP 3 +#define EXIT_FAIL_BPF 4 +#define EXIT_FAIL_MEM 5 + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"dev", required_argument, NULL, 'd' }, + {"skb-mode", no_argument, NULL, 'S' }, + {"sec", required_argument, NULL, 's' }, + {"no-separators", no_argument, NULL, 'z' }, + {"action", required_argument, NULL, 'a' }, + {0, 0, NULL, 0 } +}; + +static void int_exit(int sig) +{ + fprintf(stderr, + "Interrupted: Removing XDP program on ifindex:%d device:%s\n", + ifindex, ifname); + if (ifindex > -1) + set_link_xdp_fd(ifindex, -1, xdp_flags); + exit(EXIT_OK); +} + +struct config { + __u32 action; + int ifindex; +}; +#define XDP_ACTION_MAX (XDP_TX + 1) +#define XDP_ACTION_MAX_STRLEN 11 +static const char *xdp_action_names[XDP_ACTION_MAX] = { + [XDP_ABORTED] = "XDP_ABORTED", + [XDP_DROP] = "XDP_DROP", + [XDP_PASS] = "XDP_PASS", + [XDP_TX] = "XDP_TX", +}; + +static const char *action2str(int action) +{ + if (action < XDP_ACTION_MAX) + return xdp_action_names[action]; + return NULL; +} + +static int parse_xdp_action(char *action_str) +{ + size_t maxlen; + __u64 action = -1; + int i; + + for (i = 0; i < XDP_ACTION_MAX; i++) { + maxlen = XDP_ACTION_MAX_STRLEN; + if (strncmp(xdp_action_names[i], action_str, maxlen) == 0) { + action = i; + break; + } + } + return action; +} + +static void list_xdp_actions(void) +{ + int i; + + printf("Available XDP --action \n"); + for (i = 0; i < XDP_ACTION_MAX; i++) + printf("\t%s\n", xdp_action_names[i]); + printf("\n"); +} + +static void usage(char *argv[]) +{ + int i; + + printf("\nDOCUMENTATION:\n%s\n", __doc__); + printf(" Usage: %s (options-see-below)\n", argv[0]); + printf(" Listing options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-12s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value:%d)", + *long_options[i].flag); + else + printf(" short-option: -%c", + long_options[i].val); + printf("\n"); + } + printf("\n"); + list_xdp_actions(); +} + +#define NANOSEC_PER_SEC 1000000000 /* 10^9 */ +static __u64 gettime(void) +{ + struct timespec t; + int res; + + res = clock_gettime(CLOCK_MONOTONIC, &t); + if (res < 0) { + fprintf(stderr, "Error with gettimeofday! 
(%i)\n", res); + exit(EXIT_FAIL); + } + return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; +} + +/* Common stats data record shared with _kern.c */ +struct datarec { + __u64 processed; + __u64 issue; +}; +struct record { + __u64 timestamp; + struct datarec total; + struct datarec *cpu; +}; +struct stats_record { + struct record stats; + struct record *rxq; +}; + +static struct datarec *alloc_record_per_cpu(void) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec *array; + size_t size; + + size = sizeof(struct datarec) * nr_cpus; + array = malloc(size); + memset(array, 0, size); + if (!array) { + fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); + exit(EXIT_FAIL_MEM); + } + return array; +} + +static struct record *alloc_record_per_rxq(void) +{ + unsigned int nr_rxqs = map_data[2].def.max_entries; + struct record *array; + size_t size; + + size = sizeof(struct record) * nr_rxqs; + array = malloc(size); + memset(array, 0, size); + if (!array) { + fprintf(stderr, "Mem alloc error (nr_rxqs:%u)\n", nr_rxqs); + exit(EXIT_FAIL_MEM); + } + return array; +} + +static struct stats_record *alloc_stats_record(void) +{ + unsigned int nr_rxqs = map_data[2].def.max_entries; + struct stats_record *rec; + int i; + + rec = malloc(sizeof(*rec)); + memset(rec, 0, sizeof(*rec)); + if (!rec) { + fprintf(stderr, "Mem alloc error\n"); + exit(EXIT_FAIL_MEM); + } + rec->rxq = alloc_record_per_rxq(); + for (i = 0; i < nr_rxqs; i++) + rec->rxq[i].cpu = alloc_record_per_cpu(); + + rec->stats.cpu = alloc_record_per_cpu(); + return rec; +} + +static void free_stats_record(struct stats_record *r) +{ + unsigned int nr_rxqs = map_data[2].def.max_entries; + int i; + + for (i = 0; i < nr_rxqs; i++) + free(r->rxq[i].cpu); + + free(r->rxq); + free(r->stats.cpu); + free(r); +} + +static bool map_collect_percpu(int fd, __u32 key, struct record *rec) +{ + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec values[nr_cpus]; + __u64 sum_processed = 0; + __u64 sum_issue = 0; + int i; + + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return false; + } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); + + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = values[i].processed; + sum_processed += values[i].processed; + rec->cpu[i].issue = values[i].issue; + sum_issue += values[i].issue; + } + rec->total.processed = sum_processed; + rec->total.issue = sum_issue; + return true; +} + +static void stats_collect(struct stats_record *rec) +{ + int fd, i, max_rxqs; + + fd = map_data[1].fd; /* map: stats_global_map */ + map_collect_percpu(fd, 0, &rec->stats); + + fd = map_data[2].fd; /* map: rx_queue_index_map */ + max_rxqs = map_data[2].def.max_entries; + for (i = 0; i < max_rxqs; i++) + map_collect_percpu(fd, i, &rec->rxq[i]); +} + +static double calc_period(struct record *r, struct record *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double) period / NANOSEC_PER_SEC); + + return period_; +} + +static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->processed - p->processed; + pps = packets / period_; + } + return pps; +} + +static __u64 calc_errs_pps(struct datarec *r, + struct datarec *p, 
double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->issue - p->issue; + pps = packets / period_; + } + return pps; +} + +static void stats_print(struct stats_record *stats_rec, + struct stats_record *stats_prev, + int action) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + unsigned int nr_rxqs = map_data[2].def.max_entries; + double pps = 0, err = 0; + struct record *rec, *prev; + double t; + int rxq; + int i; + + /* Header */ + printf("\nRunning XDP on dev:%s (ifindex:%d) action:%s\n", + ifname, ifindex, action2str(action)); + + /* stats_global_map */ + { + char *fmt_rx = "%-15s %-7d %'-11.0f %'-10.0f %s\n"; + char *fm2_rx = "%-15s %-7s %'-11.0f\n"; + char *errstr = ""; + + printf("%-15s %-7s %-11s %-11s\n", + "XDP stats", "CPU", "pps", "issue-pps"); + + rec = &stats_rec->stats; + prev = &stats_prev->stats; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps (r, p, t); + err = calc_errs_pps(r, p, t); + if (err > 0) + errstr = "invalid-ifindex"; + if (pps > 0) + printf(fmt_rx, "XDP-RX CPU", + i, pps, err, errstr); + } + pps = calc_pps (&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + printf(fm2_rx, "XDP-RX CPU", "total", pps, err); + } + + /* rx_queue_index_map */ + printf("\n%-15s %-7s %-11s %-11s\n", + "RXQ stats", "RXQ:CPU", "pps", "issue-pps"); + + for (rxq = 0; rxq < nr_rxqs; rxq++) { + char *fmt_rx = "%-15s %3d:%-3d %'-11.0f %'-10.0f %s\n"; + char *fm2_rx = "%-15s %3d:%-3s %'-11.0f\n"; + char *errstr = ""; + int rxq_ = rxq; + + /* Last RXQ in map catch overflows */ + if (rxq_ == nr_rxqs - 1) + rxq_ = -1; + + rec = &stats_rec->rxq[rxq]; + prev = &stats_prev->rxq[rxq]; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps (r, p, t); + err = calc_errs_pps(r, p, t); + if (err > 0) { + if (rxq_ == -1) + errstr = "map-overflow-RXQ"; + else + errstr = "err"; + } + if (pps > 0) + printf(fmt_rx, "rx_queue_index", + rxq_, i, pps, err, errstr); + } + pps = calc_pps (&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + if (pps || err) + printf(fm2_rx, "rx_queue_index", rxq_, "sum", pps, err); + } +} + + +/* Pointer swap trick */ +static inline void swap(struct stats_record **a, struct stats_record **b) +{ + struct stats_record *tmp; + + tmp = *a; + *a = *b; + *b = tmp; +} + +static void stats_poll(int interval, int action) +{ + struct stats_record *record, *prev; + + record = alloc_stats_record(); + prev = alloc_stats_record(); + stats_collect(record); + + while (1) { + swap(&prev, &record); + stats_collect(record); + stats_print(record, prev, action); + sleep(interval); + } + + free_stats_record(record); + free_stats_record(prev); +} + + +int main(int argc, char **argv) +{ + struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; + bool use_separators = true; + struct config cfg = { 0 }; + char filename[256]; + int longindex = 0; + int interval = 2; + __u32 key = 0; + int opt, err; + + char action_str_buf[XDP_ACTION_MAX_STRLEN + 1 /* for \0 */] = { 0 }; + int action = XDP_PASS; /* Default action */ + char *action_str = NULL; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + if (load_bpf_file(filename)) { + fprintf(stderr, "ERR in load_bpf_file(): %s", bpf_log_buf); 
+ return EXIT_FAIL; + } + + if (!prog_fd[0]) { + fprintf(stderr, "ERR: load_bpf_file: %s\n", strerror(errno)); + return EXIT_FAIL; + } + + /* Parse commands line args */ + while ((opt = getopt_long(argc, argv, "hSd:", + long_options, &longindex)) != -1) { + switch (opt) { + case 'd': + if (strlen(optarg) >= IF_NAMESIZE) { + fprintf(stderr, "ERR: --dev name too long\n"); + goto error; + } + ifname = (char *)&ifname_buf; + strncpy(ifname, optarg, IF_NAMESIZE); + ifindex = if_nametoindex(ifname); + if (ifindex == 0) { + fprintf(stderr, + "ERR: --dev name unknown err(%d):%s\n", + errno, strerror(errno)); + goto error; + } + break; + case 's': + interval = atoi(optarg); + break; + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'z': + use_separators = false; + break; + case 'a': + action_str = (char *)&action_str_buf; + strncpy(action_str, optarg, XDP_ACTION_MAX_STRLEN); + break; + case 'h': + error: + default: + usage(argv); + return EXIT_FAIL_OPTION; + } + } + /* Required option */ + if (ifindex == -1) { + fprintf(stderr, "ERR: required option --dev missing\n"); + usage(argv); + return EXIT_FAIL_OPTION; + } + cfg.ifindex = ifindex; + + /* Parse action string */ + if (action_str) { + action = parse_xdp_action(action_str); + if (action < 0) { + fprintf(stderr, "ERR: Invalid XDP --action: %s\n", + action_str); + list_xdp_actions(); + return EXIT_FAIL_OPTION; + } + } + cfg.action = action; + + /* Trick to pretty printf with thousands separators use %' */ + if (use_separators) + setlocale(LC_NUMERIC, "en_US"); + + /* User-side setup ifindex in config_map */ + err = bpf_map_update_elem(map_fd[0], &key, &cfg, 0); + if (err) { + fprintf(stderr, "Store config failed (err:%d)\n", err); + exit(EXIT_FAIL_BPF); + } + + /* Remove XDP program when program is interrupted */ + signal(SIGINT, int_exit); + + if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) { + fprintf(stderr, "link set xdp fd failed\n"); + return EXIT_FAIL_XDP; + } + + stats_poll(interval, action); + return EXIT_OK; +} -- cgit v1.2.3 From 36e04a2d78d97cc3a02a168541dfa00c8e4b30f2 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 10 Jan 2018 18:21:44 +0100 Subject: samples/bpf: xdp2skb_meta shows transferring info from XDP to SKB Creating a bpf sample that shows howto use the XDP 'data_meta' infrastructure, created by Daniel Borkmann. Very few drivers support this feature, but I wanted a functional sample to begin with, when working on adding driver support. XDP data_meta is about creating a communication channel between BPF programs. This can be XDP tail-progs, but also other SKB based BPF hooks, like in this case the TC clsact hook. In this sample I show that XDP can store info named "mark", and TC/clsact chooses to use this info and store it into the skb->mark. It is a bit annoying that XDP and TC samples uses different tools/libs when attaching their BPF hooks. As the XDP and TC programs need to cooperate and agree on a struct-layout, it is best/easiest if the two programs can be contained within the same BPF restricted-C file. As the bpf-loader, I choose to not use bpf_load.c (or libbpf), but instead wrote a bash shell scripted named xdp2skb_meta.sh, which demonstrate howto use the iproute cmdline tools 'tc' and 'ip' for loading BPF programs. 
To make it easy for first time users, the shell script have command line parsing, and support --verbose and --dry-run mode, if you just want to see/learn the tc+ip command syntax: # ./xdp2skb_meta.sh --dev ixgbe2 --dry-run # Dry-run mode: enable VERBOSE and don't call TC+IP tc qdisc del dev ixgbe2 clsact tc qdisc add dev ixgbe2 clsact tc filter add dev ixgbe2 ingress prio 1 handle 1 bpf da obj ./xdp2skb_meta_kern.o sec tc_mark # Flush XDP on device: ixgbe2 ip link set dev ixgbe2 xdp off ip link set dev ixgbe2 xdp obj ./xdp2skb_meta_kern.o sec xdp_mark Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- samples/bpf/Makefile | 1 + samples/bpf/xdp2skb_meta.sh | 220 ++++++++++++++++++++++++++++++++++++++++ samples/bpf/xdp2skb_meta_kern.c | 103 +++++++++++++++++++ 3 files changed, 324 insertions(+) create mode 100755 samples/bpf/xdp2skb_meta.sh create mode 100644 samples/bpf/xdp2skb_meta_kern.c (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 3ff7a05bea9a..7f61a3d57fa7 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -142,6 +142,7 @@ always += xdp_redirect_map_kern.o always += xdp_redirect_cpu_kern.o always += xdp_monitor_kern.o always += xdp_rxq_info_kern.o +always += xdp2skb_meta_kern.o always += syscall_tp_kern.o HOSTCFLAGS += -I$(objtree)/usr/include diff --git a/samples/bpf/xdp2skb_meta.sh b/samples/bpf/xdp2skb_meta.sh new file mode 100755 index 000000000000..b9c9549c4c27 --- /dev/null +++ b/samples/bpf/xdp2skb_meta.sh @@ -0,0 +1,220 @@ +#!/bin/bash +# +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc. +# +# Bash-shell example on using iproute2 tools 'tc' and 'ip' to load +# eBPF programs, both for XDP and clsbpf. Shell script function +# wrappers and even long options parsing is illustrated, for ease of +# use. +# +# Related to sample/bpf/xdp2skb_meta_kern.c, which contains BPF-progs +# that need to collaborate between XDP and TC hooks. Thus, it is +# convenient that the same tool load both programs that need to work +# together. +# +BPF_FILE=xdp2skb_meta_kern.o +DIR=$(dirname $0) + +export TC=/usr/sbin/tc +export IP=/usr/sbin/ip + +function usage() { + echo "" + echo "Usage: $0 [-vfh] --dev ethX" + echo " -d | --dev : Network device (required)" + echo " --flush : Cleanup flush TC and XDP progs" + echo " --list : (\$LIST) List TC and XDP progs" + echo " -v | --verbose : (\$VERBOSE) Verbose" + echo " --dry-run : (\$DRYRUN) Dry-run only (echo commands)" + echo "" +} + +## -- General shell logging cmds -- +function err() { + local exitcode=$1 + shift + echo "ERROR: $@" >&2 + exit $exitcode +} + +function info() { + if [[ -n "$VERBOSE" ]]; then + echo "# $@" + fi +} + +## -- Helper function calls -- + +# Wrapper call for TC and IP +# - Will display the offending command on failure +function _call_cmd() { + local cmd="$1" + local allow_fail="$2" + shift 2 + if [[ -n "$VERBOSE" ]]; then + echo "$(basename $cmd) $@" + fi + if [[ -n "$DRYRUN" ]]; then + return + fi + $cmd "$@" + local status=$? 
+ if (( $status != 0 )); then + if [[ "$allow_fail" == "" ]]; then + err 2 "Exec error($status) occurred cmd: \"$cmd $@\"" + fi + fi +} +function call_tc() { + _call_cmd "$TC" "" "$@" +} +function call_tc_allow_fail() { + _call_cmd "$TC" "allow_fail" "$@" +} +function call_ip() { + _call_cmd "$IP" "" "$@" +} + +## --- Parse command line arguments / parameters --- +# Using external program "getopt" to get --long-options +OPTIONS=$(getopt -o vfhd: \ + --long verbose,flush,help,list,dev:,dry-run -- "$@") +if (( $? != 0 )); then + err 4 "Error calling getopt" +fi +eval set -- "$OPTIONS" + +unset DEV +unset FLUSH +while true; do + case "$1" in + -d | --dev ) # device + DEV=$2 + info "Device set to: DEV=$DEV" >&2 + shift 2 + ;; + -v | --verbose) + VERBOSE=yes + # info "Verbose mode: VERBOSE=$VERBOSE" >&2 + shift + ;; + --dry-run ) + DRYRUN=yes + VERBOSE=yes + info "Dry-run mode: enable VERBOSE and don't call TC+IP" >&2 + shift + ;; + -f | --flush ) + FLUSH=yes + shift + ;; + --list ) + LIST=yes + shift + ;; + -- ) + shift + break + ;; + -h | --help ) + usage; + exit 0 + ;; + * ) + shift + break + ;; + esac +done + +FILE="$DIR/$BPF_FILE" +if [[ ! -e $FILE ]]; then + err 3 "Missing BPF object file ($FILE)" +fi + +if [[ -z $DEV ]]; then + usage + err 2 "Please specify network device -- required option --dev" +fi + +## -- Function calls -- + +function list_tc() +{ + local device="$1" + shift + info "Listing current TC ingress rules" + call_tc filter show dev $device ingress +} + +function list_xdp() +{ + local device="$1" + shift + info "Listing current XDP device($device) setting" + call_ip link show dev $device | grep --color=auto xdp +} + +function flush_tc() +{ + local device="$1" + shift + info "Flush TC on device: $device" + call_tc_allow_fail filter del dev $device ingress + call_tc_allow_fail qdisc del dev $device clsact +} + +function flush_xdp() +{ + local device="$1" + shift + info "Flush XDP on device: $device" + call_ip link set dev $device xdp off +} + +function attach_tc_mark() +{ + local device="$1" + local file="$2" + local prog="tc_mark" + shift 2 + + # Re-attach clsact to clear/flush existing role + call_tc_allow_fail qdisc del dev $device clsact 2> /dev/null + call_tc qdisc add dev $device clsact + + # Attach BPF prog + call_tc filter add dev $device ingress \ + prio 1 handle 1 bpf da obj $file sec $prog +} + +function attach_xdp_mark() +{ + local device="$1" + local file="$2" + local prog="xdp_mark" + shift 2 + + # Remove XDP prog in-case it's already loaded + # TODO: Need ip-link option to override/replace existing XDP prog + flush_xdp $device + + # Attach XDP/BPF prog + call_ip link set dev $device xdp obj $file sec $prog +} + +if [[ -n $FLUSH ]]; then + flush_tc $DEV + flush_xdp $DEV + exit 0 +fi + +if [[ -n $LIST ]]; then + list_tc $DEV + list_xdp $DEV + exit 0 +fi + +attach_tc_mark $DEV $FILE +attach_xdp_mark $DEV $FILE diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c new file mode 100644 index 000000000000..12e1024069c2 --- /dev/null +++ b/samples/bpf/xdp2skb_meta_kern.c @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc. + * + * Example howto transfer info from XDP to SKB, e.g. skb->mark + * ----------------------------------------------------------- + * This uses the XDP data_meta infrastructure, and is a cooperation + * between two bpf-programs (1) XDP and (2) clsact at TC-ingress hook. 
+ * + * Notice: This example does not use the BPF C-loader (bpf_load.c), + * but instead rely on the iproute2 TC tool for loading BPF-objects. + */ +#include +#include + +#include "bpf_helpers.h" + +/* + * This struct is stored in the XDP 'data_meta' area, which is located + * just in-front-of the raw packet payload data. The meaning is + * specific to these two BPF programs that use it as a communication + * channel. XDP adjust/increase the area via a bpf-helper, and TC use + * boundary checks to see if data have been provided. + * + * The struct must be 4 byte aligned, which here is enforced by the + * struct __attribute__((aligned(4))). + */ +struct meta_info { + __u32 mark; +} __attribute__((aligned(4))); + +SEC("xdp_mark") +int _xdp_mark(struct xdp_md *ctx) +{ + struct meta_info *meta; + void *data, *data_end; + int ret; + + /* Reserve space in-front data pointer for our meta info. + * (Notice drivers not supporting data_meta will fail here!) + */ + ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)); + if (ret < 0) + return XDP_ABORTED; + + /* For some unknown reason, these ctx pointers must be read + * after bpf_xdp_adjust_meta, else verifier will reject prog. + */ + data = (void *)(unsigned long)ctx->data; + + /* Check data_meta have room for meta_info struct */ + meta = (void *)(unsigned long)ctx->data_meta; + if (meta + 1 > data) + return XDP_ABORTED; + + meta->mark = 42; + + return XDP_PASS; +} + +SEC("tc_mark") +int _tc_mark(struct __sk_buff *ctx) +{ + void *data = (void *)(unsigned long)ctx->data; + void *data_end = (void *)(unsigned long)ctx->data_end; + void *data_meta = (void *)(unsigned long)ctx->data_meta; + struct meta_info *meta = data_meta; + + /* Check XDP gave us some data_meta */ + if (meta + 1 > data) { + ctx->mark = 41; + /* Skip "accept" if no data_meta is avail */ + return TC_ACT_OK; + } + + /* Hint: See func tc_cls_act_is_valid_access() for BPF_WRITE access */ + ctx->mark = meta->mark; /* Transfer XDP-mark to SKB-mark */ + + return TC_ACT_OK; +} + +/* Manually attaching these programs: +export DEV=ixgbe2 +export FILE=xdp2skb_meta_kern.o + +# via TC command +tc qdisc del dev $DEV clsact 2> /dev/null +tc qdisc add dev $DEV clsact +tc filter add dev $DEV ingress prio 1 handle 1 bpf da obj $FILE sec tc_mark +tc filter show dev $DEV ingress + +# XDP via IP command: +ip link set dev $DEV xdp off +ip link set dev $DEV xdp obj $FILE sec xdp_mark + +# Use iptable to "see" if SKBs are marked +iptables -I INPUT -p icmp -m mark --mark 41 # == 0x29 +iptables -I INPUT -p icmp -m mark --mark 42 # == 0x2a + +# Hint: catch XDP_ABORTED errors via +perf record -e xdp:* +perf script + +*/ -- cgit v1.2.3 From 4c38f74c9186e2bc32789ddab3a95ed384c695d7 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Tue, 16 Jan 2018 14:15:30 +0000 Subject: samples/bpf: Fix trailing semicolon The trailing semicolon is an empty statement that does no operation. Removing it since it doesn't do anything. 
Signed-off-by: Luis de Bethencourt Signed-off-by: Daniel Borkmann --- samples/bpf/xdp_monitor_kern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'samples') diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c index 2fe2f761a0d0..c969141bfa8b 100644 --- a/samples/bpf/xdp_monitor_kern.c +++ b/samples/bpf/xdp_monitor_kern.c @@ -104,7 +104,7 @@ struct xdp_exception_ctx { SEC("tracepoint/xdp/xdp_exception") int trace_xdp_exception(struct xdp_exception_ctx *ctx) { - u64 *cnt;; + u64 *cnt; u32 key; key = ctx->act; -- cgit v1.2.3 From e2e3224122e64ebe15fe02a63e8fe09b64a8c743 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 17 Jan 2018 11:17:37 +0100 Subject: samples/bpf: xdp2skb_meta comment explain why pkt-data pointers are invalidated Improve the 'unknown reason' comment, with an actual explaination of why the ctx pkt-data pointers need to be loaded after the helper function bpf_xdp_adjust_meta(). Based on the explaination Daniel gave. Fixes: 36e04a2d78d9 ("samples/bpf: xdp2skb_meta shows transferring info from XDP to SKB") Reported-by: Daniel Borkmann Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- samples/bpf/xdp2skb_meta_kern.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'samples') diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c index 12e1024069c2..0c12048ac79f 100644 --- a/samples/bpf/xdp2skb_meta_kern.c +++ b/samples/bpf/xdp2skb_meta_kern.c @@ -35,15 +35,17 @@ int _xdp_mark(struct xdp_md *ctx) void *data, *data_end; int ret; - /* Reserve space in-front data pointer for our meta info. + /* Reserve space in-front of data pointer for our meta info. * (Notice drivers not supporting data_meta will fail here!) */ ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)); if (ret < 0) return XDP_ABORTED; - /* For some unknown reason, these ctx pointers must be read - * after bpf_xdp_adjust_meta, else verifier will reject prog. + /* Notice: Kernel-side verifier requires that loading of + * ctx->data MUST happen _after_ helper bpf_xdp_adjust_meta(), + * as pkt-data pointers are invalidated. Helpers that require + * this are determined/marked by bpf_helper_changes_pkt_data() */ data = (void *)(unsigned long)ctx->data; -- cgit v1.2.3 From 417f1d9f217922d822b64e8323458d7d03a12d4f Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 19 Jan 2018 17:15:50 +0100 Subject: samples/bpf: xdp_monitor include cpumap tracepoints in monitoring The xdp_redirect_cpu sample have some "builtin" monitoring of the tracepoints for xdp_cpumap_*, but it is practical to have an external tool that can monitor these transpoint as an easy way to troubleshoot an application using XDP + cpumap. Specifically I need such external tool when working on Suricata and XDP cpumap redirect. Extend the xdp_monitor tool sample with monitoring of these xdp_cpumap_* tracepoints. Model the output format like xdp_redirect_cpu. Given I needed to handle per CPU decoding for cpumap, this patch also add per CPU info on the existing monitor events. This resembles part of the builtin monitoring output from sample xdp_rxq_info. Thus, also covering part of that sample in an external monitoring tool. Performance wise, the cpumap tracepoints uses bulking, which cause them to have very little overhead. Thus, they are enabled by default. 
Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- samples/bpf/xdp_monitor_kern.c | 94 +++++++++- samples/bpf/xdp_monitor_user.c | 416 ++++++++++++++++++++++++++++++++++------- 2 files changed, 443 insertions(+), 67 deletions(-) (limited to 'samples') diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c index c969141bfa8b..211db8ded0de 100644 --- a/samples/bpf/xdp_monitor_kern.c +++ b/samples/bpf/xdp_monitor_kern.c @@ -1,6 +1,7 @@ -/* XDP monitor tool, based on tracepoints +/* SPDX-License-Identifier: GPL-2.0 + * Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc. * - * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * XDP monitor tool, based on tracepoints */ #include #include "bpf_helpers.h" @@ -118,3 +119,92 @@ int trace_xdp_exception(struct xdp_exception_ctx *ctx) return 0; } + +/* Common stats data record shared with _user.c */ +struct datarec { + u64 processed; + u64 dropped; + u64 info; +}; +#define MAX_CPUS 64 + +struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = MAX_CPUS, +}; + +struct bpf_map_def SEC("maps") cpumap_kthread_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 1, +}; + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_enqueue_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int to_cpu; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_enqueue") +int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx) +{ + u32 to_cpu = ctx->to_cpu; + struct datarec *rec; + + if (to_cpu >= MAX_CPUS) + return 1; + + rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Record bulk events, then userspace can calc average bulk size */ + if (ctx->processed > 0) + rec->info += 1; + + return 0; +} + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_kthread_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int sched; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_kthread") +int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) +{ + struct datarec *rec; + u32 key = 0; + + rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Count times kthread yielded CPU via schedule call */ + if (ctx->sched) + rec->info++; + + return 0; +} diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index eaba165b3549..eec14520d513 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -1,4 +1,5 @@ -/* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, 
Inc. +/* SPDX-License-Identifier: GPL-2.0 + * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. */ static const char *__doc__= "XDP monitor tool, based on tracepoints\n" @@ -40,6 +41,9 @@ static const struct option long_options[] = { {0, 0, NULL, 0 } }; +/* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */ +#define EXIT_FAIL_MEM 5 + static void usage(char *argv[]) { int i; @@ -108,23 +112,93 @@ static const char *action2str(int action) return NULL; } +/* Common stats data record shared with _kern.c */ +struct datarec { + __u64 processed; + __u64 dropped; + __u64 info; +}; +#define MAX_CPUS 64 + +/* Userspace structs for collection of stats from maps */ struct record { - __u64 counter; __u64 timestamp; + struct datarec total; + struct datarec *cpu; +}; +struct u64rec { + __u64 processed; +}; +struct record_u64 { + /* record for _kern side __u64 values */ + __u64 timestamp; + struct u64rec total; + struct u64rec *cpu; }; struct stats_record { - struct record xdp_redir[REDIR_RES_MAX]; - struct record xdp_exception[XDP_ACTION_MAX]; + struct record_u64 xdp_redirect[REDIR_RES_MAX]; + struct record_u64 xdp_exception[XDP_ACTION_MAX]; + struct record xdp_cpumap_kthread; + struct record xdp_cpumap_enqueue[MAX_CPUS]; }; -static void stats_print_headers(bool err_only) +static bool map_collect_record(int fd, __u32 key, struct record *rec) { - if (err_only) - printf("\n%s\n", __doc_err_only__); + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec values[nr_cpus]; + __u64 sum_processed = 0; + __u64 sum_dropped = 0; + __u64 sum_info = 0; + int i; + + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return false; + } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); - printf("%-14s %-11s %-10s %-18s %-9s\n", - "ACTION", "result", "pps ", "pps-human-readable", "measure-period"); + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = values[i].processed; + sum_processed += values[i].processed; + rec->cpu[i].dropped = values[i].dropped; + sum_dropped += values[i].dropped; + rec->cpu[i].info = values[i].info; + sum_info += values[i].info; + } + rec->total.processed = sum_processed; + rec->total.dropped = sum_dropped; + rec->total.info = sum_info; + return true; +} + +static bool map_collect_record_u64(int fd, __u32 key, struct record_u64 *rec) +{ + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct u64rec values[nr_cpus]; + __u64 sum_total = 0; + int i; + + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return false; + } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); + + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = values[i].processed; + sum_total += values[i].processed; + } + rec->total.processed = sum_total; + return true; } static double calc_period(struct record *r, struct record *p) @@ -139,77 +213,203 @@ static double calc_period(struct record *r, struct record *p) return period_; } -static double calc_pps(struct record *r, struct record *p, double period) +static double calc_period_u64(struct record_u64 *r, struct record_u64 *p) +{ + double period_ = 0; + __u64 period = 0; + + period 
= r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double) period / NANOSEC_PER_SEC); + + return period_; +} + +static double calc_pps(struct datarec *r, struct datarec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->processed - p->processed; + pps = packets / period; + } + return pps; +} + +static double calc_pps_u64(struct u64rec *r, struct u64rec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->processed - p->processed; + pps = packets / period; + } + return pps; +} + +static double calc_drop(struct datarec *r, struct datarec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->dropped - p->dropped; + pps = packets / period; + } + return pps; +} + +static double calc_info(struct datarec *r, struct datarec *p, double period) { __u64 packets = 0; double pps = 0; if (period > 0) { - packets = r->counter - p->counter; + packets = r->info - p->info; pps = packets / period; } return pps; } -static void stats_print(struct stats_record *rec, - struct stats_record *prev, +static void stats_print(struct stats_record *stats_rec, + struct stats_record *stats_prev, bool err_only) { - double period = 0, pps = 0; - struct record *r, *p; - int i = 0; + unsigned int nr_cpus = bpf_num_possible_cpus(); + int rec_i = 0, i, to_cpu; + double t = 0, pps = 0; - char *fmt = "%-14s %-11s %-10.0f %'-18.0f %f\n"; + /* Header */ + printf("%-15s %-7s %-12s %-12s %-9s\n", + "XDP-event", "CPU:to", "pps", "drop-pps", "extra-info"); /* tracepoint: xdp:xdp_redirect_* */ if (err_only) - i = REDIR_ERROR; - - for (; i < REDIR_RES_MAX; i++) { - r = &rec->xdp_redir[i]; - p = &prev->xdp_redir[i]; - - if (p->timestamp) { - period = calc_period(r, p); - pps = calc_pps(r, p, period); + rec_i = REDIR_ERROR; + + for (; rec_i < REDIR_RES_MAX; rec_i++) { + struct record_u64 *rec, *prev; + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n"; + + rec = &stats_rec->xdp_redirect[rec_i]; + prev = &stats_prev->xdp_redirect[rec_i]; + t = calc_period_u64(rec, prev); + + for (i = 0; i < nr_cpus; i++) { + struct u64rec *r = &rec->cpu[i]; + struct u64rec *p = &prev->cpu[i]; + + pps = calc_pps_u64(r, p, t); + if (pps > 0) + printf(fmt1, "XDP_REDIRECT", i, + rec_i ? 0.0: pps, rec_i ? pps : 0.0, + err2str(rec_i)); } - printf(fmt, "XDP_REDIRECT", err2str(i), pps, pps, period); + pps = calc_pps_u64(&rec->total, &prev->total, t); + printf(fmt2, "XDP_REDIRECT", "total", + rec_i ? 0.0: pps, rec_i ? 
pps : 0.0, err2str(rec_i)); } /* tracepoint: xdp:xdp_exception */ - for (i = 0; i < XDP_ACTION_MAX; i++) { - r = &rec->xdp_exception[i]; - p = &prev->xdp_exception[i]; - if (p->timestamp) { - period = calc_period(r, p); - pps = calc_pps(r, p, period); + for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) { + struct record_u64 *rec, *prev; + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n"; + + rec = &stats_rec->xdp_exception[rec_i]; + prev = &stats_prev->xdp_exception[rec_i]; + t = calc_period_u64(rec, prev); + + for (i = 0; i < nr_cpus; i++) { + struct u64rec *r = &rec->cpu[i]; + struct u64rec *p = &prev->cpu[i]; + + pps = calc_pps_u64(r, p, t); + if (pps > 0) + printf(fmt1, "Exception", i, + 0.0, pps, err2str(rec_i)); } + pps = calc_pps_u64(&rec->total, &prev->total, t); if (pps > 0) - printf(fmt, action2str(i), "Exception", - pps, pps, period); + printf(fmt2, "Exception", "total", + 0.0, pps, action2str(rec_i)); } - printf("\n"); -} -static __u64 get_key32_value64_percpu(int fd, __u32 key) -{ - /* For percpu maps, userspace gets a value per possible CPU */ - unsigned int nr_cpus = bpf_num_possible_cpus(); - __u64 values[nr_cpus]; - __u64 sum = 0; - int i; - - if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { - fprintf(stderr, - "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); - return 0; + /* cpumap enqueue stats */ + for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) { + char *fmt1 = "%-15s %3d:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n"; + char *fmt2 = "%-15s %3s:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n"; + struct record *rec, *prev; + char *info_str = ""; + double drop, info; + + rec = &stats_rec->xdp_cpumap_enqueue[to_cpu]; + prev = &stats_prev->xdp_cpumap_enqueue[to_cpu]; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop(r, p, t); + info = calc_info(r, p, t); + if (info > 0) { + info_str = "bulk-average"; + info = pps / info; /* calc average bulk size */ + } + if (pps > 0) + printf(fmt1, "cpumap-enqueue", + i, to_cpu, pps, drop, info, info_str); + } + pps = calc_pps(&rec->total, &prev->total, t); + if (pps > 0) { + drop = calc_drop(&rec->total, &prev->total, t); + info = calc_info(&rec->total, &prev->total, t); + if (info > 0) { + info_str = "bulk-average"; + info = pps / info; /* calc average bulk size */ + } + printf(fmt2, "cpumap-enqueue", + "sum", to_cpu, pps, drop, info, info_str); + } } - /* Sum values from each CPU */ - for (i = 0; i < nr_cpus; i++) { - sum += values[i]; + /* cpumap kthread stats */ + { + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.0f %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.0f %s\n"; + struct record *rec, *prev; + double drop, info; + char *i_str = ""; + + rec = &stats_rec->xdp_cpumap_kthread; + prev = &stats_prev->xdp_cpumap_kthread; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop(r, p, t); + info = calc_info(r, p, t); + if (info > 0) + i_str = "sched"; + if (pps > 0) + printf(fmt1, "cpumap-kthread", + i, pps, drop, info, i_str); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop(&rec->total, &prev->total, t); + info = calc_info(&rec->total, &prev->total, t); + if (info > 0) + i_str = "sched-sum"; + printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str); } - return sum; + + printf("\n"); } static bool 
stats_collect(struct stats_record *rec) @@ -222,25 +422,109 @@ static bool stats_collect(struct stats_record *rec) */ fd = map_data[0].fd; /* map0: redirect_err_cnt */ - for (i = 0; i < REDIR_RES_MAX; i++) { - rec->xdp_redir[i].timestamp = gettime(); - rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i); - } + for (i = 0; i < REDIR_RES_MAX; i++) + map_collect_record_u64(fd, i, &rec->xdp_redirect[i]); fd = map_data[1].fd; /* map1: exception_cnt */ for (i = 0; i < XDP_ACTION_MAX; i++) { - rec->xdp_exception[i].timestamp = gettime(); - rec->xdp_exception[i].counter = get_key32_value64_percpu(fd, i); + map_collect_record_u64(fd, i, &rec->xdp_exception[i]); } + fd = map_data[2].fd; /* map2: cpumap_enqueue_cnt */ + for (i = 0; i < MAX_CPUS; i++) + map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]); + + fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */ + map_collect_record(fd, 0, &rec->xdp_cpumap_kthread); + return true; } +static void *alloc_rec_per_cpu(int record_size) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + void *array; + size_t size; + + size = record_size * nr_cpus; + array = malloc(size); + memset(array, 0, size); + if (!array) { + fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); + exit(EXIT_FAIL_MEM); + } + return array; +} + +static struct stats_record *alloc_stats_record(void) +{ + struct stats_record *rec; + int rec_sz; + int i; + + /* Alloc main stats_record structure */ + rec = malloc(sizeof(*rec)); + memset(rec, 0, sizeof(*rec)); + if (!rec) { + fprintf(stderr, "Mem alloc error\n"); + exit(EXIT_FAIL_MEM); + } + + /* Alloc stats stored per CPU for each record */ + rec_sz = sizeof(struct u64rec); + for (i = 0; i < REDIR_RES_MAX; i++) + rec->xdp_redirect[i].cpu = alloc_rec_per_cpu(rec_sz); + + for (i = 0; i < XDP_ACTION_MAX; i++) + rec->xdp_exception[i].cpu = alloc_rec_per_cpu(rec_sz); + + rec_sz = sizeof(struct datarec); + rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz); + + for (i = 0; i < MAX_CPUS; i++) + rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz); + + return rec; +} + +static void free_stats_record(struct stats_record *r) +{ + int i; + + for (i = 0; i < REDIR_RES_MAX; i++) + free(r->xdp_redirect[i].cpu); + + for (i = 0; i < XDP_ACTION_MAX; i++) + free(r->xdp_exception[i].cpu); + + free(r->xdp_cpumap_kthread.cpu); + + for (i = 0; i < MAX_CPUS; i++) + free(r->xdp_cpumap_enqueue[i].cpu); + + free(r); +} + +/* Pointer swap trick */ +static inline void swap(struct stats_record **a, struct stats_record **b) +{ + struct stats_record *tmp; + + tmp = *a; + *a = *b; + *b = tmp; +} + static void stats_poll(int interval, bool err_only) { - struct stats_record rec, prev; + struct stats_record *rec, *prev; - memset(&rec, 0, sizeof(rec)); + rec = alloc_stats_record(); + prev = alloc_stats_record(); + stats_collect(rec); + + if (err_only) + printf("\n%s\n", __doc_err_only__); /* Trick to pretty printf with thousands separators use %' */ setlocale(LC_NUMERIC, "en_US"); @@ -258,13 +542,15 @@ static void stats_poll(int interval, bool err_only) fflush(stdout); while (1) { - memcpy(&prev, &rec, sizeof(rec)); - stats_collect(&rec); - stats_print_headers(err_only); - stats_print(&rec, &prev, err_only); + swap(&prev, &rec); + stats_collect(rec); + stats_print(rec, prev, err_only); fflush(stdout); sleep(interval); } + + free_stats_record(rec); + free_stats_record(prev); } static void print_bpf_prog_info(void) -- cgit v1.2.3 From 6627426fa2741866f1bdd194216a91a82ec063e4 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 Jan 2018 
10:35:27 -0800 Subject: bpf: refactor sockmap sample program update for arg parsing sockmap sample program takes arguments from cmd line but it reads them in using offsets into the array. Because we want to add more arguments in the future lets do proper argument handling. Also refactor code to pull apart sock init and ping/pong test. This allows us to add new tests in the future. Signed-off-by: John Fastabend Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- samples/sockmap/sockmap_user.c | 165 ++++++++++++++++++++++++++++------------- 1 file changed, 114 insertions(+), 51 deletions(-) (limited to 'samples') diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index 7cc9d228216f..ffd1d127a9d3 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -35,6 +35,8 @@ #include #include +#include + #include "../bpf/bpf_load.h" #include "../bpf/bpf_util.h" #include "../bpf/libbpf.h" @@ -46,15 +48,39 @@ void running_handler(int a); #define S1_PORT 10000 #define S2_PORT 10001 -static int sockmap_test_sockets(int rate, int dot) +/* global sockets */ +int s1, s2, c1, c2, p1, p2; + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"cgroup", required_argument, NULL, 'c' }, + {"rate", required_argument, NULL, 'r' }, + {"verbose", no_argument, NULL, 'v' }, + {0, 0, NULL, 0 } +}; + +static void usage(char *argv[]) { - int i, sc, err, max_fd, one = 1; - int s1, s2, c1, c2, p1, p2; + int i; + + printf(" Usage: %s --cgroup \n", argv[0]); + printf(" options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-12s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value:%d)\n", + *long_options[i].flag); + else + printf(" -%c\n", long_options[i].val); + } + printf("\n"); +} + +static int sockmap_init_sockets(void) +{ + int i, err, one = 1; struct sockaddr_in addr; - struct timeval timeout; - char buf[1024] = {0}; int *fds[4] = {&s1, &s2, &c1, &c2}; - fd_set w; s1 = s2 = p1 = p2 = c1 = c2 = 0; @@ -63,8 +89,7 @@ static int sockmap_test_sockets(int rate, int dot) *fds[i] = socket(AF_INET, SOCK_STREAM, 0); if (*fds[i] < 0) { perror("socket s1 failed()"); - err = *fds[i]; - goto out; + return errno; } } @@ -74,7 +99,7 @@ static int sockmap_test_sockets(int rate, int dot) (char *)&one, sizeof(one)); if (err) { perror("setsockopt failed()"); - goto out; + return errno; } } @@ -83,7 +108,7 @@ static int sockmap_test_sockets(int rate, int dot) err = ioctl(*fds[i], FIONBIO, (char *)&one); if (err < 0) { perror("ioctl s1 failed()"); - goto out; + return errno; } } @@ -96,14 +121,14 @@ static int sockmap_test_sockets(int rate, int dot) err = bind(s1, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0) { perror("bind s1 failed()\n"); - goto out; + return errno; } addr.sin_port = htons(S2_PORT); err = bind(s2, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0) { perror("bind s2 failed()\n"); - goto out; + return errno; } /* Listen server sockets */ @@ -111,14 +136,14 @@ static int sockmap_test_sockets(int rate, int dot) err = listen(s1, 32); if (err < 0) { perror("listen s1 failed()\n"); - goto out; + return errno; } addr.sin_port = htons(S2_PORT); err = listen(s2, 32); if (err < 0) { perror("listen s1 failed()\n"); - goto out; + return errno; } /* Initiate Connect */ @@ -126,46 +151,56 @@ static int sockmap_test_sockets(int rate, int dot) err = connect(c1, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0 && errno != EINPROGRESS) { perror("connect c1 failed()\n"); - 
goto out; + return errno; } addr.sin_port = htons(S2_PORT); err = connect(c2, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0 && errno != EINPROGRESS) { perror("connect c2 failed()\n"); - goto out; + return errno; + } else if (err < 0) { + err = 0; } /* Accept Connecrtions */ p1 = accept(s1, NULL, NULL); if (p1 < 0) { perror("accept s1 failed()\n"); - goto out; + return errno; } p2 = accept(s2, NULL, NULL); if (p2 < 0) { perror("accept s1 failed()\n"); - goto out; + return errno; } - max_fd = p2; - timeout.tv_sec = 10; - timeout.tv_usec = 0; - printf("connected sockets: c1 <-> p1, c2 <-> p2\n"); printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n", c1, s1, c2, s2); + return 0; +} + +static int forever_ping_pong(int rate, int verbose) +{ + struct timeval timeout; + char buf[1024] = {0}; + int sc; + + timeout.tv_sec = 10; + timeout.tv_usec = 0; /* Ping/Pong data from client to server */ sc = send(c1, buf, sizeof(buf), 0); if (sc < 0) { perror("send failed()\n"); - goto out; + return sc; } do { - int s, rc, i; + int s, rc, i, max_fd = p2; + fd_set w; /* FD sets */ FD_ZERO(&w); @@ -193,7 +228,7 @@ static int sockmap_test_sockets(int rate, int dot) if (rc < 0) { if (errno != EWOULDBLOCK) { perror("recv failed()\n"); - break; + return rc; } } @@ -205,35 +240,61 @@ static int sockmap_test_sockets(int rate, int dot) sc = send(i, buf, rc, 0); if (sc < 0) { perror("send failed()\n"); - break; + return sc; } } - sleep(rate); - if (dot) { + + if (rate) + sleep(rate); + + if (verbose) { printf("."); fflush(stdout); } } while (running); -out: - close(s1); - close(s2); - close(p1); - close(p2); - close(c1); - close(c2); - return err; + return 0; } int main(int argc, char **argv) { - int rate = 1, dot = 1; + int rate = 1, verbose = 0; + int opt, longindex, err, cg_fd = 0; char filename[256]; - int err, cg_fd; - char *cg_path; - cg_path = argv[argc - 1]; + while ((opt = getopt_long(argc, argv, "hvc:r:", + long_options, &longindex)) != -1) { + switch (opt) { + /* Cgroup configuration */ + case 'c': + cg_fd = open(optarg, O_DIRECTORY, O_RDONLY); + if (cg_fd < 0) { + fprintf(stderr, + "ERROR: (%i) open cg path failed: %s\n", + cg_fd, optarg); + return cg_fd; + } + break; + case 'r': + rate = atoi(optarg); + break; + case 'v': + verbose = 1; + break; + case 'h': + default: + usage(argv); + return -1; + } + } + + if (!cg_fd) { + fprintf(stderr, "%s requires cgroup option: --cgroup \n", + argv[0]); + return -1; + } + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); running = 1; @@ -247,14 +308,6 @@ int main(int argc, char **argv) return 1; } - /* Cgroup configuration */ - cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY); - if (cg_fd < 0) { - fprintf(stderr, "ERROR: (%i) open cg path failed: %s\n", - cg_fd, cg_path); - return cg_fd; - } - /* Attach programs to sockmap */ err = bpf_prog_attach(prog_fd[0], map_fd[0], BPF_SK_SKB_STREAM_PARSER, 0); @@ -280,12 +333,22 @@ int main(int argc, char **argv) return err; } - err = sockmap_test_sockets(rate, dot); + err = sockmap_init_sockets(); if (err) { fprintf(stderr, "ERROR: test socket failed: %d\n", err); - return err; + goto out; } - return 0; + + err = forever_ping_pong(rate, verbose); +out: + close(s1); + close(s2); + close(p1); + close(p2); + close(c1); + close(c2); + close(cg_fd); + return err; } void running_handler(int a) -- cgit v1.2.3 From eaf8c6eec5f9ab5d9d7155d7041fb7eea7028052 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 Jan 2018 10:35:45 -0800 Subject: bpf: add sendmsg option for testing BPF programs When testing 
BPF programs using sockmap I often want to have more control over how sendmsg is exercised. This becomes even more useful as new sockmap program types are added. This adds a test type option to select type of test to run. Currently, only "ping" and "sendmsg" are supported, but more can be added as needed. The new help argument gives the following, Usage: ./sockmap --cgroup options: --help -h --cgroup -c --rate -r --verbose -v --iov_count -i --length -l --test -t Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- samples/sockmap/sockmap_user.c | 148 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 145 insertions(+), 3 deletions(-) (limited to 'samples') diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index ffd1d127a9d3..ccc717349eba 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -56,6 +56,9 @@ static const struct option long_options[] = { {"cgroup", required_argument, NULL, 'c' }, {"rate", required_argument, NULL, 'r' }, {"verbose", no_argument, NULL, 'v' }, + {"iov_count", required_argument, NULL, 'i' }, + {"length", required_argument, NULL, 'l' }, + {"test", required_argument, NULL, 't' }, {0, 0, NULL, 0 } }; @@ -182,6 +185,118 @@ static int sockmap_init_sockets(void) return 0; } +struct msg_stats { + size_t bytes_sent; + size_t bytes_recvd; +}; + +static int msg_loop(int fd, int iov_count, int iov_length, int cnt, + struct msg_stats *s, bool tx) +{ + struct msghdr msg = {0}; + struct iovec *iov; + int i, flags = 0; + + iov = calloc(iov_count, sizeof(struct iovec)); + if (!iov) + return errno; + + for (i = 0; i < iov_count; i++) { + char *d = calloc(iov_length, sizeof(char)); + + if (!d) { + fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count); + goto out_errno; + } + iov[i].iov_base = d; + iov[i].iov_len = iov_length; + } + + msg.msg_iov = iov; + msg.msg_iovlen = iov_count; + + if (tx) { + for (i = 0; i < cnt; i++) { + int sent = sendmsg(fd, &msg, flags); + + if (sent < 0) { + perror("send loop error:"); + goto out_errno; + } + s->bytes_sent += sent; + } + } else { + int slct, recv, max_fd = fd; + struct timeval timeout; + float total_bytes; + fd_set w; + + total_bytes = (float)iov_count * (float)iov_length * (float)cnt; + while (s->bytes_recvd < total_bytes) { + timeout.tv_sec = 1; + timeout.tv_usec = 0; + + /* FD sets */ + FD_ZERO(&w); + FD_SET(fd, &w); + + slct = select(max_fd + 1, &w, NULL, NULL, &timeout); + if (slct == -1) { + perror("select()"); + goto out_errno; + } else if (!slct) { + fprintf(stderr, "unexpected timeout\n"); + errno = -EIO; + goto out_errno; + } + + recv = recvmsg(fd, &msg, flags); + if (recv < 0) { + if (errno != EWOULDBLOCK) { + perror("recv failed()\n"); + goto out_errno; + } + } + + s->bytes_recvd += recv; + } + } + + for (i = 0; i < iov_count; i++) + free(iov[i].iov_base); + free(iov); + return 0; +out_errno: + for (i = 0; i < iov_count; i++) + free(iov[i].iov_base); + free(iov); + return errno; +} + +static int sendmsg_test(int iov_count, int iov_buf, int cnt, int verbose) +{ + struct msg_stats s = {0}; + int err; + + err = msg_loop(c1, iov_count, iov_buf, cnt, &s, true); + if (err) { + fprintf(stderr, + "msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n", + iov_count, iov_buf, cnt, err); + return err; + } + + err = msg_loop(p2, iov_count, iov_buf, cnt, &s, false); + if (err) + fprintf(stderr, + "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", + iov_count, iov_buf, cnt, err); + + fprintf(stdout, "sendmsg: TX_bytes %zu RX_bytes %zu\n", + s.bytes_sent, 
s.bytes_recvd); + return err; +} + static int forever_ping_pong(int rate, int verbose) { struct timeval timeout; @@ -257,13 +372,19 @@ static int forever_ping_pong(int rate, int verbose) return 0; } +enum { + PING_PONG, + SENDMSG, +}; + int main(int argc, char **argv) { - int rate = 1, verbose = 0; + int iov_count = 1, length = 1024, rate = 1, verbose = 0; int opt, longindex, err, cg_fd = 0; + int test = PING_PONG; char filename[256]; - while ((opt = getopt_long(argc, argv, "hvc:r:", + while ((opt = getopt_long(argc, argv, "hvc:r:i:l:t:", long_options, &longindex)) != -1) { switch (opt) { /* Cgroup configuration */ @@ -282,6 +403,22 @@ int main(int argc, char **argv) case 'v': verbose = 1; break; + case 'i': + iov_count = atoi(optarg); + break; + case 'l': + length = atoi(optarg); + break; + case 't': + if (strcmp(optarg, "ping") == 0) { + test = PING_PONG; + } else if (strcmp(optarg, "sendmsg") == 0) { + test = SENDMSG; + } else { + usage(argv); + return -1; + } + break; case 'h': default: usage(argv); @@ -339,7 +476,12 @@ int main(int argc, char **argv) goto out; } - err = forever_ping_pong(rate, verbose); + if (test == PING_PONG) + err = forever_ping_pong(rate, verbose); + else if (test == SENDMSG) + err = sendmsg_test(iov_count, length, rate, verbose); + else + fprintf(stderr, "unknown test\n"); out: close(s1); close(s2); -- cgit v1.2.3 From d7d6437acf9b4ddbbebe0a9029a30c23be141683 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 Jan 2018 10:36:02 -0800 Subject: bpf: sockmap sample, use fork() for send and recv Currently for SENDMSG tests first send completes then recv runs. This does not work well for large data sizes and/or many iterations. So fork the recv and send handler so that we run both send and recv. In the future we can add a parameter to do more than a single fork of tx/rx. With this we can get many GBps of data which helps exercise the sockmap code. 
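For example, an invocation along these lines exercises the forked tx/rx pair; the cgroup2 path is only illustrative and must name an existing cgroup, and in sendmsg mode --rate is reused as the number of sendmsg() iterations:

  ./sockmap --cgroup /mnt/cgroup2/sockmap_test --test sendmsg --iov_count 8 --length 8192 --rate 1000
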
Signed-off-by: John Fastabend Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- samples/sockmap/sockmap_user.c | 55 ++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 16 deletions(-) (limited to 'samples') diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index ccc717349eba..6ab3ec233bdf 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -195,7 +196,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, { struct msghdr msg = {0}; struct iovec *iov; - int i, flags = 0; + int i, flags = MSG_NOSIGNAL; iov = calloc(iov_count, sizeof(struct iovec)); if (!iov) @@ -275,25 +276,47 @@ out_errno: static int sendmsg_test(int iov_count, int iov_buf, int cnt, int verbose) { + int txpid, rxpid, err = 0; struct msg_stats s = {0}; - int err; - - err = msg_loop(c1, iov_count, iov_buf, cnt, &s, true); - if (err) { - fprintf(stderr, - "msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n", - iov_count, iov_buf, cnt, err); - return err; + int status; + + errno = 0; + + rxpid = fork(); + if (rxpid == 0) { + err = msg_loop(p2, iov_count, iov_buf, cnt, &s, false); + if (err) + fprintf(stderr, + "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", + iov_count, iov_buf, cnt, err); + fprintf(stdout, "rx_sendmsg: TX_bytes %zu RX_bytes %zu\n", + s.bytes_sent, s.bytes_recvd); + shutdown(p2, SHUT_RDWR); + shutdown(p1, SHUT_RDWR); + exit(1); + } else if (rxpid == -1) { + perror("msg_loop_rx: "); + return errno; } - err = msg_loop(p2, iov_count, iov_buf, cnt, &s, false); - if (err) - fprintf(stderr, - "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", - iov_count, iov_buf, cnt, err); + txpid = fork(); + if (txpid == 0) { + err = msg_loop(c1, iov_count, iov_buf, cnt, &s, true); + if (err) + fprintf(stderr, + "msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n", + iov_count, iov_buf, cnt, err); + fprintf(stdout, "tx_sendmsg: TX_bytes %zu RX_bytes %zu\n", + s.bytes_sent, s.bytes_recvd); + shutdown(c1, SHUT_RDWR); + exit(1); + } else if (txpid == -1) { + perror("msg_loop_tx: "); + return errno; + } - fprintf(stdout, "sendmsg: TX_bytes %zu RX_bytes %zu\n", - s.bytes_sent, s.bytes_recvd); + assert(waitpid(rxpid, &status, 0) == rxpid); + assert(waitpid(txpid, &status, 0) == txpid); return err; } -- cgit v1.2.3 From 66fdd1a3cdf9bcee9790bc6b6322f0cafaf0d3f2 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 Jan 2018 10:36:19 -0800 Subject: bpf: sockmap sample, report bytes/sec Report bytes/sec sent as well as total bytes. Useful to get rough idea how different configurations and usage patterns perform with sockmap. 
Signed-off-by: John Fastabend Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- samples/sockmap/sockmap_user.c | 47 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-) (limited to 'samples') diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index 6ab3ec233bdf..661ea7e4a350 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -189,14 +190,16 @@ static int sockmap_init_sockets(void) struct msg_stats { size_t bytes_sent; size_t bytes_recvd; + struct timespec start; + struct timespec end; }; static int msg_loop(int fd, int iov_count, int iov_length, int cnt, struct msg_stats *s, bool tx) { struct msghdr msg = {0}; + int err, i, flags = MSG_NOSIGNAL; struct iovec *iov; - int i, flags = MSG_NOSIGNAL; iov = calloc(iov_count, sizeof(struct iovec)); if (!iov) @@ -217,6 +220,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, msg.msg_iovlen = iov_count; if (tx) { + clock_gettime(CLOCK_MONOTONIC, &s->start); for (i = 0; i < cnt; i++) { int sent = sendmsg(fd, &msg, flags); @@ -226,6 +230,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, } s->bytes_sent += sent; } + clock_gettime(CLOCK_MONOTONIC, &s->end); } else { int slct, recv, max_fd = fd; struct timeval timeout; @@ -233,6 +238,9 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, fd_set w; total_bytes = (float)iov_count * (float)iov_length * (float)cnt; + err = clock_gettime(CLOCK_MONOTONIC, &s->start); + if (err < 0) + perror("recv start time: "); while (s->bytes_recvd < total_bytes) { timeout.tv_sec = 1; timeout.tv_usec = 0; @@ -244,16 +252,19 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, slct = select(max_fd + 1, &w, NULL, NULL, &timeout); if (slct == -1) { perror("select()"); + clock_gettime(CLOCK_MONOTONIC, &s->end); goto out_errno; } else if (!slct) { fprintf(stderr, "unexpected timeout\n"); errno = -EIO; + clock_gettime(CLOCK_MONOTONIC, &s->end); goto out_errno; } recv = recvmsg(fd, &msg, flags); if (recv < 0) { if (errno != EWOULDBLOCK) { + clock_gettime(CLOCK_MONOTONIC, &s->end); perror("recv failed()\n"); goto out_errno; } @@ -261,6 +272,7 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, s->bytes_recvd += recv; } + clock_gettime(CLOCK_MONOTONIC, &s->end); } for (i = 0; i < iov_count; i++) @@ -274,11 +286,24 @@ out_errno: return errno; } +static float giga = 1000000000; + +static inline float sentBps(struct msg_stats s) +{ + return s.bytes_sent / (s.end.tv_sec - s.start.tv_sec); +} + +static inline float recvdBps(struct msg_stats s) +{ + return s.bytes_recvd / (s.end.tv_sec - s.start.tv_sec); +} + static int sendmsg_test(int iov_count, int iov_buf, int cnt, int verbose) { int txpid, rxpid, err = 0; struct msg_stats s = {0}; int status; + float sent_Bps = 0, recvd_Bps = 0; errno = 0; @@ -289,10 +314,16 @@ static int sendmsg_test(int iov_count, int iov_buf, int cnt, int verbose) fprintf(stderr, "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", iov_count, iov_buf, cnt, err); - fprintf(stdout, "rx_sendmsg: TX_bytes %zu RX_bytes %zu\n", - s.bytes_sent, s.bytes_recvd); shutdown(p2, SHUT_RDWR); shutdown(p1, SHUT_RDWR); + if (s.end.tv_sec - s.start.tv_sec) { + sent_Bps = sentBps(s); + recvd_Bps = recvdBps(s); + } + fprintf(stdout, + "rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n", + s.bytes_sent, sent_Bps, sent_Bps/giga, + s.bytes_recvd, 
recvd_Bps, recvd_Bps/giga); exit(1); } else if (rxpid == -1) { perror("msg_loop_rx: "); @@ -306,9 +337,15 @@ static int sendmsg_test(int iov_count, int iov_buf, int cnt, int verbose) fprintf(stderr, "msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n", iov_count, iov_buf, cnt, err); - fprintf(stdout, "tx_sendmsg: TX_bytes %zu RX_bytes %zu\n", - s.bytes_sent, s.bytes_recvd); shutdown(c1, SHUT_RDWR); + if (s.end.tv_sec - s.start.tv_sec) { + sent_Bps = sentBps(s); + recvd_Bps = recvdBps(s); + } + fprintf(stdout, + "tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n", + s.bytes_sent, sent_Bps, sent_Bps/giga, + s.bytes_recvd, recvd_Bps, recvd_Bps/giga); exit(1); } else if (txpid == -1) { perror("msg_loop_tx: "); -- cgit v1.2.3 From ce5373be1aeac7889eb31f4bcf2b1dc2ad3c263c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 Jan 2018 10:36:36 -0800 Subject: bpf: sockmap sample add base test without any BPF for comparison Add a base test that does not use BPF hooks to test baseline case. Signed-off-by: John Fastabend Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- samples/sockmap/sockmap_user.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'samples') diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index 661ea7e4a350..f9d3785fb183 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -298,18 +298,24 @@ static inline float recvdBps(struct msg_stats s) return s.bytes_recvd / (s.end.tv_sec - s.start.tv_sec); } -static int sendmsg_test(int iov_count, int iov_buf, int cnt, int verbose) +static int sendmsg_test(int iov_count, int iov_buf, int cnt, + int verbose, bool base) { - int txpid, rxpid, err = 0; + float sent_Bps = 0, recvd_Bps = 0; + int rx_fd, txpid, rxpid, err = 0; struct msg_stats s = {0}; int status; - float sent_Bps = 0, recvd_Bps = 0; errno = 0; + if (base) + rx_fd = p1; + else + rx_fd = p2; + rxpid = fork(); if (rxpid == 0) { - err = msg_loop(p2, iov_count, iov_buf, cnt, &s, false); + err = msg_loop(rx_fd, iov_count, iov_buf, cnt, &s, false); if (err) fprintf(stderr, "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", @@ -435,6 +441,7 @@ static int forever_ping_pong(int rate, int verbose) enum { PING_PONG, SENDMSG, + BASE, }; int main(int argc, char **argv) @@ -474,6 +481,8 @@ int main(int argc, char **argv) test = PING_PONG; } else if (strcmp(optarg, "sendmsg") == 0) { test = SENDMSG; + } else if (strcmp(optarg, "base") == 0) { + test = BASE; } else { usage(argv); return -1; @@ -499,6 +508,10 @@ int main(int argc, char **argv) /* catch SIGINT */ signal(SIGINT, running_handler); + /* If base test skip BPF setup */ + if (test == BASE) + goto run; + if (load_bpf_file(filename)) { fprintf(stderr, "load_bpf_file: (%s) %s\n", filename, strerror(errno)); @@ -530,6 +543,7 @@ int main(int argc, char **argv) return err; } +run: err = sockmap_init_sockets(); if (err) { fprintf(stderr, "ERROR: test socket failed: %d\n", err); @@ -539,7 +553,9 @@ int main(int argc, char **argv) if (test == PING_PONG) err = forever_ping_pong(rate, verbose); else if (test == SENDMSG) - err = sendmsg_test(iov_count, length, rate, verbose); + err = sendmsg_test(iov_count, length, rate, verbose, false); + else if (test == BASE) + err = sendmsg_test(iov_count, length, rate, verbose, true); else fprintf(stderr, "unknown test\n"); out: -- cgit v1.2.3 From ede154776c8bf5b1032b1d619db15485b9f34387 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 Jan 2018 10:36:53 -0800 Subject: 
bpf: sockmap put client sockets in blocking mode Put client sockets in blocking mode otherwise with sendmsg tests its easy to overrun the socket buffers which results in the test being aborted. The original non-blocking was added to handle listen/accept with a single thread the client/accepted sockets do not need to be non-blocking. Signed-off-by: John Fastabend Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- samples/sockmap/sockmap_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'samples') diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index f9d3785fb183..fe943c903310 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -109,7 +109,7 @@ static int sockmap_init_sockets(void) } /* Non-blocking sockets */ - for (i = 0; i < 4; i++) { + for (i = 0; i < 2; i++) { err = ioctl(*fds[i], FIONBIO, (char *)&one); if (err < 0) { perror("ioctl s1 failed()"); -- cgit v1.2.3 From 8e0ef38052c81b08310a8e31a2e6da0a32359257 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 Jan 2018 10:37:11 -0800 Subject: bpf: sockmap set rlimit Avoid extra step of setting limit from cmdline and do it directly in the program. Signed-off-by: John Fastabend Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- samples/sockmap/sockmap_user.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'samples') diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index fe943c903310..7c25c0c112bc 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -447,6 +448,7 @@ enum { int main(int argc, char **argv) { int iov_count = 1, length = 1024, rate = 1, verbose = 0; + struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; int opt, longindex, err, cg_fd = 0; int test = PING_PONG; char filename[256]; @@ -501,6 +503,11 @@ int main(int argc, char **argv) return -1; } + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); running = 1; -- cgit v1.2.3 From c25ef6a5e62fa212d298ce24995ce239f29b5f96 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Fri, 26 Jan 2018 01:39:30 +0100 Subject: samples/bpf: Partially fixes the bpf.o build Do not build lib/bpf/bpf.o with this Makefile but use the one from the library directory. This avoid making a buggy bpf.o file (e.g. missing symbols). This patch is useful if some code (e.g. Landlock tests) needs both the bpf.o (from tools/lib/bpf) and the bpf_load.o (from samples/bpf). Signed-off-by: Mickaël Salaün Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Daniel Borkmann --- samples/bpf/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'samples') diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 7f61a3d57fa7..64335bb94f9f 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -201,13 +201,16 @@ CLANG_ARCH_ARGS = -target $(ARCH) endif # Trick to allow make to be run from this directory -all: +all: $(LIBBPF) $(MAKE) -C ../../ $(CURDIR)/ clean: $(MAKE) -C ../../ M=$(CURDIR) clean @rm -f *~ +$(LIBBPF): FORCE + $(MAKE) -C $(dir $@) $(notdir $@) + $(obj)/syscall_nrs.s: $(src)/syscall_nrs.c $(call if_changed_dep,cc_s_c) -- cgit v1.2.3
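A rough way to build and try the sockmap sample after the series above, assuming a cgroup2 hierarchy is available; all paths are illustrative, and the samples/sockmap Makefile is assumed to support the same in-directory make invocation as samples/bpf:

  # Build libbpf and the BPF samples; with the Makefile change above,
  # samples/bpf reuses bpf.o from tools/lib/bpf instead of rebuilding it.
  make -C samples/bpf
  make -C samples/sockmap

  # Illustrative cgroup2 setup for the sockmap tests.
  mkdir -p /mnt/cgroup2
  mount -t cgroup2 none /mnt/cgroup2
  mkdir -p /mnt/cgroup2/sockmap_test

  # Baseline without any BPF hooks, then the same traffic through the
  # sockmap programs; RLIMIT_MEMLOCK is now raised by the sample itself.
  cd samples/sockmap
  ./sockmap --cgroup /mnt/cgroup2/sockmap_test --test base    --iov_count 8 --length 8192 --rate 100
  ./sockmap --cgroup /mnt/cgroup2/sockmap_test --test sendmsg --iov_count 8 --length 8192 --rate 100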