diff options
Diffstat (limited to 'samples')
47 files changed, 705 insertions, 648 deletions
diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore index c7498457595a..74d31fd3c99c 100644 --- a/samples/bpf/.gitignore +++ b/samples/bpf/.gitignore @@ -1,6 +1,7 @@ cpustat fds_example hbm +ibumad lathist lwt_len_hist map_perf_test diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 4f0a1cdbfe7c..f90daadfbc89 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -26,7 +26,6 @@ hostprogs-y += map_perf_test hostprogs-y += test_overhead hostprogs-y += test_cgrp2_array_pin hostprogs-y += test_cgrp2_attach -hostprogs-y += test_cgrp2_attach2 hostprogs-y += test_cgrp2_sock hostprogs-y += test_cgrp2_sock2 hostprogs-y += xdp1 @@ -81,7 +80,6 @@ map_perf_test-objs := bpf_load.o map_perf_test_user.o test_overhead-objs := bpf_load.o test_overhead_user.o test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o test_cgrp2_attach-objs := test_cgrp2_attach.o -test_cgrp2_attach2-objs := test_cgrp2_attach2.o $(CGROUP_HELPERS) test_cgrp2_sock-objs := test_cgrp2_sock.o test_cgrp2_sock2-objs := bpf_load.o test_cgrp2_sock2.o xdp1-objs := xdp1_user.o @@ -156,6 +154,7 @@ always += tcp_iw_kern.o always += tcp_clamp_kern.o always += tcp_basertt_kern.o always += tcp_tos_reflect_kern.o +always += tcp_dumpstats_kern.o always += xdp_redirect_kern.o always += xdp_redirect_map_kern.o always += xdp_redirect_cpu_kern.o @@ -170,23 +169,15 @@ always += task_fd_query_kern.o always += xdp_sample_pkts_kern.o always += ibumad_kern.o always += hbm_out_kern.o +always += hbm_edt_kern.o KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include -KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/ +KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/bpf/ KBUILD_HOSTCFLAGS += -I$(srctree)/tools/testing/selftests/bpf/ KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/ -I$(srctree)/tools/include KBUILD_HOSTCFLAGS += -I$(srctree)/tools/perf HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable -HOSTCFLAGS_trace_helpers.o += -I$(srctree)/tools/lib/bpf/ - -HOSTCFLAGS_trace_output_user.o += -I$(srctree)/tools/lib/bpf/ -HOSTCFLAGS_offwaketime_user.o += -I$(srctree)/tools/lib/bpf/ -HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/ -HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/ -HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/ -HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/ -HOSTCFLAGS_xdp_sample_pkts_user.o += -I$(srctree)/tools/lib/bpf/ KBUILD_HOSTLDLIBS += $(LIBBPF) -lelf HOSTLDLIBS_tracex4 += -lrt @@ -208,6 +199,17 @@ HOSTCC = $(CROSS_COMPILE)gcc CLANG_ARCH_ARGS = -target $(ARCH) endif +# Don't evaluate probes and warnings if we need to run make recursively +ifneq ($(src),) +HDR_PROBE := $(shell echo "\#include <linux/types.h>\n struct list_head { int a; }; int main() { return 0; }" | \ + $(HOSTCC) $(KBUILD_HOSTCFLAGS) -x c - -o /dev/null 2>/dev/null && \ + echo okay) + +ifeq ($(HDR_PROBE),) +$(warning WARNING: Detected possible issues with include path.) +$(warning WARNING: Please install kernel headers locally (make headers_install).) +endif + BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris) BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF) BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm') @@ -225,6 +227,7 @@ ifneq ($(and $(BTF_LLC_PROBE),$(BTF_PAHOLE_PROBE),$(BTF_OBJCOPY_PROBE)),) DWARF2BTF = y endif endif +endif # Trick to allow make to be run from this directory all: @@ -271,6 +274,7 @@ $(src)/*.c: verify_target_bpf $(LIBBPF) $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h $(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h $(obj)/hbm.o: $(src)/hbm.h +$(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h # asm/sysreg.h - inline assembly used by it is incompatible with llvm. # But, there is no easy way to fix it, so just exclude it since it is diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 6e87cc831e84..4574b1939e49 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -40,7 +40,7 @@ int prog_cnt; int prog_array_fd = -1; struct bpf_map_data map_data[MAX_MAPS]; -int map_data_count = 0; +int map_data_count; static int populate_prog_array(const char *event, int prog_fd) { @@ -65,7 +65,7 @@ static int write_kprobe_events(const char *val) else flags = O_WRONLY | O_APPEND; - fd = open("/sys/kernel/debug/tracing/kprobe_events", flags); + fd = open(DEBUGFS "kprobe_events", flags); ret = write(fd, val, strlen(val)); close(fd); @@ -490,8 +490,8 @@ static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx, /* Verify no newer features were requested */ if (validate_zero) { - addr = (unsigned char*) def + map_sz_copy; - end = (unsigned char*) def + map_sz_elf; + addr = (unsigned char *) def + map_sz_copy; + end = (unsigned char *) def + map_sz_elf; for (; addr < end; addr++) { if (*addr != 0) { free(sym); diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh index 56c8b4115c95..ffe4c0607341 100755 --- a/samples/bpf/do_hbm_test.sh +++ b/samples/bpf/do_hbm_test.sh @@ -13,10 +13,10 @@ Usage() { echo "egress or ingress bandwidht. It then uses iperf3 or netperf to create" echo "loads. The output is the goodput in Mbps (unless -D was used)." echo "" - echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>] [-D]" - echo " [-d=<delay>|--delay=<delay>] [--debug] [-E]" + echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>]" + echo " [-D] [-d=<delay>|--delay=<delay>] [--debug] [-E] [--edt]" echo " [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id >]" - echo " [-l] [-N] [-p=<port>|--port=<port>] [-P]" + echo " [-l] [-N] [--no_cn] [-p=<port>|--port=<port>] [-P]" echo " [-q=<qdisc>] [-R] [-s=<server>|--server=<server]" echo " [-S|--stats] -t=<time>|--time=<time>] [-w] [cubic|dctcp]" echo " Where:" @@ -30,9 +30,11 @@ Usage() { echo " other detailed information. This information is" echo " test dependent (i.e. iperf3 or netperf)." echo " -E enable ECN (not required for dctcp)" + echo " --edt use fq's Earliest Departure Time (requires fq)" echo " -f or --flows number of concurrent flows (default=1)" echo " -i or --id cgroup id (an integer, default is 1)" echo " -N use netperf instead of iperf3" + echo " --no_cn Do not return CN notifications" echo " -l do not limit flows using loopback" echo " -h Help" echo " -p or --port iperf3 port (default is 5201)" @@ -115,6 +117,9 @@ processArgs () { -c=*|--cc=*) cc="${i#*=}" ;; + --no_cn) + flags="$flags --no_cn" + ;; --debug) flags="$flags -d" debug_flag=1 @@ -126,13 +131,12 @@ processArgs () { details=1 ;; -E) - ecn=1 + ecn=1 + ;; + --edt) + flags="$flags --edt" + qdisc="fq" ;; - # Support for upcomming fq Early Departure Time egress rate limiting - #--edt) - # prog="hbm_out_edt_kern.o" - # qdisc="fq" - # ;; -f=*|--flows=*) flows="${i#*=}" ;; @@ -224,8 +228,8 @@ if [ "$netem" -ne "0" ] ; then tc qdisc del dev lo root > /dev/null 2>&1 tc qdisc add dev lo root netem delay $netem\ms > /dev/null 2>&1 elif [ "$qdisc" != "" ] ; then - tc qdisc del dev lo root > /dev/null 2>&1 - tc qdisc add dev lo root $qdisc > /dev/null 2>&1 + tc qdisc del dev eth0 root > /dev/null 2>&1 + tc qdisc add dev eth0 root $qdisc > /dev/null 2>&1 fi n=0 @@ -395,7 +399,9 @@ fi if [ "$netem" -ne "0" ] ; then tc qdisc del dev lo root > /dev/null 2>&1 fi - +if [ "$qdisc" != "" ] ; then + tc qdisc del dev eth0 root > /dev/null 2>&1 +fi sleep 2 hbmPid=`ps ax | grep "hbm " | grep --invert-match "grep" | awk '{ print $1 }'` diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c index e51eb060244e..2d4b717726b6 100644 --- a/samples/bpf/fds_example.c +++ b/samples/bpf/fds_example.c @@ -14,7 +14,7 @@ #include <bpf/bpf.h> -#include "bpf/libbpf.h" +#include "libbpf.h" #include "bpf_insn.h" #include "sock_example.h" diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c index a79828ab273f..e0fbab9bec83 100644 --- a/samples/bpf/hbm.c +++ b/samples/bpf/hbm.c @@ -16,6 +16,7 @@ * -l Also limit flows doing loopback * -n <#> To create cgroup \"/hbm#\" and attach prog * Default is /hbm1 + * --no_cn Do not return cn notifications * -r <rate> Rate limit in Mbps * -s Get HBM stats (marked, dropped, etc.) * -t <time> Exit after specified seconds (default is 0) @@ -42,14 +43,15 @@ #include <linux/bpf.h> #include <bpf/bpf.h> +#include <getopt.h> #include "bpf_load.h" #include "bpf_rlimit.h" #include "cgroup_helpers.h" #include "hbm.h" #include "bpf_util.h" -#include "bpf/bpf.h" -#include "bpf/libbpf.h" +#include "bpf.h" +#include "libbpf.h" bool outFlag = true; int minRate = 1000; /* cgroup rate limit in Mbps */ @@ -59,6 +61,8 @@ bool stats_flag; bool loopback_flag; bool debugFlag; bool work_conserving_flag; +bool no_cn_flag; +bool edt_flag; static void Usage(void); static void read_trace_pipe2(void); @@ -185,6 +189,7 @@ static int run_bpf_prog(char *prog, int cg_id) qstats.rate = rate; qstats.stats = stats_flag ? 1 : 0; qstats.loopback = loopback_flag ? 1 : 0; + qstats.no_cn = no_cn_flag ? 1 : 0; if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) { printf("ERROR: Could not update map element\n"); goto err; @@ -312,6 +317,14 @@ static int run_bpf_prog(char *prog, int cg_id) double percent_pkts, percent_bytes; char fname[100]; FILE *fout; + int k; + static const char *returnValNames[] = { + "DROP_PKT", + "ALLOW_PKT", + "DROP_PKT_CWR", + "ALLOW_PKT_CWR" + }; +#define RET_VAL_COUNT 4 // Future support of ingress // if (!outFlag) @@ -346,6 +359,36 @@ static int run_bpf_prog(char *prog, int cg_id) (qstats.bytes_total + 1); fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts); fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes); + + // ECN CE markings + percent_pkts = (qstats.pkts_ecn_ce * 100.0) / + (qstats.pkts_total + 1); + fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts, + (int)qstats.pkts_ecn_ce); + + // Average cwnd + fprintf(fout, "avg cwnd:%d\n", + (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1))); + // Average rtt + fprintf(fout, "avg rtt:%d\n", + (int)(qstats.sum_rtt / (qstats.pkts_total + 1))); + // Average credit + if (edt_flag) + fprintf(fout, "avg credit_ms:%.03f\n", + (qstats.sum_credit / + (qstats.pkts_total + 1.0)) / 1000000.0); + else + fprintf(fout, "avg credit:%d\n", + (int)(qstats.sum_credit / + (1500 * ((int)qstats.pkts_total ) + 1))); + + // Return values stats + for (k = 0; k < RET_VAL_COUNT; k++) { + percent_pkts = (qstats.returnValCount[k] * 100.0) / + (qstats.pkts_total + 1); + fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k], + percent_pkts, (int)qstats.returnValCount[k]); + } fclose(fout); } @@ -366,14 +409,16 @@ static void Usage(void) { printf("This program loads a cgroup skb BPF program to enforce\n" "cgroup output (egress) bandwidth limits.\n\n" - "USAGE: hbm [-o] [-d] [-l] [-n <id>] [-r <rate>] [-s]\n" - " [-t <secs>] [-w] [-h] [prog]\n" + "USAGE: hbm [-o] [-d] [-l] [-n <id>] [--no_cn] [-r <rate>]\n" + " [-s] [-t <secs>] [-w] [-h] [prog]\n" " Where:\n" " -o indicates egress direction (default)\n" " -d print BPF trace debug buffer\n" + " --edt use fq's Earliest Departure Time\n" " -l also limit flows using loopback\n" " -n <#> to create cgroup \"/hbm#\" and attach prog\n" " Default is /hbm1\n" + " --no_cn disable CN notifications\n" " -r <rate> Rate in Mbps\n" " -s Update HBM stats\n" " -t <time> Exit after specified seconds (default is 0)\n" @@ -393,9 +438,21 @@ int main(int argc, char **argv) int k; int cg_id = 1; char *optstring = "iodln:r:st:wh"; + struct option loptions[] = { + {"no_cn", 0, NULL, 1}, + {"edt", 0, NULL, 2}, + {NULL, 0, NULL, 0} + }; - while ((k = getopt(argc, argv, optstring)) != -1) { + while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) { switch (k) { + case 1: + no_cn_flag = true; + break; + case 2: + prog = "hbm_edt_kern.o"; + edt_flag = true; + break; case'o': break; case 'd': diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h index 518e8147d084..f0963ed6a562 100644 --- a/samples/bpf/hbm.h +++ b/samples/bpf/hbm.h @@ -19,7 +19,8 @@ struct hbm_vqueue { struct hbm_queue_stats { unsigned long rate; /* in Mbps*/ unsigned long stats:1, /* get HBM stats (marked, dropped,..) */ - loopback:1; /* also limit flows using loopback */ + loopback:1, /* also limit flows using loopback */ + no_cn:1; /* do not use cn flags */ unsigned long long pkts_marked; unsigned long long bytes_marked; unsigned long long pkts_dropped; @@ -28,4 +29,10 @@ struct hbm_queue_stats { unsigned long long bytes_total; unsigned long long firstPacketTime; unsigned long long lastPacketTime; + unsigned long long pkts_ecn_ce; + unsigned long long returnValCount[4]; + unsigned long long sum_cwnd; + unsigned long long sum_rtt; + unsigned long long sum_cwnd_cnt; + long long sum_credit; }; diff --git a/samples/bpf/hbm_edt_kern.c b/samples/bpf/hbm_edt_kern.c new file mode 100644 index 000000000000..a65b677acdb0 --- /dev/null +++ b/samples/bpf/hbm_edt_kern.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Sample Host Bandwidth Manager (HBM) BPF program. + * + * A cgroup skb BPF egress program to limit cgroup output bandwidth. + * It uses a modified virtual token bucket queue to limit average + * egress bandwidth. The implementation uses credits instead of tokens. + * Negative credits imply that queueing would have happened (this is + * a virtual queue, so no queueing is done by it. However, queueing may + * occur at the actual qdisc (which is not used for rate limiting). + * + * This implementation uses 3 thresholds, one to start marking packets and + * the other two to drop packets: + * CREDIT + * - <--------------------------|------------------------> + + * | | | 0 + * | Large pkt | + * | drop thresh | + * Small pkt drop Mark threshold + * thresh + * + * The effect of marking depends on the type of packet: + * a) If the packet is ECN enabled and it is a TCP packet, then the packet + * is ECN marked. + * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr + * to reduce the congestion window. The current implementation uses a linear + * distribution (0% probability at marking threshold, 100% probability + * at drop threshold). + * c) If the packet is not a TCP packet, then it is dropped. + * + * If the credit is below the drop threshold, the packet is dropped. If it + * is a TCP packet, then it also calls tcp_cwr since packets dropped by + * by a cgroup skb BPF program do not automatically trigger a call to + * tcp_cwr in the current kernel code. + * + * This BPF program actually uses 2 drop thresholds, one threshold + * for larger packets (>= 120 bytes) and another for smaller packets. This + * protects smaller packets such as SYNs, ACKs, etc. + * + * The default bandwidth limit is set at 1Gbps but this can be changed by + * a user program through a shared BPF map. In addition, by default this BPF + * program does not limit connections using loopback. This behavior can be + * overwritten by the user program. There is also an option to calculate + * some statistics, such as percent of packets marked or dropped, which + * a user program, such as hbm, can access. + */ + +#include "hbm_kern.h" + +SEC("cgroup_skb/egress") +int _hbm_out_cg(struct __sk_buff *skb) +{ + long long delta = 0, delta_send; + unsigned long long curtime, sendtime; + struct hbm_queue_stats *qsp = NULL; + unsigned int queue_index = 0; + bool congestion_flag = false; + bool ecn_ce_flag = false; + struct hbm_pkt_info pkti = {}; + struct hbm_vqueue *qdp; + bool drop_flag = false; + bool cwr_flag = false; + int len = skb->len; + int rv = ALLOW_PKT; + + qsp = bpf_map_lookup_elem(&queue_stats, &queue_index); + + // Check if we should ignore loopback traffic + if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1)) + return ALLOW_PKT; + + hbm_get_pkt_info(skb, &pkti); + + // We may want to account for the length of headers in len + // calculation, like ETH header + overhead, specially if it + // is a gso packet. But I am not doing it right now. + + qdp = bpf_get_local_storage(&queue_state, 0); + if (!qdp) + return ALLOW_PKT; + if (qdp->lasttime == 0) + hbm_init_edt_vqueue(qdp, 1024); + + curtime = bpf_ktime_get_ns(); + + // Begin critical section + bpf_spin_lock(&qdp->lock); + delta = qdp->lasttime - curtime; + // bound bursts to 100us + if (delta < -BURST_SIZE_NS) { + // negative delta is a credit that allows bursts + qdp->lasttime = curtime - BURST_SIZE_NS; + delta = -BURST_SIZE_NS; + } + sendtime = qdp->lasttime; + delta_send = BYTES_TO_NS(len, qdp->rate); + __sync_add_and_fetch(&(qdp->lasttime), delta_send); + bpf_spin_unlock(&qdp->lock); + // End critical section + + // Set EDT of packet + skb->tstamp = sendtime; + + // Check if we should update rate + if (qsp != NULL && (qsp->rate * 128) != qdp->rate) + qdp->rate = qsp->rate * 128; + + // Set flags (drop, congestion, cwr) + // last packet will be sent in the future, bound latency + if (delta > DROP_THRESH_NS || (delta > LARGE_PKT_DROP_THRESH_NS && + len > LARGE_PKT_THRESH)) { + drop_flag = true; + if (pkti.is_tcp && pkti.ecn == 0) + cwr_flag = true; + } else if (delta > MARK_THRESH_NS) { + if (pkti.is_tcp) + congestion_flag = true; + else + drop_flag = true; + } + + if (congestion_flag) { + if (bpf_skb_ecn_set_ce(skb)) { + ecn_ce_flag = true; + } else { + if (pkti.is_tcp) { + unsigned int rand = bpf_get_prandom_u32(); + + if (delta >= MARK_THRESH_NS + + (rand % MARK_REGION_SIZE_NS)) { + // Do congestion control + cwr_flag = true; + } + } else if (len > LARGE_PKT_THRESH) { + // Problem if too many small packets? + drop_flag = true; + congestion_flag = false; + } + } + } + + if (pkti.is_tcp && drop_flag && pkti.packets_out <= 1) { + drop_flag = false; + cwr_flag = true; + congestion_flag = false; + } + + if (qsp != NULL && qsp->no_cn) + cwr_flag = false; + + hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag, + cwr_flag, ecn_ce_flag, &pkti, (int) delta); + + if (drop_flag) { + __sync_add_and_fetch(&(qdp->lasttime), -delta_send); + rv = DROP_PKT; + } + + if (cwr_flag) + rv |= CWR; + return rv; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h index c5635d924193..aa207a2eebbd 100644 --- a/samples/bpf/hbm_kern.h +++ b/samples/bpf/hbm_kern.h @@ -29,16 +29,10 @@ #define DROP_PKT 0 #define ALLOW_PKT 1 #define TCP_ECN_OK 1 +#define CWR 2 -#define HBM_DEBUG 0 // Set to 1 to enable debugging -#if HBM_DEBUG -#define bpf_printk(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) -#else +#ifndef HBM_DEBUG // Define HBM_DEBUG to enable debugging +#undef bpf_printk #define bpf_printk(fmt, ...) #endif @@ -52,8 +46,18 @@ #define MAX_CREDIT (100 * MAX_BYTES_PER_PACKET) #define INIT_CREDIT (INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET) +// Time base accounting for fq's EDT +#define BURST_SIZE_NS 100000 // 100us +#define MARK_THRESH_NS 50000 // 50us +#define DROP_THRESH_NS 500000 // 500us +// Reserve 20us of queuing for small packets (less than 120 bytes) +#define LARGE_PKT_DROP_THRESH_NS (DROP_THRESH_NS - 20000) +#define MARK_REGION_SIZE_NS (LARGE_PKT_DROP_THRESH_NS - MARK_THRESH_NS) + // rate in bytes per ns << 20 #define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20) +#define BYTES_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20) +#define BYTES_TO_NS(bytes, rate) div64_u64(((u64)(bytes)) << 20, (u64)(rate)) struct bpf_map_def SEC("maps") queue_state = { .type = BPF_MAP_TYPE_CGROUP_STORAGE, @@ -72,17 +76,48 @@ struct bpf_map_def SEC("maps") queue_stats = { BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats); struct hbm_pkt_info { + int cwnd; + int rtt; + int packets_out; bool is_ip; bool is_tcp; short ecn; }; -static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb, - struct hbm_pkt_info *pkti) +static int get_tcp_info(struct __sk_buff *skb, struct hbm_pkt_info *pkti) +{ + struct bpf_sock *sk; + struct bpf_tcp_sock *tp; + + sk = skb->sk; + if (sk) { + sk = bpf_sk_fullsock(sk); + if (sk) { + if (sk->protocol == IPPROTO_TCP) { + tp = bpf_tcp_sock(sk); + if (tp) { + pkti->cwnd = tp->snd_cwnd; + pkti->rtt = tp->srtt_us >> 3; + pkti->packets_out = tp->packets_out; + return 0; + } + } + } + } + pkti->cwnd = 0; + pkti->rtt = 0; + pkti->packets_out = 0; + return 1; +} + +static void hbm_get_pkt_info(struct __sk_buff *skb, + struct hbm_pkt_info *pkti) { struct iphdr iph; struct ipv6hdr *ip6h; + pkti->cwnd = 0; + pkti->rtt = 0; bpf_skb_load_bytes(skb, 0, &iph, 12); if (iph.version == 6) { ip6h = (struct ipv6hdr *)&iph; @@ -98,22 +133,42 @@ static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb, pkti->is_tcp = false; pkti->ecn = 0; } + if (pkti->is_tcp) + get_tcp_info(skb, pkti); } static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate) { - bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); - qdp->lasttime = bpf_ktime_get_ns(); - qdp->credit = INIT_CREDIT; - qdp->rate = rate * 128; + bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); + qdp->lasttime = bpf_ktime_get_ns(); + qdp->credit = INIT_CREDIT; + qdp->rate = rate * 128; +} + +static __always_inline void hbm_init_edt_vqueue(struct hbm_vqueue *qdp, + int rate) +{ + unsigned long long curtime; + + curtime = bpf_ktime_get_ns(); + bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); + qdp->lasttime = curtime - BURST_SIZE_NS; // support initial burst + qdp->credit = 0; // not used + qdp->rate = rate * 128; } static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp, int len, unsigned long long curtime, bool congestion_flag, - bool drop_flag) + bool drop_flag, + bool cwr_flag, + bool ecn_ce_flag, + struct hbm_pkt_info *pkti, + int credit) { + int rv = ALLOW_PKT; + if (qsp != NULL) { // Following is needed for work conserving __sync_add_and_fetch(&(qsp->bytes_total), len); @@ -123,7 +178,7 @@ static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp, qsp->firstPacketTime = curtime; qsp->lastPacketTime = curtime; __sync_add_and_fetch(&(qsp->pkts_total), 1); - if (congestion_flag || drop_flag) { + if (congestion_flag) { __sync_add_and_fetch(&(qsp->pkts_marked), 1); __sync_add_and_fetch(&(qsp->bytes_marked), len); } @@ -132,6 +187,34 @@ static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp, __sync_add_and_fetch(&(qsp->bytes_dropped), len); } + if (ecn_ce_flag) + __sync_add_and_fetch(&(qsp->pkts_ecn_ce), 1); + if (pkti->cwnd) { + __sync_add_and_fetch(&(qsp->sum_cwnd), + pkti->cwnd); + __sync_add_and_fetch(&(qsp->sum_cwnd_cnt), 1); + } + if (pkti->rtt) + __sync_add_and_fetch(&(qsp->sum_rtt), + pkti->rtt); + __sync_add_and_fetch(&(qsp->sum_credit), credit); + + if (drop_flag) + rv = DROP_PKT; + if (cwr_flag) + rv |= 2; + if (rv == DROP_PKT) + __sync_add_and_fetch(&(qsp->returnValCount[0]), + 1); + else if (rv == ALLOW_PKT) + __sync_add_and_fetch(&(qsp->returnValCount[1]), + 1); + else if (rv == 2) + __sync_add_and_fetch(&(qsp->returnValCount[2]), + 1); + else if (rv == 3) + __sync_add_and_fetch(&(qsp->returnValCount[3]), + 1); } } } diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c index f806863d0b79..829934bd43cb 100644 --- a/samples/bpf/hbm_out_kern.c +++ b/samples/bpf/hbm_out_kern.c @@ -62,11 +62,12 @@ int _hbm_out_cg(struct __sk_buff *skb) unsigned int queue_index = 0; unsigned long long curtime; int credit; - signed long long delta = 0, zero = 0; + signed long long delta = 0, new_credit; int max_credit = MAX_CREDIT; bool congestion_flag = false; bool drop_flag = false; bool cwr_flag = false; + bool ecn_ce_flag = false; struct hbm_vqueue *qdp; struct hbm_queue_stats *qsp = NULL; int rv = ALLOW_PKT; @@ -99,9 +100,11 @@ int _hbm_out_cg(struct __sk_buff *skb) */ if (delta > 0) { qdp->lasttime = curtime; - credit += CREDIT_PER_NS(delta, qdp->rate); - if (credit > MAX_CREDIT) + new_credit = credit + CREDIT_PER_NS(delta, qdp->rate); + if (new_credit > MAX_CREDIT) credit = MAX_CREDIT; + else + credit = new_credit; } credit -= len; qdp->credit = credit; @@ -119,13 +122,16 @@ int _hbm_out_cg(struct __sk_buff *skb) // Set flags (drop, congestion, cwr) // Dropping => we are congested, so ignore congestion flag if (credit < -DROP_THRESH || - (len > LARGE_PKT_THRESH && - credit < -LARGE_PKT_DROP_THRESH)) { - // Very congested, set drop flag + (len > LARGE_PKT_THRESH && credit < -LARGE_PKT_DROP_THRESH)) { + // Very congested, set drop packet drop_flag = true; + if (pkti.ecn) + congestion_flag = true; + else if (pkti.is_tcp) + cwr_flag = true; } else if (credit < 0) { // Congested, set congestion flag - if (pkti.ecn) { + if (pkti.ecn || pkti.is_tcp) { if (credit < -MARK_THRESH) congestion_flag = true; else @@ -136,22 +142,38 @@ int _hbm_out_cg(struct __sk_buff *skb) } if (congestion_flag) { - if (!bpf_skb_ecn_set_ce(skb)) { - if (len > LARGE_PKT_THRESH) { + if (bpf_skb_ecn_set_ce(skb)) { + ecn_ce_flag = true; + } else { + if (pkti.is_tcp) { + unsigned int rand = bpf_get_prandom_u32(); + + if (-credit >= MARK_THRESH + + (rand % MARK_REGION_SIZE)) { + // Do congestion control + cwr_flag = true; + } + } else if (len > LARGE_PKT_THRESH) { // Problem if too many small packets? drop_flag = true; } } } - if (drop_flag) - rv = DROP_PKT; + if (qsp != NULL) + if (qsp->no_cn) + cwr_flag = false; - hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag); + hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag, + cwr_flag, ecn_ce_flag, &pkti, credit); - if (rv == DROP_PKT) + if (drop_flag) { __sync_add_and_fetch(&(qdp->credit), len); + rv = DROP_PKT; + } + if (cwr_flag) + rv |= 2; return rv; } char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/ibumad_kern.c b/samples/bpf/ibumad_kern.c index 38b2b3f22049..f281df7e0089 100644 --- a/samples/bpf/ibumad_kern.c +++ b/samples/bpf/ibumad_kern.c @@ -31,15 +31,9 @@ struct bpf_map_def SEC("maps") write_count = { }; #undef DEBUG -#ifdef DEBUG -#define bpf_debug(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) -#else -#define bpf_debug(fmt, ...) +#ifndef DEBUG +#undef bpf_printk +#define bpf_printk(fmt, ...) #endif /* Taken from the current format defined in @@ -86,7 +80,7 @@ int on_ib_umad_read_recv(struct ib_umad_rw_args *ctx) u64 zero = 0, *val; u8 class = ctx->mgmt_class; - bpf_debug("ib_umad read recv : class 0x%x\n", class); + bpf_printk("ib_umad read recv : class 0x%x\n", class); val = bpf_map_lookup_elem(&read_count, &class); if (!val) { @@ -106,7 +100,7 @@ int on_ib_umad_read_send(struct ib_umad_rw_args *ctx) u64 zero = 0, *val; u8 class = ctx->mgmt_class; - bpf_debug("ib_umad read send : class 0x%x\n", class); + bpf_printk("ib_umad read send : class 0x%x\n", class); val = bpf_map_lookup_elem(&read_count, &class); if (!val) { @@ -126,7 +120,7 @@ int on_ib_umad_write(struct ib_umad_rw_args *ctx) u64 zero = 0, *val; u8 class = ctx->mgmt_class; - bpf_debug("ib_umad write : class 0x%x\n", class); + bpf_printk("ib_umad write : class 0x%x\n", class); val = bpf_map_lookup_elem(&write_count, &class); if (!val) { diff --git a/samples/bpf/ibumad_user.c b/samples/bpf/ibumad_user.c index 097d76143363..cb5a8f994849 100644 --- a/samples/bpf/ibumad_user.c +++ b/samples/bpf/ibumad_user.c @@ -25,7 +25,7 @@ #include "bpf_load.h" #include "bpf_util.h" -#include "bpf/libbpf.h" +#include "libbpf.h" static void dump_counts(int fd) { diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c index 7f90796ae15a..a219442afbee 100644 --- a/samples/bpf/sockex1_user.c +++ b/samples/bpf/sockex1_user.c @@ -3,7 +3,7 @@ #include <assert.h> #include <linux/bpf.h> #include <bpf/bpf.h> -#include "bpf/libbpf.h" +#include "libbpf.h" #include "sock_example.h" #include <unistd.h> #include <arpa/inet.h> diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c index bc257333ad92..6de383ddd08b 100644 --- a/samples/bpf/sockex2_user.c +++ b/samples/bpf/sockex2_user.c @@ -3,7 +3,7 @@ #include <assert.h> #include <linux/bpf.h> #include <bpf/bpf.h> -#include "bpf/libbpf.h" +#include "libbpf.h" #include "sock_example.h" #include <unistd.h> #include <arpa/inet.h> diff --git a/samples/bpf/tcp_basertt_kern.c b/samples/bpf/tcp_basertt_kern.c index 6ef1625e8b2c..9dba48c2b920 100644 --- a/samples/bpf/tcp_basertt_kern.c +++ b/samples/bpf/tcp_basertt_kern.c @@ -21,13 +21,6 @@ #define DEBUG 1 -#define bpf_printk(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) - SEC("sockops") int bpf_basertt(struct bpf_sock_ops *skops) { diff --git a/samples/bpf/tcp_bpf.readme b/samples/bpf/tcp_bpf.readme index fee746621aec..78e247f62108 100644 --- a/samples/bpf/tcp_bpf.readme +++ b/samples/bpf/tcp_bpf.readme @@ -25,4 +25,4 @@ attached to the cgroupv2). To remove (unattach) a socket_ops BPF program from a cgroupv2: - bpftool cgroup attach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog + bpftool cgroup detach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c index e03e204739fa..af8486f33771 100644 --- a/samples/bpf/tcp_bufs_kern.c +++ b/samples/bpf/tcp_bufs_kern.c @@ -22,13 +22,6 @@ #define DEBUG 1 -#define bpf_printk(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) - SEC("sockops") int bpf_bufs(struct bpf_sock_ops *skops) { diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c index a0dc2d254aca..26c0fd091f3c 100644 --- a/samples/bpf/tcp_clamp_kern.c +++ b/samples/bpf/tcp_clamp_kern.c @@ -22,13 +22,6 @@ #define DEBUG 1 -#define bpf_printk(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) - SEC("sockops") int bpf_clamp(struct bpf_sock_ops *skops) { diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c index 4fd3ca979a06..6d4dc4c7dd1e 100644 --- a/samples/bpf/tcp_cong_kern.c +++ b/samples/bpf/tcp_cong_kern.c @@ -21,13 +21,6 @@ #define DEBUG 1 -#define bpf_printk(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) - SEC("sockops") int bpf_cong(struct bpf_sock_ops *skops) { diff --git a/samples/bpf/tcp_dumpstats_kern.c b/samples/bpf/tcp_dumpstats_kern.c new file mode 100644 index 000000000000..8557913106a0 --- /dev/null +++ b/samples/bpf/tcp_dumpstats_kern.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Refer to samples/bpf/tcp_bpf.readme for the instructions on + * how to run this sample program. + */ +#include <linux/bpf.h> + +#include "bpf_helpers.h" +#include "bpf_endian.h" + +#define INTERVAL 1000000000ULL + +int _version SEC("version") = 1; +char _license[] SEC("license") = "GPL"; + +struct { + __u32 type; + __u32 map_flags; + int *key; + __u64 *value; +} bpf_next_dump SEC(".maps") = { + .type = BPF_MAP_TYPE_SK_STORAGE, + .map_flags = BPF_F_NO_PREALLOC, +}; + +SEC("sockops") +int _sockops(struct bpf_sock_ops *ctx) +{ + struct bpf_tcp_sock *tcp_sk; + struct bpf_sock *sk; + __u64 *next_dump; + __u64 now; + + switch (ctx->op) { + case BPF_SOCK_OPS_TCP_CONNECT_CB: + bpf_sock_ops_cb_flags_set(ctx, BPF_SOCK_OPS_RTT_CB_FLAG); + return 1; + case BPF_SOCK_OPS_RTT_CB: + break; + default: + return 1; + } + + sk = ctx->sk; + if (!sk) + return 1; + + next_dump = bpf_sk_storage_get(&bpf_next_dump, sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!next_dump) + return 1; + + now = bpf_ktime_get_ns(); + if (now < *next_dump) + return 1; + + tcp_sk = bpf_tcp_sock(sk); + if (!tcp_sk) + return 1; + + *next_dump = now + INTERVAL; + + bpf_printk("dsack_dups=%u delivered=%u\n", + tcp_sk->dsack_dups, tcp_sk->delivered); + bpf_printk("delivered_ce=%u icsk_retransmits=%u\n", + tcp_sk->delivered_ce, tcp_sk->icsk_retransmits); + + return 1; +} diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c index 9b139ec69560..da61d53378b3 100644 --- a/samples/bpf/tcp_iw_kern.c +++ b/samples/bpf/tcp_iw_kern.c @@ -22,13 +22,6 @@ #define DEBUG 1 -#define bpf_printk(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) - SEC("sockops") int bpf_iw(struct bpf_sock_ops *skops) { diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c index cc71ee96e044..d011e38b80d2 100644 --- a/samples/bpf/tcp_rwnd_kern.c +++ b/samples/bpf/tcp_rwnd_kern.c @@ -21,13 +21,6 @@ #define DEBUG 1 -#define bpf_printk(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) - SEC("sockops") int bpf_rwnd(struct bpf_sock_ops *skops) { diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c index ca87ed34f896..720d1950322d 100644 --- a/samples/bpf/tcp_synrto_kern.c +++ b/samples/bpf/tcp_synrto_kern.c @@ -21,13 +21,6 @@ #define DEBUG 1 -#define bpf_printk(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) - SEC("sockops") int bpf_synrto(struct bpf_sock_ops *skops) { diff --git a/samples/bpf/tcp_tos_reflect_kern.c b/samples/bpf/tcp_tos_reflect_kern.c index de788be6f862..369faca70a15 100644 --- a/samples/bpf/tcp_tos_reflect_kern.c +++ b/samples/bpf/tcp_tos_reflect_kern.c @@ -20,13 +20,6 @@ #define DEBUG 1 -#define bpf_printk(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) - SEC("sockops") int bpf_basertt(struct bpf_sock_ops *skops) { diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c deleted file mode 100644 index 0bb6507256b7..000000000000 --- a/samples/bpf/test_cgrp2_attach2.c +++ /dev/null @@ -1,459 +0,0 @@ -/* eBPF example program: - * - * - Creates arraymap in kernel with 4 bytes keys and 8 byte values - * - * - Loads eBPF program - * - * The eBPF program accesses the map passed in to store two pieces of - * information. The number of invocations of the program, which maps - * to the number of packets received, is stored to key 0. Key 1 is - * incremented on each iteration by the number of bytes stored in - * the skb. The program also stores the number of received bytes - * in the cgroup storage. - * - * - Attaches the new program to a cgroup using BPF_PROG_ATTACH - * - * - Every second, reads map[0] and map[1] to see how many bytes and - * packets were seen on any socket of tasks in the given cgroup. - */ - -#define _GNU_SOURCE - -#include <stdio.h> -#include <stdlib.h> -#include <assert.h> -#include <sys/resource.h> -#include <sys/time.h> -#include <unistd.h> - -#include <linux/bpf.h> -#include <bpf/bpf.h> - -#include "bpf_insn.h" -#include "bpf_rlimit.h" -#include "cgroup_helpers.h" - -#define FOO "/foo" -#define BAR "/foo/bar/" -#define PING_CMD "ping -c1 -w1 127.0.0.1 > /dev/null" - -char bpf_log_buf[BPF_LOG_BUF_SIZE]; - -static int prog_load(int verdict) -{ - int ret; - struct bpf_insn prog[] = { - BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ - BPF_EXIT_INSN(), - }; - size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); - - ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, - prog, insns_cnt, "GPL", 0, - bpf_log_buf, BPF_LOG_BUF_SIZE); - - if (ret < 0) { - log_err("Loading program"); - printf("Output from verifier:\n%s\n-------\n", bpf_log_buf); - return 0; - } - return ret; -} - -static int test_foo_bar(void) -{ - int drop_prog, allow_prog, foo = 0, bar = 0, rc = 0; - - allow_prog = prog_load(1); - if (!allow_prog) - goto err; - - drop_prog = prog_load(0); - if (!drop_prog) - goto err; - - if (setup_cgroup_environment()) - goto err; - - /* Create cgroup /foo, get fd, and join it */ - foo = create_and_get_cgroup(FOO); - if (foo < 0) - goto err; - - if (join_cgroup(FOO)) - goto err; - - if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - log_err("Attaching prog to /foo"); - goto err; - } - - printf("Attached DROP prog. This ping in cgroup /foo should fail...\n"); - assert(system(PING_CMD) != 0); - - /* Create cgroup /foo/bar, get fd, and join it */ - bar = create_and_get_cgroup(BAR); - if (bar < 0) - goto err; - - if (join_cgroup(BAR)) - goto err; - - printf("Attached DROP prog. This ping in cgroup /foo/bar should fail...\n"); - assert(system(PING_CMD) != 0); - - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - log_err("Attaching prog to /foo/bar"); - goto err; - } - - printf("Attached PASS prog. This ping in cgroup /foo/bar should pass...\n"); - assert(system(PING_CMD) == 0); - - if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) { - log_err("Detaching program from /foo/bar"); - goto err; - } - - printf("Detached PASS from /foo/bar while DROP is attached to /foo.\n" - "This ping in cgroup /foo/bar should fail...\n"); - assert(system(PING_CMD) != 0); - - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - log_err("Attaching prog to /foo/bar"); - goto err; - } - - if (bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS)) { - log_err("Detaching program from /foo"); - goto err; - } - - printf("Attached PASS from /foo/bar and detached DROP from /foo.\n" - "This ping in cgroup /foo/bar should pass...\n"); - assert(system(PING_CMD) == 0); - - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - log_err("Attaching prog to /foo/bar"); - goto err; - } - - if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) { - errno = 0; - log_err("Unexpected success attaching prog to /foo/bar"); - goto err; - } - - if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) { - log_err("Detaching program from /foo/bar"); - goto err; - } - - if (!bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS)) { - errno = 0; - log_err("Unexpected success in double detach from /foo"); - goto err; - } - - if (bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) { - log_err("Attaching non-overridable prog to /foo"); - goto err; - } - - if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0)) { - errno = 0; - log_err("Unexpected success attaching non-overridable prog to /foo/bar"); - goto err; - } - - if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - errno = 0; - log_err("Unexpected success attaching overridable prog to /foo/bar"); - goto err; - } - - if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - errno = 0; - log_err("Unexpected success attaching overridable prog to /foo"); - goto err; - } - - if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 0)) { - log_err("Attaching different non-overridable prog to /foo"); - goto err; - } - - goto out; - -err: - rc = 1; - -out: - close(foo); - close(bar); - cleanup_cgroup_environment(); - if (!rc) - printf("### override:PASS\n"); - else - printf("### override:FAIL\n"); - return rc; -} - -static int map_fd = -1; - -static int prog_load_cnt(int verdict, int val) -{ - int cgroup_storage_fd, percpu_cgroup_storage_fd; - - if (map_fd < 0) - map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); - if (map_fd < 0) { - printf("failed to create map '%s'\n", strerror(errno)); - return -1; - } - - cgroup_storage_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_STORAGE, - sizeof(struct bpf_cgroup_storage_key), 8, 0, 0); - if (cgroup_storage_fd < 0) { - printf("failed to create map '%s'\n", strerror(errno)); - return -1; - } - - percpu_cgroup_storage_fd = bpf_create_map( - BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, - sizeof(struct bpf_cgroup_storage_key), 8, 0, 0); - if (percpu_cgroup_storage_fd < 0) { - printf("failed to create map '%s'\n", strerror(errno)); - return -1; - } - - struct bpf_insn prog[] = { - BPF_MOV32_IMM(BPF_REG_0, 0), - BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ - BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ - BPF_LD_MAP_FD(BPF_REG_1, map_fd), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), - BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), - BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ - - BPF_LD_MAP_FD(BPF_REG_1, cgroup_storage_fd), - BPF_MOV64_IMM(BPF_REG_2, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage), - BPF_MOV64_IMM(BPF_REG_1, val), - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_0, BPF_REG_1, 0, 0), - - BPF_LD_MAP_FD(BPF_REG_1, percpu_cgroup_storage_fd), - BPF_MOV64_IMM(BPF_REG_2, 0), - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage), - BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x1), - BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_3, 0), - - BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ - BPF_EXIT_INSN(), - }; - size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); - int ret; - - ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, - prog, insns_cnt, "GPL", 0, - bpf_log_buf, BPF_LOG_BUF_SIZE); - - if (ret < 0) { - log_err("Loading program"); - printf("Output from verifier:\n%s\n-------\n", bpf_log_buf); - return 0; - } - close(cgroup_storage_fd); - return ret; -} - - -static int test_multiprog(void) -{ - __u32 prog_ids[4], prog_cnt = 0, attach_flags, saved_prog_id; - int cg1 = 0, cg2 = 0, cg3 = 0, cg4 = 0, cg5 = 0, key = 0; - int drop_prog, allow_prog[6] = {}, rc = 0; - unsigned long long value; - int i = 0; - - for (i = 0; i < 6; i++) { - allow_prog[i] = prog_load_cnt(1, 1 << i); - if (!allow_prog[i]) - goto err; - } - drop_prog = prog_load_cnt(0, 1); - if (!drop_prog) - goto err; - - if (setup_cgroup_environment()) - goto err; - - cg1 = create_and_get_cgroup("/cg1"); - if (cg1 < 0) - goto err; - cg2 = create_and_get_cgroup("/cg1/cg2"); - if (cg2 < 0) - goto err; - cg3 = create_and_get_cgroup("/cg1/cg2/cg3"); - if (cg3 < 0) - goto err; - cg4 = create_and_get_cgroup("/cg1/cg2/cg3/cg4"); - if (cg4 < 0) - goto err; - cg5 = create_and_get_cgroup("/cg1/cg2/cg3/cg4/cg5"); - if (cg5 < 0) - goto err; - - if (join_cgroup("/cg1/cg2/cg3/cg4/cg5")) - goto err; - - if (bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_MULTI)) { - log_err("Attaching prog to cg1"); - goto err; - } - if (!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_MULTI)) { - log_err("Unexpected success attaching the same prog to cg1"); - goto err; - } - if (bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_MULTI)) { - log_err("Attaching prog2 to cg1"); - goto err; - } - if (bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - log_err("Attaching prog to cg2"); - goto err; - } - if (bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_MULTI)) { - log_err("Attaching prog to cg3"); - goto err; - } - if (bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS, - BPF_F_ALLOW_OVERRIDE)) { - log_err("Attaching prog to cg4"); - goto err; - } - if (bpf_prog_attach(allow_prog[5], cg5, BPF_CGROUP_INET_EGRESS, 0)) { - log_err("Attaching prog to cg5"); - goto err; - } - assert(system(PING_CMD) == 0); - assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); - assert(value == 1 + 2 + 8 + 32); - - /* query the number of effective progs in cg5 */ - assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, - NULL, NULL, &prog_cnt) == 0); - assert(prog_cnt == 4); - /* retrieve prog_ids of effective progs in cg5 */ - assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, - &attach_flags, prog_ids, &prog_cnt) == 0); - assert(prog_cnt == 4); - assert(attach_flags == 0); - saved_prog_id = prog_ids[0]; - /* check enospc handling */ - prog_ids[0] = 0; - prog_cnt = 2; - assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, - &attach_flags, prog_ids, &prog_cnt) == -1 && - errno == ENOSPC); - assert(prog_cnt == 4); - /* check that prog_ids are returned even when buffer is too small */ - assert(prog_ids[0] == saved_prog_id); - /* retrieve prog_id of single attached prog in cg5 */ - prog_ids[0] = 0; - assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0, - NULL, prog_ids, &prog_cnt) == 0); - assert(prog_cnt == 1); - assert(prog_ids[0] == saved_prog_id); - - /* detach bottom program and ping again */ - if (bpf_prog_detach2(-1, cg5, BPF_CGROUP_INET_EGRESS)) { - log_err("Detaching prog from cg5"); - goto err; - } - value = 0; - assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0); - assert(system(PING_CMD) == 0); - assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); - assert(value == 1 + 2 + 8 + 16); - - /* detach 3rd from bottom program and ping again */ - errno = 0; - if (!bpf_prog_detach2(0, cg3, BPF_CGROUP_INET_EGRESS)) { - log_err("Unexpected success on detach from cg3"); - goto err; - } - if (bpf_prog_detach2(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS)) { - log_err("Detaching from cg3"); - goto err; - } - value = 0; - assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0); - assert(system(PING_CMD) == 0); - assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); - assert(value == 1 + 2 + 16); - - /* detach 2nd from bottom program and ping again */ - if (bpf_prog_detach2(-1, cg4, BPF_CGROUP_INET_EGRESS)) { - log_err("Detaching prog from cg4"); - goto err; - } - value = 0; - assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0); - assert(system(PING_CMD) == 0); - assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); - assert(value == 1 + 2 + 4); - - prog_cnt = 4; - assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, - &attach_flags, prog_ids, &prog_cnt) == 0); - assert(prog_cnt == 3); - assert(attach_flags == 0); - assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0, - NULL, prog_ids, &prog_cnt) == 0); - assert(prog_cnt == 0); - goto out; -err: - rc = 1; - -out: - for (i = 0; i < 6; i++) - if (allow_prog[i] > 0) - close(allow_prog[i]); - close(cg1); - close(cg2); - close(cg3); - close(cg4); - close(cg5); - cleanup_cgroup_environment(); - if (!rc) - printf("### multi:PASS\n"); - else - printf("### multi:FAIL\n"); - return rc; -} - -int main(int argc, char **argv) -{ - int rc = 0; - - rc = test_foo_bar(); - if (rc) - return rc; - - return test_multiprog(); -} diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c index 5b39421adb44..a8e5fa02e8a8 100644 --- a/samples/bpf/xdp1_user.c +++ b/samples/bpf/xdp1_user.c @@ -15,8 +15,8 @@ #include <net/if.h> #include "bpf_util.h" -#include "bpf/bpf.h" -#include "bpf/libbpf.h" +#include "bpf.h" +#include "libbpf.h" static int ifindex; static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c index 07e1b9269e49..a3596b617c4c 100644 --- a/samples/bpf/xdp_adjust_tail_user.c +++ b/samples/bpf/xdp_adjust_tail_user.c @@ -13,13 +13,14 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <net/if.h> #include <sys/resource.h> #include <arpa/inet.h> #include <netinet/ether.h> #include <unistd.h> #include <time.h> -#include "bpf/bpf.h" -#include "bpf/libbpf.h" +#include "bpf.h" +#include "libbpf.h" #define STATS_INTERVAL_S 2U @@ -69,7 +70,7 @@ static void usage(const char *cmd) printf("Start a XDP prog which send ICMP \"packet too big\" \n" "messages if ingress packet is bigger then MAX_SIZE bytes\n"); printf("Usage: %s [...]\n", cmd); - printf(" -i <ifindex> Interface Index\n"); + printf(" -i <ifname|ifindex> Interface\n"); printf(" -T <stop-after-X-seconds> Default: 0 (forever)\n"); printf(" -S use skb-mode\n"); printf(" -N enforce native mode\n"); @@ -102,7 +103,9 @@ int main(int argc, char **argv) switch (opt) { case 'i': - ifindex = atoi(optarg); + ifindex = if_nametoindex(optarg); + if (!ifindex) + ifindex = atoi(optarg); break; case 'T': kill_after_s = atoi(optarg); @@ -136,6 +139,11 @@ int main(int argc, char **argv) return 1; } + if (!ifindex) { + fprintf(stderr, "Invalid ifname\n"); + return 1; + } + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); prog_load_attr.file = filename; diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c index f88e1d7093d6..5b46ee12c696 100644 --- a/samples/bpf/xdp_fwd_user.c +++ b/samples/bpf/xdp_fwd_user.c @@ -24,7 +24,7 @@ #include <fcntl.h> #include <libgen.h> -#include "bpf/libbpf.h" +#include "libbpf.h" #include <bpf/bpf.h> diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 575deaca429f..0da6e9e7132e 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -26,7 +26,7 @@ static const char *__doc__ = #define MAX_PROG 6 #include <bpf/bpf.h> -#include "bpf/libbpf.h" +#include "libbpf.h" #include "bpf_util.h" diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c index be317f5f058f..f70ee33907fd 100644 --- a/samples/bpf/xdp_redirect_map_user.c +++ b/samples/bpf/xdp_redirect_map_user.c @@ -10,13 +10,14 @@ #include <stdlib.h> #include <stdbool.h> #include <string.h> +#include <net/if.h> #include <unistd.h> #include <libgen.h> #include <sys/resource.h> #include "bpf_util.h" #include <bpf/bpf.h> -#include "bpf/libbpf.h" +#include "libbpf.h" static int ifindex_in; static int ifindex_out; @@ -85,7 +86,7 @@ static void poll_stats(int interval, int ifindex) static void usage(const char *prog) { fprintf(stderr, - "usage: %s [OPTS] IFINDEX_IN IFINDEX_OUT\n\n" + "usage: %s [OPTS] <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n\n" "OPTS:\n" " -S use skb-mode\n" " -N enforce native mode\n" @@ -127,7 +128,7 @@ int main(int argc, char **argv) } if (optind == argc) { - printf("usage: %s IFINDEX_IN IFINDEX_OUT\n", argv[0]); + printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]); return 1; } @@ -136,8 +137,14 @@ int main(int argc, char **argv) return 1; } - ifindex_in = strtoul(argv[optind], NULL, 0); - ifindex_out = strtoul(argv[optind + 1], NULL, 0); + ifindex_in = if_nametoindex(argv[optind]); + if (!ifindex_in) + ifindex_in = strtoul(argv[optind], NULL, 0); + + ifindex_out = if_nametoindex(argv[optind + 1]); + if (!ifindex_out) + ifindex_out = strtoul(argv[optind + 1], NULL, 0); + printf("input: %d output: %d\n", ifindex_in, ifindex_out); snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index 09747bee6668..5440cd620607 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -10,13 +10,14 @@ #include <stdlib.h> #include <stdbool.h> #include <string.h> +#include <net/if.h> #include <unistd.h> #include <libgen.h> #include <sys/resource.h> #include "bpf_util.h" #include <bpf/bpf.h> -#include "bpf/libbpf.h" +#include "libbpf.h" static int ifindex_in; static int ifindex_out; @@ -85,7 +86,7 @@ static void poll_stats(int interval, int ifindex) static void usage(const char *prog) { fprintf(stderr, - "usage: %s [OPTS] IFINDEX_IN IFINDEX_OUT\n\n" + "usage: %s [OPTS] <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n\n" "OPTS:\n" " -S use skb-mode\n" " -N enforce native mode\n" @@ -128,7 +129,7 @@ int main(int argc, char **argv) } if (optind == argc) { - printf("usage: %s IFINDEX_IN IFINDEX_OUT\n", argv[0]); + printf("usage: %s <IFNAME|IFINDEX>_IN <IFNAME|IFINDEX>_OUT\n", argv[0]); return 1; } @@ -137,8 +138,14 @@ int main(int argc, char **argv) return 1; } - ifindex_in = strtoul(argv[optind], NULL, 0); - ifindex_out = strtoul(argv[optind + 1], NULL, 0); + ifindex_in = if_nametoindex(argv[optind]); + if (!ifindex_in) + ifindex_in = strtoul(argv[optind], NULL, 0); + + ifindex_out = if_nametoindex(argv[optind + 1]); + if (!ifindex_out) + ifindex_out = strtoul(argv[optind + 1], NULL, 0); + printf("input: %d output: %d\n", ifindex_in, ifindex_out); snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); @@ -189,7 +196,7 @@ int main(int argc, char **argv) } memset(&info, 0, sizeof(info)); - ret = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); + ret = bpf_obj_get_info_by_fd(dummy_prog_fd, &info, &info_len); if (ret) { printf("can't get prog info - %s\n", strerror(errno)); return ret; diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c index 1f66419631c3..1469b66ebad1 100644 --- a/samples/bpf/xdp_router_ipv4_user.c +++ b/samples/bpf/xdp_router_ipv4_user.c @@ -21,7 +21,7 @@ #include <sys/ioctl.h> #include <sys/syscall.h> #include "bpf_util.h" -#include "bpf/libbpf.h" +#include "libbpf.h" #include <sys/resource.h> #include <libgen.h> diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c index 1210f3b170f0..c7e4e45d824a 100644 --- a/samples/bpf/xdp_rxq_info_user.c +++ b/samples/bpf/xdp_rxq_info_user.c @@ -22,8 +22,8 @@ static const char *__doc__ = " XDP RX-queue info extract example\n\n" #include <arpa/inet.h> #include <linux/if_link.h> -#include "bpf/bpf.h" -#include "bpf/libbpf.h" +#include "bpf.h" +#include "libbpf.h" #include "bpf_util.h" static int ifindex = -1; diff --git a/samples/bpf/xdp_sample_pkts_kern.c b/samples/bpf/xdp_sample_pkts_kern.c index f7ca8b850978..6c7c7e0aaeda 100644 --- a/samples/bpf/xdp_sample_pkts_kern.c +++ b/samples/bpf/xdp_sample_pkts_kern.c @@ -7,13 +7,6 @@ #define SAMPLE_SIZE 64ul #define MAX_CPUS 128 -#define bpf_printk(fmt, ...) \ -({ \ - char ____fmt[] = fmt; \ - bpf_trace_printk(____fmt, sizeof(____fmt), \ - ##__VA_ARGS__); \ -}) - struct bpf_map_def SEC("maps") my_map = { .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, .key_size = sizeof(int), diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c index e746a00d122e..dfb68582e243 100644 --- a/samples/bpf/xdp_tx_iptunnel_user.c +++ b/samples/bpf/xdp_tx_iptunnel_user.c @@ -9,12 +9,13 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <net/if.h> #include <sys/resource.h> #include <arpa/inet.h> #include <netinet/ether.h> #include <unistd.h> #include <time.h> -#include "bpf/libbpf.h" +#include "libbpf.h" #include <bpf/bpf.h> #include "bpf_util.h" #include "xdp_tx_iptunnel_common.h" @@ -83,7 +84,7 @@ static void usage(const char *cmd) "in an IPv4/v6 header and XDP_TX it out. The dst <VIP:PORT>\n" "is used to select packets to encapsulate\n\n"); printf("Usage: %s [...]\n", cmd); - printf(" -i <ifindex> Interface Index\n"); + printf(" -i <ifname|ifindex> Interface\n"); printf(" -a <vip-service-address> IPv4 or IPv6\n"); printf(" -p <vip-service-port> A port range (e.g. 433-444) is also allowed\n"); printf(" -s <source-ip> Used in the IPTunnel header\n"); @@ -181,7 +182,9 @@ int main(int argc, char **argv) switch (opt) { case 'i': - ifindex = atoi(optarg); + ifindex = if_nametoindex(optarg); + if (!ifindex) + ifindex = atoi(optarg); break; case 'a': vip.family = parse_ipstr(optarg, vip.daddr.v6); @@ -253,6 +256,11 @@ int main(int argc, char **argv) return 1; } + if (!ifindex) { + fprintf(stderr, "Invalid ifname\n"); + return 1; + } + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); prog_load_attr.file = filename; diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index d08ee1ab7bb4..93eaaf7239b2 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -27,8 +27,8 @@ #include <time.h> #include <unistd.h> -#include "bpf/libbpf.h" -#include "bpf/xsk.h" +#include "libbpf.h" +#include "xsk.h" #include <bpf/bpf.h> #ifndef SOL_XDP @@ -68,6 +68,7 @@ static int opt_queue; static int opt_poll; static int opt_interval = 1; static u32 opt_xdp_bind_flags; +static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; static __u32 prog_id; struct xsk_umem_info { @@ -276,6 +277,12 @@ static size_t gen_eth_frame(struct xsk_umem_info *umem, u64 addr) static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) { struct xsk_umem_info *umem; + struct xsk_umem_config cfg = { + .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .frame_size = opt_xsk_frame_size, + .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM, + }; int ret; umem = calloc(1, sizeof(*umem)); @@ -283,7 +290,7 @@ static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) exit_with_error(errno); ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, - NULL); + &cfg); if (ret) exit_with_error(-ret); @@ -323,11 +330,9 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem) &idx); if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS) exit_with_error(-ret); - for (i = 0; - i < XSK_RING_PROD__DEFAULT_NUM_DESCS * - XSK_UMEM__DEFAULT_FRAME_SIZE; - i += XSK_UMEM__DEFAULT_FRAME_SIZE) - *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx++) = i; + for (i = 0; i < XSK_RING_PROD__DEFAULT_NUM_DESCS; i++) + *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx++) = + i * opt_xsk_frame_size; xsk_ring_prod__submit(&xsk->umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS); @@ -346,6 +351,7 @@ static struct option long_options[] = { {"interval", required_argument, 0, 'n'}, {"zero-copy", no_argument, 0, 'z'}, {"copy", no_argument, 0, 'c'}, + {"frame-size", required_argument, 0, 'f'}, {0, 0, 0, 0} }; @@ -365,8 +371,9 @@ static void usage(const char *prog) " -n, --interval=n Specify statistics update interval (default 1 sec).\n" " -z, --zero-copy Force zero-copy mode.\n" " -c, --copy Force copy mode.\n" + " -f, --frame-size=n Set the frame size (must be a power of two, default is %d).\n" "\n"; - fprintf(stderr, str, prog); + fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE); exit(EXIT_FAILURE); } @@ -377,7 +384,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "Frtli:q:psSNn:cz", long_options, + c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:", long_options, &option_index); if (c == -1) break; @@ -420,6 +427,9 @@ static void parse_command_line(int argc, char **argv) case 'F': opt_xdp_flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; break; + case 'f': + opt_xsk_frame_size = atoi(optarg); + break; default: usage(basename(argv[0])); } @@ -432,6 +442,11 @@ static void parse_command_line(int argc, char **argv) usage(basename(argv[0])); } + if (opt_xsk_frame_size & (opt_xsk_frame_size - 1)) { + fprintf(stderr, "--frame-size=%d is not a power of two\n", + opt_xsk_frame_size); + usage(basename(argv[0])); + } } static void kick_tx(struct xsk_socket_info *xsk) @@ -583,8 +598,7 @@ static void tx_only(struct xsk_socket_info *xsk) for (i = 0; i < BATCH_SIZE; i++) { xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr - = (frame_nb + i) << - XSK_UMEM__DEFAULT_FRAME_SHIFT; + = (frame_nb + i) * opt_xsk_frame_size; xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len = sizeof(pkt_data) - 1; } @@ -661,21 +675,19 @@ int main(int argc, char **argv) } ret = posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */ - NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE); + NUM_FRAMES * opt_xsk_frame_size); if (ret) exit_with_error(ret); /* Create sockets... */ - umem = xsk_configure_umem(bufs, - NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE); + umem = xsk_configure_umem(bufs, NUM_FRAMES * opt_xsk_frame_size); xsks[num_socks++] = xsk_configure_socket(umem); if (opt_bench == BENCH_TXONLY) { int i; - for (i = 0; i < NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE; - i += XSK_UMEM__DEFAULT_FRAME_SIZE) - (void)gen_eth_frame(umem, i); + for (i = 0; i < NUM_FRAMES; i++) + (void)gen_eth_frame(umem, i * opt_xsk_frame_size); } signal(SIGINT, int_exit); diff --git a/samples/pktgen/README.rst b/samples/pktgen/README.rst index ff8929da61c5..fd39215db508 100644 --- a/samples/pktgen/README.rst +++ b/samples/pktgen/README.rst @@ -20,6 +20,7 @@ across the sample scripts. Usage example is printed on errors:: -s : ($PKT_SIZE) packet size -d : ($DEST_IP) destination IP -m : ($DST_MAC) destination MAC-addr + -p : ($DST_PORT) destination PORT range (e.g. 433-444) is also allowed -t : ($THREADS) threads to start -f : ($F_THREAD) index of first thread (zero indexed CPU number) -c : ($SKB_CLONE) SKB clones send before alloc new SKB diff --git a/samples/pktgen/functions.sh b/samples/pktgen/functions.sh index f8bb3cd0f4ce..4af4046d71be 100644 --- a/samples/pktgen/functions.sh +++ b/samples/pktgen/functions.sh @@ -162,3 +162,37 @@ function get_node_cpus() echo $node_cpu_list } + +# Given a single or range of port(s), return minimum and maximum port number. +function parse_ports() +{ + local port_str=$1 + local port_list + local min_port + local max_port + + IFS="-" read -ra port_list <<< $port_str + + min_port=${port_list[0]} + max_port=${port_list[1]:-$min_port} + + echo $min_port $max_port +} + +# Given a minimum and maximum port, verify port number. +function validate_ports() +{ + local min_port=$1 + local max_port=$2 + + # 0 < port < 65536 + if [[ $min_port -gt 0 && $min_port -lt 65536 ]]; then + if [[ $max_port -gt 0 && $max_port -lt 65536 ]]; then + if [[ $min_port -le $max_port ]]; then + return 0 + fi + fi + fi + + err 5 "Invalid port(s): $min_port-$max_port" +} diff --git a/samples/pktgen/parameters.sh b/samples/pktgen/parameters.sh index 72fc562876e2..a06b00a0c7b6 100644 --- a/samples/pktgen/parameters.sh +++ b/samples/pktgen/parameters.sh @@ -10,6 +10,7 @@ function usage() { echo " -s : (\$PKT_SIZE) packet size" echo " -d : (\$DEST_IP) destination IP" echo " -m : (\$DST_MAC) destination MAC-addr" + echo " -p : (\$DST_PORT) destination PORT range (e.g. 433-444) is also allowed" echo " -t : (\$THREADS) threads to start" echo " -f : (\$F_THREAD) index of first thread (zero indexed CPU number)" echo " -c : (\$SKB_CLONE) SKB clones send before alloc new SKB" @@ -23,7 +24,7 @@ function usage() { ## --- Parse command line arguments / parameters --- ## echo "Commandline options:" -while getopts "s:i:d:m:f:t:c:n:b:vxh6" option; do +while getopts "s:i:d:m:p:f:t:c:n:b:vxh6" option; do case $option in i) # interface export DEV=$OPTARG @@ -41,6 +42,10 @@ while getopts "s:i:d:m:f:t:c:n:b:vxh6" option; do export DST_MAC=$OPTARG info "Destination MAC set to: DST_MAC=$DST_MAC" ;; + p) # PORT + export DST_PORT=$OPTARG + info "Destination PORT set to: DST_PORT=$DST_PORT" + ;; f) export F_THREAD=$OPTARG info "Index of first thread (zero indexed CPU number): $F_THREAD" diff --git a/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh b/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh index 2839f7d315cf..e14b1a9144d9 100755 --- a/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh +++ b/samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh @@ -41,6 +41,10 @@ fi [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" [ -z "$BURST" ] && BURST=1024 [ -z "$COUNT" ] && COUNT="10000000" # Zero means indefinitely +if [ -n "$DST_PORT" ]; then + read -r DST_MIN DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $DST_MIN $DST_MAX +fi # Base Config DELAY="0" # Zero means max speed @@ -69,6 +73,13 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do pg_set $dev "dst_mac $DST_MAC" pg_set $dev "dst$IP6 $DEST_IP" + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $DST_MIN" + pg_set $dev "udp_dst_max $DST_MAX" + fi + # Inject packet into RX path of stack pg_set $dev "xmit_mode netif_receive" diff --git a/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh b/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh index e1ee54465def..82c3e504e056 100755 --- a/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh +++ b/samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh @@ -24,6 +24,10 @@ if [[ -n "$BURST" ]]; then err 1 "Bursting not supported for this mode" fi [ -z "$COUNT" ] && COUNT="10000000" # Zero means indefinitely +if [ -n "$DST_PORT" ]; then + read -r DST_MIN DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $DST_MIN $DST_MAX +fi # Base Config DELAY="0" # Zero means max speed @@ -52,6 +56,13 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do pg_set $dev "dst_mac $DST_MAC" pg_set $dev "dst$IP6 $DEST_IP" + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $DST_MIN" + pg_set $dev "udp_dst_max $DST_MAX" + fi + # Inject packet into TX qdisc egress path of stack pg_set $dev "xmit_mode queue_xmit" done diff --git a/samples/pktgen/pktgen_sample01_simple.sh b/samples/pktgen/pktgen_sample01_simple.sh index e9ab4edba2d7..d1702fdde8f3 100755 --- a/samples/pktgen/pktgen_sample01_simple.sh +++ b/samples/pktgen/pktgen_sample01_simple.sh @@ -22,6 +22,10 @@ fi # Example enforce param "-m" for dst_mac [ -z "$DST_MAC" ] && usage && err 2 "Must specify -m dst_mac" [ -z "$COUNT" ] && COUNT="100000" # Zero means indefinitely +if [ -n "$DST_PORT" ]; then + read -r DST_MIN DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $DST_MIN $DST_MAX +fi # Base Config DELAY="0" # Zero means max speed @@ -59,6 +63,13 @@ pg_set $DEV "flag NO_TIMESTAMP" pg_set $DEV "dst_mac $DST_MAC" pg_set $DEV "dst$IP6 $DEST_IP" +if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $DEV "flag UDPDST_RND" + pg_set $DEV "udp_dst_min $DST_MIN" + pg_set $DEV "udp_dst_max $DST_MAX" +fi + # Setup random UDP port src range pg_set $DEV "flag UDPSRC_RND" pg_set $DEV "udp_src_min $UDP_MIN" diff --git a/samples/pktgen/pktgen_sample02_multiqueue.sh b/samples/pktgen/pktgen_sample02_multiqueue.sh index 99f740ae9857..7f7a9a27548f 100755 --- a/samples/pktgen/pktgen_sample02_multiqueue.sh +++ b/samples/pktgen/pktgen_sample02_multiqueue.sh @@ -29,6 +29,10 @@ if [ -z "$DEST_IP" ]; then [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" fi [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" +if [ -n "$DST_PORT" ]; then + read -r DST_MIN DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $DST_MIN $DST_MAX +fi # General cleanup everything since last run pg_ctrl "reset" @@ -60,6 +64,13 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do pg_set $dev "dst_mac $DST_MAC" pg_set $dev "dst$IP6 $DEST_IP" + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $DST_MIN" + pg_set $dev "udp_dst_max $DST_MAX" + fi + # Setup random UDP port src range pg_set $dev "flag UDPSRC_RND" pg_set $dev "udp_src_min $UDP_MIN" diff --git a/samples/pktgen/pktgen_sample03_burst_single_flow.sh b/samples/pktgen/pktgen_sample03_burst_single_flow.sh index 8fdd36722d9e..b520637817ce 100755 --- a/samples/pktgen/pktgen_sample03_burst_single_flow.sh +++ b/samples/pktgen/pktgen_sample03_burst_single_flow.sh @@ -33,6 +33,10 @@ fi [ -z "$BURST" ] && BURST=32 [ -z "$CLONE_SKB" ] && CLONE_SKB="0" # No need for clones when bursting [ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely +if [ -n "$DST_PORT" ]; then + read -r DST_MIN DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $DST_MIN $DST_MAX +fi # Base Config DELAY="0" # Zero means max speed @@ -60,6 +64,13 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do pg_set $dev "dst_mac $DST_MAC" pg_set $dev "dst$IP6 $DEST_IP" + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $DST_MIN" + pg_set $dev "udp_dst_max $DST_MAX" + fi + # Setup burst, for easy testing -b 0 disable bursting # (internally in pktgen default and minimum burst=1) if [[ ${BURST} -ne 0 ]]; then diff --git a/samples/pktgen/pktgen_sample04_many_flows.sh b/samples/pktgen/pktgen_sample04_many_flows.sh index 4df92b7176da..5b6e9d9cb5b5 100755 --- a/samples/pktgen/pktgen_sample04_many_flows.sh +++ b/samples/pktgen/pktgen_sample04_many_flows.sh @@ -17,6 +17,10 @@ source ${basedir}/parameters.sh [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" [ -z "$CLONE_SKB" ] && CLONE_SKB="0" [ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely +if [ -n "$DST_PORT" ]; then + read -r DST_MIN DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $DST_MIN $DST_MAX +fi # NOTICE: Script specific settings # ======= @@ -56,6 +60,13 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do pg_set $dev "dst_mac $DST_MAC" pg_set $dev "dst $DEST_IP" + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $DST_MIN" + pg_set $dev "udp_dst_max $DST_MAX" + fi + # Randomize source IP-addresses pg_set $dev "flag IPSRC_RND" pg_set $dev "src_min 198.18.0.0" diff --git a/samples/pktgen/pktgen_sample05_flow_per_thread.sh b/samples/pktgen/pktgen_sample05_flow_per_thread.sh index 7f8b5e59f01e..0c06e63fbe97 100755 --- a/samples/pktgen/pktgen_sample05_flow_per_thread.sh +++ b/samples/pktgen/pktgen_sample05_flow_per_thread.sh @@ -22,7 +22,10 @@ source ${basedir}/parameters.sh [ -z "$CLONE_SKB" ] && CLONE_SKB="0" [ -z "$BURST" ] && BURST=32 [ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely - +if [ -n "$DST_PORT" ]; then + read -r DST_MIN DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $DST_MIN $DST_MAX +fi # Base Config DELAY="0" # Zero means max speed @@ -50,6 +53,13 @@ for ((thread = $F_THREAD; thread <= $L_THREAD; thread++)); do pg_set $dev "dst_mac $DST_MAC" pg_set $dev "dst $DEST_IP" + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $DST_MIN" + pg_set $dev "udp_dst_max $DST_MAX" + fi + # Setup source IP-addresses based on thread number pg_set $dev "src_min 198.18.$((thread+1)).1" pg_set $dev "src_max 198.18.$((thread+1)).1" diff --git a/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh index 353adc17205e..97f0266c0356 100755 --- a/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh +++ b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh @@ -35,6 +35,10 @@ if [ -z "$DEST_IP" ]; then [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" fi [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" +if [ -n "$DST_PORT" ]; then + read -r DST_MIN DST_MAX <<< $(parse_ports $DST_PORT) + validate_ports $DST_MIN $DST_MAX +fi # General cleanup everything since last run pg_ctrl "reset" @@ -77,6 +81,13 @@ for ((i = 0; i < $THREADS; i++)); do pg_set $dev "dst_mac $DST_MAC" pg_set $dev "dst$IP6 $DEST_IP" + if [ -n "$DST_PORT" ]; then + # Single destination port or random port range + pg_set $dev "flag UDPDST_RND" + pg_set $dev "udp_dst_min $DST_MIN" + pg_set $dev "udp_dst_max $DST_MAX" + fi + # Setup random UDP port src range pg_set $dev "flag UDPSRC_RND" pg_set $dev "udp_src_min $UDP_MIN" |