From 68e8b849b221b37a78a110a0307717d45e3593a0 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 2 May 2018 13:01:22 +0200 Subject: net: initial AF_XDP skeleton Buildable skeleton of AF_XDP without any functionality. Just what it takes to register a new address family. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- net/core/sock.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index b2c3db169ca1..e7d8b6c955c6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -226,7 +226,8 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ - x "AF_QIPCRTR", x "AF_SMC" , x "AF_MAX" + x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ + x "AF_MAX" static const char *const af_family_key_strings[AF_MAX+1] = { _sock_locks("sk_lock-") @@ -262,7 +263,8 @@ static const char *const af_family_rlock_key_strings[AF_MAX+1] = { "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" , "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" , "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" , - "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_MAX" + "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP" , + "rlock-AF_MAX" }; static const char *const af_family_wlock_key_strings[AF_MAX+1] = { "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" , @@ -279,7 +281,8 @@ static const char *const af_family_wlock_key_strings[AF_MAX+1] = { "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" , "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" , "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" , - "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_MAX" + "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_XDP" , + "wlock-AF_MAX" }; static const char *const af_family_elock_key_strings[AF_MAX+1] = { "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" , @@ -296,7 +299,8 @@ static const char *const af_family_elock_key_strings[AF_MAX+1] = { "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" , "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" , "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" , - "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_MAX" + "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_XDP" , + "elock-AF_MAX" }; /* -- cgit v1.2.3 From c497176cb2e478f0a5713b0e05f242276e3194b5 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 2 May 2018 13:01:27 +0200 Subject: xsk: add Rx receive functions and poll support Here the actual receive functions of AF_XDP are implemented, that in a later commit, will be called from the XDP layers. There's one set of functions for the XDP_DRV side and another for XDP_SKB (generic). A new XDP API, xdp_return_buff, is also introduced. Adding xdp_return_buff, which is analogous to xdp_return_frame, but acts upon an struct xdp_buff. The API will be used by AF_XDP in future commits. Support for the poll syscall is also implemented. v2: xskq_validate_id did not update cons_tail. The entries variable was calculated twice in xskq_nb_avail. Squashed xdp_return_buff commit. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 1 + include/net/xdp_sock.h | 22 ++++++++++ net/core/xdp.c | 15 +++++-- net/xdp/xdp_umem.h | 18 ++++++++ net/xdp/xsk.c | 73 ++++++++++++++++++++++++++++++- net/xdp/xsk_queue.h | 114 ++++++++++++++++++++++++++++++++++++++++++++++++- 6 files changed, 238 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/include/net/xdp.h b/include/net/xdp.h index 137ad5f9f40f..0b689cf561c7 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -104,6 +104,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) } void xdp_return_frame(struct xdp_frame *xdpf); +void xdp_return_buff(struct xdp_buff *xdp); int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index); diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 85d02512f59b..a0342dff6a4d 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -31,6 +31,28 @@ struct xdp_sock { u16 queue_id; /* Protects multiple processes in the control path */ struct mutex mutex; + u64 rx_dropped; }; +struct xdp_buff; +#ifdef CONFIG_XDP_SOCKETS +int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); +int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); +void xsk_flush(struct xdp_sock *xs); +#else +static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + return -ENOTSUPP; +} + +static inline int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + return -ENOTSUPP; +} + +static inline void xsk_flush(struct xdp_sock *xs) +{ +} +#endif /* CONFIG_XDP_SOCKETS */ + #endif /* _LINUX_XDP_SOCK_H */ diff --git a/net/core/xdp.c b/net/core/xdp.c index 0c86b53a3a63..bf6758f74339 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -308,11 +308,9 @@ err: } EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); -void xdp_return_frame(struct xdp_frame *xdpf) +static void xdp_return(void *data, struct xdp_mem_info *mem) { - struct xdp_mem_info *mem = &xdpf->mem; struct xdp_mem_allocator *xa; - void *data = xdpf->data; struct page *page; switch (mem->type) { @@ -339,4 +337,15 @@ void xdp_return_frame(struct xdp_frame *xdpf) break; } } + +void xdp_return_frame(struct xdp_frame *xdpf) +{ + xdp_return(xdpf->data, &xdpf->mem); +} EXPORT_SYMBOL_GPL(xdp_return_frame); + +void xdp_return_buff(struct xdp_buff *xdp) +{ + xdp_return(xdp->data, &xdp->rxq->mem); +} +EXPORT_SYMBOL_GPL(xdp_return_buff); diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index b13133e9c501..c7378a11721f 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -39,6 +39,24 @@ struct xdp_umem { struct work_struct work; }; +static inline char *xdp_umem_get_data(struct xdp_umem *umem, u32 idx) +{ + u64 pg, off; + char *data; + + pg = idx >> umem->nfpplog2; + off = (idx & umem->nfpp_mask) << umem->frame_size_log2; + + data = page_address(umem->pgs[pg]); + return data + off; +} + +static inline char *xdp_umem_get_data_with_headroom(struct xdp_umem *umem, + u32 idx) +{ + return xdp_umem_get_data(umem, idx) + umem->frame_headroom; +} + bool xdp_umem_validate_queues(struct xdp_umem *umem); int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr); void xdp_get_umem(struct xdp_umem *umem); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index bf2c97b87992..4e1e6c581e1d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -41,6 +41,74 @@ static struct xdp_sock *xdp_sk(struct sock *sk) return (struct xdp_sock *)sk; } +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + u32 *id, len = xdp->data_end - xdp->data; + void *buffer; + int err = 0; + + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) + return -EINVAL; + + id = xskq_peek_id(xs->umem->fq); + if (!id) + return -ENOSPC; + + buffer = xdp_umem_get_data_with_headroom(xs->umem, *id); + memcpy(buffer, xdp->data, len); + err = xskq_produce_batch_desc(xs->rx, *id, len, + xs->umem->frame_headroom); + if (!err) + xskq_discard_id(xs->umem->fq); + + return err; +} + +int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + int err; + + err = __xsk_rcv(xs, xdp); + if (likely(!err)) + xdp_return_buff(xdp); + else + xs->rx_dropped++; + + return err; +} + +void xsk_flush(struct xdp_sock *xs) +{ + xskq_produce_flush_desc(xs->rx); + xs->sk.sk_data_ready(&xs->sk); +} + +int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + int err; + + err = __xsk_rcv(xs, xdp); + if (!err) + xsk_flush(xs); + else + xs->rx_dropped++; + + return err; +} + +static unsigned int xsk_poll(struct file *file, struct socket *sock, + struct poll_table_struct *wait) +{ + unsigned int mask = datagram_poll(file, sock, wait); + struct sock *sk = sock->sk; + struct xdp_sock *xs = xdp_sk(sk); + + if (xs->rx && !xskq_empty_desc(xs->rx)) + mask |= POLLIN | POLLRDNORM; + + return mask; +} + static int xsk_init_queue(u32 entries, struct xsk_queue **queue, bool umem_queue) { @@ -179,6 +247,9 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) { err = -EINVAL; goto out_unlock; + } else { + /* This xsk has its own umem. */ + xskq_set_umem(xs->umem->fq, &xs->umem->props); } /* Rebind? */ @@ -330,7 +401,7 @@ static const struct proto_ops xsk_proto_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, - .poll = sock_no_poll, + .poll = xsk_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 9ddd2ee07a84..0a9b92b4f93a 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -20,6 +20,8 @@ #include "xdp_umem_props.h" +#define RX_BATCH_SIZE 16 + struct xsk_queue { struct xdp_umem_props umem_props; u32 ring_mask; @@ -32,8 +34,118 @@ struct xsk_queue { u64 invalid_descs; }; +/* Common functions operating for both RXTX and umem queues */ + +static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) +{ + u32 entries = q->prod_tail - q->cons_tail; + + if (entries == 0) { + /* Refresh the local pointer */ + q->prod_tail = READ_ONCE(q->ring->producer); + entries = q->prod_tail - q->cons_tail; + } + + return (entries > dcnt) ? dcnt : entries; +} + +static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) +{ + u32 free_entries = q->nentries - (producer - q->cons_tail); + + if (free_entries >= dcnt) + return free_entries; + + /* Refresh the local tail pointer */ + q->cons_tail = READ_ONCE(q->ring->consumer); + return q->nentries - (producer - q->cons_tail); +} + +/* UMEM queue */ + +static inline bool xskq_is_valid_id(struct xsk_queue *q, u32 idx) +{ + if (unlikely(idx >= q->umem_props.nframes)) { + q->invalid_descs++; + return false; + } + return true; +} + +static inline u32 *xskq_validate_id(struct xsk_queue *q) +{ + while (q->cons_tail != q->cons_head) { + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + unsigned int idx = q->cons_tail & q->ring_mask; + + if (xskq_is_valid_id(q, ring->desc[idx])) + return &ring->desc[idx]; + + q->cons_tail++; + } + + return NULL; +} + +static inline u32 *xskq_peek_id(struct xsk_queue *q) +{ + struct xdp_umem_ring *ring; + + if (q->cons_tail == q->cons_head) { + WRITE_ONCE(q->ring->consumer, q->cons_tail); + q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE); + + /* Order consumer and data */ + smp_rmb(); + + return xskq_validate_id(q); + } + + ring = (struct xdp_umem_ring *)q->ring; + return &ring->desc[q->cons_tail & q->ring_mask]; +} + +static inline void xskq_discard_id(struct xsk_queue *q) +{ + q->cons_tail++; + (void)xskq_validate_id(q); +} + +/* Rx queue */ + +static inline int xskq_produce_batch_desc(struct xsk_queue *q, + u32 id, u32 len, u16 offset) +{ + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + unsigned int idx; + + if (xskq_nb_free(q, q->prod_head, 1) == 0) + return -ENOSPC; + + idx = (q->prod_head++) & q->ring_mask; + ring->desc[idx].idx = id; + ring->desc[idx].len = len; + ring->desc[idx].offset = offset; + + return 0; +} + +static inline void xskq_produce_flush_desc(struct xsk_queue *q) +{ + /* Order producer and data */ + smp_wmb(); + + q->prod_tail = q->prod_head, + WRITE_ONCE(q->ring->producer, q->prod_tail); +} + +static inline bool xskq_empty_desc(struct xsk_queue *q) +{ + return (xskq_nb_free(q, q->prod_tail, 1) == q->nentries); +} + void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); -void xskq_destroy(struct xsk_queue *q); +void xskq_destroy(struct xsk_queue *q_ops); #endif /* _LINUX_XSK_QUEUE_H */ -- cgit v1.2.3 From 1b1a251c83d29ec18b4a2191cce5c5b7117529ff Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 2 May 2018 13:01:29 +0200 Subject: xsk: wire up XDP_DRV side of AF_XDP This commit wires up the xskmap to XDP_DRV layer. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index d3781daa26ab..40d4bbb4508d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2801,7 +2801,8 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, { int err; - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { + switch (map->map_type) { + case BPF_MAP_TYPE_DEVMAP: { struct net_device *dev = fwd; struct xdp_frame *xdpf; @@ -2819,14 +2820,25 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, if (err) return err; __dev_map_insert_ctx(map, index); - - } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + break; + } + case BPF_MAP_TYPE_CPUMAP: { struct bpf_cpu_map_entry *rcpu = fwd; err = cpu_map_enqueue(rcpu, xdp, dev_rx); if (err) return err; __cpu_map_insert_ctx(map, index); + break; + } + case BPF_MAP_TYPE_XSKMAP: { + struct xdp_sock *xs = fwd; + + err = __xsk_map_redirect(map, xdp, xs); + return err; + } + default: + break; } return 0; } @@ -2845,6 +2857,9 @@ void xdp_do_flush_map(void) case BPF_MAP_TYPE_CPUMAP: __cpu_map_flush(map); break; + case BPF_MAP_TYPE_XSKMAP: + __xsk_map_flush(map); + break; default: break; } @@ -2859,6 +2874,8 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) return __dev_map_lookup_elem(map, index); case BPF_MAP_TYPE_CPUMAP: return __cpu_map_lookup_elem(map, index); + case BPF_MAP_TYPE_XSKMAP: + return __xsk_map_lookup_elem(map, index); default: return NULL; } -- cgit v1.2.3 From 02671e23e7b383763fe1ae4f20b56d8029f9dfc6 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 2 May 2018 13:01:30 +0200 Subject: xsk: wire up XDP_SKB side of AF_XDP This commit wires up the xskmap to XDP_SKB layer. Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 +- net/core/dev.c | 35 +++++++++++++++++++---------------- net/core/filter.c | 17 ++++++++++++++--- 3 files changed, 34 insertions(+), 20 deletions(-) (limited to 'net/core') diff --git a/include/linux/filter.h b/include/linux/filter.h index 64899c04c1a6..b7f81e3a70cb 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -760,7 +760,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * This does not appear to be a real limitation for existing software. */ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *prog); + struct xdp_buff *xdp, struct bpf_prog *prog); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); diff --git a/net/core/dev.c b/net/core/dev.c index 8f8931b93140..aea36b5a2fed 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3994,12 +3994,12 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) } static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct netdev_rx_queue *rxqueue; void *orig_data, *orig_data_end; u32 metalen, act = XDP_DROP; - struct xdp_buff xdp; int hlen, off; u32 mac_len; @@ -4034,19 +4034,19 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, */ mac_len = skb->data - skb_mac_header(skb); hlen = skb_headlen(skb) + mac_len; - xdp.data = skb->data - mac_len; - xdp.data_meta = xdp.data; - xdp.data_end = xdp.data + hlen; - xdp.data_hard_start = skb->data - skb_headroom(skb); - orig_data_end = xdp.data_end; - orig_data = xdp.data; + xdp->data = skb->data - mac_len; + xdp->data_meta = xdp->data; + xdp->data_end = xdp->data + hlen; + xdp->data_hard_start = skb->data - skb_headroom(skb); + orig_data_end = xdp->data_end; + orig_data = xdp->data; rxqueue = netif_get_rxqueue(skb); - xdp.rxq = &rxqueue->xdp_rxq; + xdp->rxq = &rxqueue->xdp_rxq; - act = bpf_prog_run_xdp(xdp_prog, &xdp); + act = bpf_prog_run_xdp(xdp_prog, xdp); - off = xdp.data - orig_data; + off = xdp->data - orig_data; if (off > 0) __skb_pull(skb, off); else if (off < 0) @@ -4056,10 +4056,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* check if bpf_xdp_adjust_tail was used. it can only "shrink" * pckt. */ - off = orig_data_end - xdp.data_end; + off = orig_data_end - xdp->data_end; if (off != 0) { - skb_set_tail_pointer(skb, xdp.data_end - xdp.data); + skb_set_tail_pointer(skb, xdp->data_end - xdp->data); skb->len -= off; + } switch (act) { @@ -4068,7 +4069,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, __skb_push(skb, mac_len); break; case XDP_PASS: - metalen = xdp.data - xdp.data_meta; + metalen = xdp->data - xdp->data_meta; if (metalen) skb_metadata_set(skb, metalen); break; @@ -4118,17 +4119,19 @@ static struct static_key generic_xdp_needed __read_mostly; int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) { if (xdp_prog) { - u32 act = netif_receive_generic_xdp(skb, xdp_prog); + struct xdp_buff xdp; + u32 act; int err; + act = netif_receive_generic_xdp(skb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: err = xdp_do_generic_redirect(skb->dev, skb, - xdp_prog); + &xdp, xdp_prog); if (err) goto out_redir; - /* fallthru to submit skb */ + break; case XDP_TX: generic_xdp_tx(skb, xdp_prog); break; diff --git a/net/core/filter.c b/net/core/filter.c index 40d4bbb4508d..120bc8a202d9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -59,6 +59,7 @@ #include #include #include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -2973,13 +2974,14 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, + struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; - struct net_device *fwd = NULL; u32 index = ri->ifindex; + void *fwd = NULL; int err = 0; ri->ifindex = 0; @@ -3001,6 +3003,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) goto err; skb->dev = fwd; + generic_xdp_tx(skb, xdp_prog); + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { + struct xdp_sock *xs = fwd; + + err = xsk_generic_rcv(xs, xdp); + if (err) + goto err; + consume_skb(skb); } else { /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; @@ -3015,7 +3025,7 @@ err: } int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); u32 index = ri->ifindex; @@ -3023,7 +3033,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, int err = 0; if (ri->map) - return xdp_do_generic_redirect_map(dev, skb, xdp_prog); + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog); ri->ifindex = 0; fwd = dev_get_by_index_rcu(dev_net(dev), index); @@ -3037,6 +3047,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, skb->dev = fwd; _trace_xdp_redirect(dev, xdp_prog, index); + generic_xdp_tx(skb, xdp_prog); return 0; err: _trace_xdp_redirect_err(dev, xdp_prog, index, err); -- cgit v1.2.3 From 865b03f21162e4edfda51fc08693c864b1d4fdaf Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Wed, 2 May 2018 13:01:33 +0200 Subject: dev: packet: make packet_direct_xmit a common function The new dev_direct_xmit will be used by AF_XDP in later commits. Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 1 + net/core/dev.c | 38 ++++++++++++++++++++++++++++++++++++++ net/packet/af_packet.c | 42 +++++------------------------------------- 3 files changed, 44 insertions(+), 37 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 366c32891158..a30435118530 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2486,6 +2486,7 @@ void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); int dev_queue_xmit(struct sk_buff *skb); int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv); +int dev_direct_xmit(struct sk_buff *skb, u16 queue_id); int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); void unregister_netdevice_many(struct list_head *head); diff --git a/net/core/dev.c b/net/core/dev.c index aea36b5a2fed..d3fdc86516e8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3625,6 +3625,44 @@ int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) } EXPORT_SYMBOL(dev_queue_xmit_accel); +int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) +{ + struct net_device *dev = skb->dev; + struct sk_buff *orig_skb = skb; + struct netdev_queue *txq; + int ret = NETDEV_TX_BUSY; + bool again = false; + + if (unlikely(!netif_running(dev) || + !netif_carrier_ok(dev))) + goto drop; + + skb = validate_xmit_skb_list(skb, dev, &again); + if (skb != orig_skb) + goto drop; + + skb_set_queue_mapping(skb, queue_id); + txq = skb_get_tx_queue(dev, skb); + + local_bh_disable(); + + HARD_TX_LOCK(dev, txq, smp_processor_id()); + if (!netif_xmit_frozen_or_drv_stopped(txq)) + ret = netdev_start_xmit(skb, dev, txq, false); + HARD_TX_UNLOCK(dev, txq); + + local_bh_enable(); + + if (!dev_xmit_complete(ret)) + kfree_skb(skb); + + return ret; +drop: + atomic_long_inc(&dev->tx_dropped); + kfree_skb_list(skb); + return NET_XMIT_DROP; +} +EXPORT_SYMBOL(dev_direct_xmit); /************************************************************************* * Receiver routines diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 01f3515cada0..611a26d5235c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -209,7 +209,7 @@ static void prb_clear_rxhash(struct tpacket_kbdq_core *, static void prb_fill_vlan_info(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void packet_flush_mclist(struct sock *sk); -static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb); +static u16 packet_pick_tx_queue(struct sk_buff *skb); struct packet_skb_cb { union { @@ -243,40 +243,7 @@ static void __fanout_link(struct sock *sk, struct packet_sock *po); static int packet_direct_xmit(struct sk_buff *skb) { - struct net_device *dev = skb->dev; - struct sk_buff *orig_skb = skb; - struct netdev_queue *txq; - int ret = NETDEV_TX_BUSY; - bool again = false; - - if (unlikely(!netif_running(dev) || - !netif_carrier_ok(dev))) - goto drop; - - skb = validate_xmit_skb_list(skb, dev, &again); - if (skb != orig_skb) - goto drop; - - packet_pick_tx_queue(dev, skb); - txq = skb_get_tx_queue(dev, skb); - - local_bh_disable(); - - HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_xmit_frozen_or_drv_stopped(txq)) - ret = netdev_start_xmit(skb, dev, txq, false); - HARD_TX_UNLOCK(dev, txq); - - local_bh_enable(); - - if (!dev_xmit_complete(ret)) - kfree_skb(skb); - - return ret; -drop: - atomic_long_inc(&dev->tx_dropped); - kfree_skb_list(skb); - return NET_XMIT_DROP; + return dev_direct_xmit(skb, packet_pick_tx_queue(skb)); } static struct net_device *packet_cached_dev_get(struct packet_sock *po) @@ -313,8 +280,9 @@ static u16 __packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) return (u16) raw_smp_processor_id() % dev->real_num_tx_queues; } -static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) +static u16 packet_pick_tx_queue(struct sk_buff *skb) { + struct net_device *dev = skb->dev; const struct net_device_ops *ops = dev->netdev_ops; u16 queue_index; @@ -326,7 +294,7 @@ static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb) queue_index = __packet_pick_tx_queue(dev, skb); } - skb_set_queue_mapping(skb, queue_index); + return queue_index; } /* __register_prot_hook must be invoked through register_prot_hook -- cgit v1.2.3 From b390134c2423c557cac844ee3b7cb52dc1cfab02 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:12 +0200 Subject: bpf: prefix cbpf internal helpers with bpf_ No change in functionality, just remove the '__' prefix and replace it with a 'bpf_' prefix instead. We later on add a couple of more helpers for cBPF and keeping the scheme with '__' is suboptimal there. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 120bc8a202d9..c33595a8d604 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -113,12 +113,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) } EXPORT_SYMBOL(sk_filter_trim_cap); -BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb) +BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb) { return skb_get_poff(skb); } -BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) +BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; @@ -138,7 +138,7 @@ BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) return 0; } -BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) +BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; @@ -162,13 +162,13 @@ BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; } -BPF_CALL_0(__get_raw_cpu_id) +BPF_CALL_0(bpf_get_raw_cpu_id) { return raw_smp_processor_id(); } static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { - .func = __get_raw_cpu_id, + .func = bpf_get_raw_cpu_id, .gpl_only = false, .ret_type = RET_INTEGER, }; @@ -318,16 +318,16 @@ static bool convert_bpf_extensions(struct sock_filter *fp, /* Emit call(arg1=CTX, arg2=A, arg3=X) */ switch (fp->k) { case SKF_AD_OFF + SKF_AD_PAY_OFFSET: - *insn = BPF_EMIT_CALL(__skb_get_pay_offset); + *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset); break; case SKF_AD_OFF + SKF_AD_NLATTR: - *insn = BPF_EMIT_CALL(__skb_get_nlattr); + *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr); break; case SKF_AD_OFF + SKF_AD_NLATTR_NEST: - *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest); + *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest); break; case SKF_AD_OFF + SKF_AD_CPU: - *insn = BPF_EMIT_CALL(__get_raw_cpu_id); + *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id); break; case SKF_AD_OFF + SKF_AD_RANDOM: *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); -- cgit v1.2.3 From 93731ef086cee90af594e62874bb98ae6d6eee91 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:13 +0200 Subject: bpf: migrate ebpf ld_abs/ld_ind tests to test_verifier Remove all eBPF tests involving LD_ABS/LD_IND from test_bpf.ko. Reason is that the eBPF tests from test_bpf module do not go via BPF verifier and therefore any instruction rewrites from verifier cannot take place. Therefore, move them into test_verifier which runs out of user space, so that verfier can rewrite LD_ABS/LD_IND internally in upcoming patches. It will have the same effect since runtime tests are also performed from there. This also allows to finally unexport bpf_skb_vlan_{push,pop}_proto and keep it internal to core kernel. Additionally, also add further cBPF LD_ABS/LD_IND test coverage into test_bpf.ko suite. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 - lib/test_bpf.c | 570 +++++++++++++++++----------- net/core/filter.c | 6 +- tools/testing/selftests/bpf/test_verifier.c | 266 ++++++++++++- 4 files changed, 619 insertions(+), 225 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 68ecdb4eea09..d0e3d7ef36a8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -714,8 +714,6 @@ extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; -extern const struct bpf_func_proto bpf_skb_vlan_push_proto; -extern const struct bpf_func_proto bpf_skb_vlan_pop_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 8e157806df7a..317f231462d4 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -386,116 +386,6 @@ static int bpf_fill_ld_abs_get_processor_id(struct bpf_test *self) return 0; } -#define PUSH_CNT 68 -/* test: {skb->data[0], vlan_push} x 68 + {skb->data[0], vlan_pop} x 68 */ -static int bpf_fill_ld_abs_vlan_push_pop(struct bpf_test *self) -{ - unsigned int len = BPF_MAXINSNS; - struct bpf_insn *insn; - int i = 0, j, k = 0; - - insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL); - if (!insn) - return -ENOMEM; - - insn[i++] = BPF_MOV64_REG(R6, R1); -loop: - for (j = 0; j < PUSH_CNT; j++) { - insn[i++] = BPF_LD_ABS(BPF_B, 0); - insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0x34, len - i - 2); - i++; - insn[i++] = BPF_MOV64_REG(R1, R6); - insn[i++] = BPF_MOV64_IMM(R2, 1); - insn[i++] = BPF_MOV64_IMM(R3, 2); - insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, - bpf_skb_vlan_push_proto.func - __bpf_call_base); - insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0, len - i - 2); - i++; - } - - for (j = 0; j < PUSH_CNT; j++) { - insn[i++] = BPF_LD_ABS(BPF_B, 0); - insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0x34, len - i - 2); - i++; - insn[i++] = BPF_MOV64_REG(R1, R6); - insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, - bpf_skb_vlan_pop_proto.func - __bpf_call_base); - insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0, len - i - 2); - i++; - } - if (++k < 5) - goto loop; - - for (; i < len - 1; i++) - insn[i] = BPF_ALU32_IMM(BPF_MOV, R0, 0xbef); - - insn[len - 1] = BPF_EXIT_INSN(); - - self->u.ptr.insns = insn; - self->u.ptr.len = len; - - return 0; -} - -static int bpf_fill_ld_abs_vlan_push_pop2(struct bpf_test *self) -{ - struct bpf_insn *insn; - - insn = kmalloc_array(16, sizeof(*insn), GFP_KERNEL); - if (!insn) - return -ENOMEM; - - /* Due to func address being non-const, we need to - * assemble this here. - */ - insn[0] = BPF_MOV64_REG(R6, R1); - insn[1] = BPF_LD_ABS(BPF_B, 0); - insn[2] = BPF_LD_ABS(BPF_H, 0); - insn[3] = BPF_LD_ABS(BPF_W, 0); - insn[4] = BPF_MOV64_REG(R7, R6); - insn[5] = BPF_MOV64_IMM(R6, 0); - insn[6] = BPF_MOV64_REG(R1, R7); - insn[7] = BPF_MOV64_IMM(R2, 1); - insn[8] = BPF_MOV64_IMM(R3, 2); - insn[9] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, - bpf_skb_vlan_push_proto.func - __bpf_call_base); - insn[10] = BPF_MOV64_REG(R6, R7); - insn[11] = BPF_LD_ABS(BPF_B, 0); - insn[12] = BPF_LD_ABS(BPF_H, 0); - insn[13] = BPF_LD_ABS(BPF_W, 0); - insn[14] = BPF_MOV64_IMM(R0, 42); - insn[15] = BPF_EXIT_INSN(); - - self->u.ptr.insns = insn; - self->u.ptr.len = 16; - - return 0; -} - -static int bpf_fill_jump_around_ld_abs(struct bpf_test *self) -{ - unsigned int len = BPF_MAXINSNS; - struct bpf_insn *insn; - int i = 0; - - insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL); - if (!insn) - return -ENOMEM; - - insn[i++] = BPF_MOV64_REG(R6, R1); - insn[i++] = BPF_LD_ABS(BPF_B, 0); - insn[i] = BPF_JMP_IMM(BPF_JEQ, R0, 10, len - i - 2); - i++; - while (i < len - 1) - insn[i++] = BPF_LD_ABS(BPF_B, 1); - insn[i] = BPF_EXIT_INSN(); - - self->u.ptr.insns = insn; - self->u.ptr.len = len; - - return 0; -} - static int __bpf_fill_stxdw(struct bpf_test *self, int size) { unsigned int len = BPF_MAXINSNS; @@ -1987,40 +1877,6 @@ static struct bpf_test tests[] = { { }, { { 0, -1 } } }, - { - "INT: DIV + ABS", - .u.insns_int = { - BPF_ALU64_REG(BPF_MOV, R6, R1), - BPF_LD_ABS(BPF_B, 3), - BPF_ALU64_IMM(BPF_MOV, R2, 2), - BPF_ALU32_REG(BPF_DIV, R0, R2), - BPF_ALU64_REG(BPF_MOV, R8, R0), - BPF_LD_ABS(BPF_B, 4), - BPF_ALU64_REG(BPF_ADD, R8, R0), - BPF_LD_IND(BPF_B, R8, -70), - BPF_EXIT_INSN(), - }, - INTERNAL, - { 10, 20, 30, 40, 50 }, - { { 4, 0 }, { 5, 10 } } - }, - { - /* This one doesn't go through verifier, but is just raw insn - * as opposed to cBPF tests from here. Thus div by 0 tests are - * done in test_verifier in BPF kselftests. - */ - "INT: DIV by -1", - .u.insns_int = { - BPF_ALU64_REG(BPF_MOV, R6, R1), - BPF_ALU64_IMM(BPF_MOV, R7, -1), - BPF_LD_ABS(BPF_B, 3), - BPF_ALU32_REG(BPF_DIV, R0, R7), - BPF_EXIT_INSN(), - }, - INTERNAL, - { 10, 20, 30, 40, 50 }, - { { 3, 0 }, { 4, 0 } } - }, { "check: missing ret", .u.insns = { @@ -2383,50 +2239,6 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } } }, - { - "nmap reduced", - .u.insns_int = { - BPF_MOV64_REG(R6, R1), - BPF_LD_ABS(BPF_H, 12), - BPF_JMP_IMM(BPF_JNE, R0, 0x806, 28), - BPF_LD_ABS(BPF_H, 12), - BPF_JMP_IMM(BPF_JNE, R0, 0x806, 26), - BPF_MOV32_IMM(R0, 18), - BPF_STX_MEM(BPF_W, R10, R0, -64), - BPF_LDX_MEM(BPF_W, R7, R10, -64), - BPF_LD_IND(BPF_W, R7, 14), - BPF_STX_MEM(BPF_W, R10, R0, -60), - BPF_MOV32_IMM(R0, 280971478), - BPF_STX_MEM(BPF_W, R10, R0, -56), - BPF_LDX_MEM(BPF_W, R7, R10, -56), - BPF_LDX_MEM(BPF_W, R0, R10, -60), - BPF_ALU32_REG(BPF_SUB, R0, R7), - BPF_JMP_IMM(BPF_JNE, R0, 0, 15), - BPF_LD_ABS(BPF_H, 12), - BPF_JMP_IMM(BPF_JNE, R0, 0x806, 13), - BPF_MOV32_IMM(R0, 22), - BPF_STX_MEM(BPF_W, R10, R0, -56), - BPF_LDX_MEM(BPF_W, R7, R10, -56), - BPF_LD_IND(BPF_H, R7, 14), - BPF_STX_MEM(BPF_W, R10, R0, -52), - BPF_MOV32_IMM(R0, 17366), - BPF_STX_MEM(BPF_W, R10, R0, -48), - BPF_LDX_MEM(BPF_W, R7, R10, -48), - BPF_LDX_MEM(BPF_W, R0, R10, -52), - BPF_ALU32_REG(BPF_SUB, R0, R7), - BPF_JMP_IMM(BPF_JNE, R0, 0, 2), - BPF_MOV32_IMM(R0, 256), - BPF_EXIT_INSN(), - BPF_MOV32_IMM(R0, 0), - BPF_EXIT_INSN(), - }, - INTERNAL, - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x06, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0x10, 0xbf, 0x48, 0xd6, 0x43, 0xd6}, - { { 38, 256 } }, - .stack_depth = 64, - }, /* BPF_ALU | BPF_MOV | BPF_X */ { "ALU_MOV_X: dst = 2", @@ -5485,22 +5297,6 @@ static struct bpf_test tests[] = { { { 1, 0xbee } }, .fill_helper = bpf_fill_ld_abs_get_processor_id, }, - { - "BPF_MAXINSNS: ld_abs+vlan_push/pop", - { }, - INTERNAL, - { 0x34 }, - { { ETH_HLEN, 0xbef } }, - .fill_helper = bpf_fill_ld_abs_vlan_push_pop, - }, - { - "BPF_MAXINSNS: jump around ld_abs", - { }, - INTERNAL, - { 10, 11 }, - { { 2, 10 } }, - .fill_helper = bpf_fill_jump_around_ld_abs, - }, /* * LD_IND / LD_ABS on fragmented SKBs */ @@ -5682,6 +5478,53 @@ static struct bpf_test tests[] = { { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, { {0x40, 0x05 } }, }, + { + "LD_IND byte positive offset, all ff", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3e), + BPF_STMT(BPF_LD | BPF_IND | BPF_B, 0x1), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0xff, [0x3d] = 0xff, [0x3e] = 0xff, [0x3f] = 0xff }, + { {0x40, 0xff } }, + }, + { + "LD_IND byte positive offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3e), + BPF_STMT(BPF_LD | BPF_IND | BPF_B, 0x1), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 }, }, + }, + { + "LD_IND byte negative offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3e), + BPF_STMT(BPF_LD | BPF_IND | BPF_B, -0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 } }, + }, + { + "LD_IND byte negative offset, multiple calls", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3b), + BPF_STMT(BPF_LD | BPF_IND | BPF_B, SKF_LL_OFF + 1), + BPF_STMT(BPF_LD | BPF_IND | BPF_B, SKF_LL_OFF + 2), + BPF_STMT(BPF_LD | BPF_IND | BPF_B, SKF_LL_OFF + 3), + BPF_STMT(BPF_LD | BPF_IND | BPF_B, SKF_LL_OFF + 4), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0x82 }, }, + }, { "LD_IND halfword positive offset", .u.insns = { @@ -5730,6 +5573,39 @@ static struct bpf_test tests[] = { }, { {0x40, 0x66cc } }, }, + { + "LD_IND halfword positive offset, all ff", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3d), + BPF_STMT(BPF_LD | BPF_IND | BPF_H, 0x1), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0xff, [0x3d] = 0xff, [0x3e] = 0xff, [0x3f] = 0xff }, + { {0x40, 0xffff } }, + }, + { + "LD_IND halfword positive offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3e), + BPF_STMT(BPF_LD | BPF_IND | BPF_H, 0x1), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 }, }, + }, + { + "LD_IND halfword negative offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3e), + BPF_STMT(BPF_LD | BPF_IND | BPF_H, -0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 } }, + }, { "LD_IND word positive offset", .u.insns = { @@ -5820,6 +5696,39 @@ static struct bpf_test tests[] = { }, { {0x40, 0x66cc77dd } }, }, + { + "LD_IND word positive offset, all ff", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3b), + BPF_STMT(BPF_LD | BPF_IND | BPF_W, 0x1), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0xff, [0x3d] = 0xff, [0x3e] = 0xff, [0x3f] = 0xff }, + { {0x40, 0xffffffff } }, + }, + { + "LD_IND word positive offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3e), + BPF_STMT(BPF_LD | BPF_IND | BPF_W, 0x1), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 }, }, + }, + { + "LD_IND word negative offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_IMM, 0x3e), + BPF_STMT(BPF_LD | BPF_IND | BPF_W, -0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 } }, + }, { "LD_ABS byte", .u.insns = { @@ -5837,6 +5746,68 @@ static struct bpf_test tests[] = { }, { {0x40, 0xcc } }, }, + { + "LD_ABS byte positive offset, all ff", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_B, 0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0xff, [0x3d] = 0xff, [0x3e] = 0xff, [0x3f] = 0xff }, + { {0x40, 0xff } }, + }, + { + "LD_ABS byte positive offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_B, 0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 }, }, + }, + { + "LD_ABS byte negative offset, out of bounds load", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_B, -1), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC | FLAG_EXPECTED_FAIL, + .expected_errcode = -EINVAL, + }, + { + "LD_ABS byte negative offset, in bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0x82 }, }, + }, + { + "LD_ABS byte negative offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 }, }, + }, + { + "LD_ABS byte negative offset, multiple calls", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3c), + BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3d), + BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3e), + BPF_STMT(BPF_LD | BPF_ABS | BPF_B, SKF_LL_OFF + 0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0x82 }, }, + }, { "LD_ABS halfword", .u.insns = { @@ -5871,6 +5842,55 @@ static struct bpf_test tests[] = { }, { {0x40, 0x99ff } }, }, + { + "LD_ABS halfword positive offset, all ff", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_H, 0x3e), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0xff, [0x3d] = 0xff, [0x3e] = 0xff, [0x3f] = 0xff }, + { {0x40, 0xffff } }, + }, + { + "LD_ABS halfword positive offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_H, 0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 }, }, + }, + { + "LD_ABS halfword negative offset, out of bounds load", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_H, -1), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC | FLAG_EXPECTED_FAIL, + .expected_errcode = -EINVAL, + }, + { + "LD_ABS halfword negative offset, in bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_H, SKF_LL_OFF + 0x3e), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0x1982 }, }, + }, + { + "LD_ABS halfword negative offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_H, SKF_LL_OFF + 0x3e), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 }, }, + }, { "LD_ABS word", .u.insns = { @@ -5939,6 +5959,140 @@ static struct bpf_test tests[] = { }, { {0x40, 0x88ee99ff } }, }, + { + "LD_ABS word positive offset, all ff", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0x3c), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0xff, [0x3d] = 0xff, [0x3e] = 0xff, [0x3f] = 0xff }, + { {0x40, 0xffffffff } }, + }, + { + "LD_ABS word positive offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_W, 0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 }, }, + }, + { + "LD_ABS word negative offset, out of bounds load", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_W, -1), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC | FLAG_EXPECTED_FAIL, + .expected_errcode = -EINVAL, + }, + { + "LD_ABS word negative offset, in bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_W, SKF_LL_OFF + 0x3c), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0x25051982 }, }, + }, + { + "LD_ABS word negative offset, out of bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_ABS | BPF_W, SKF_LL_OFF + 0x3c), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x3f, 0 }, }, + }, + { + "LDX_MSH standalone, preserved A", + .u.insns = { + BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3c), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0xffeebbaa }, }, + }, + { + "LDX_MSH standalone, preserved A 2", + .u.insns = { + BPF_STMT(BPF_LD | BPF_IMM, 0x175e9d63), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3c), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3d), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3e), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3f), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0x175e9d63 }, }, + }, + { + "LDX_MSH standalone, test result 1", + .u.insns = { + BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3c), + BPF_STMT(BPF_MISC | BPF_TXA, 0), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0x14 }, }, + }, + { + "LDX_MSH standalone, test result 2", + .u.insns = { + BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x3e), + BPF_STMT(BPF_MISC | BPF_TXA, 0), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0x24 }, }, + }, + { + "LDX_MSH standalone, negative offset", + .u.insns = { + BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, -1), + BPF_STMT(BPF_MISC | BPF_TXA, 0), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0 }, }, + }, + { + "LDX_MSH standalone, negative offset 2", + .u.insns = { + BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, SKF_LL_OFF + 0x3e), + BPF_STMT(BPF_MISC | BPF_TXA, 0), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0x24 }, }, + }, + { + "LDX_MSH standalone, out of bounds", + .u.insns = { + BPF_STMT(BPF_LD | BPF_IMM, 0xffeebbaa), + BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0x40), + BPF_STMT(BPF_MISC | BPF_TXA, 0), + BPF_STMT(BPF_RET | BPF_A, 0x0), + }, + CLASSIC, + { [0x3c] = 0x25, [0x3d] = 0x05, [0x3e] = 0x19, [0x3f] = 0x82 }, + { {0x40, 0 }, }, + }, /* * verify that the interpreter or JIT correctly sets A and X * to 0. @@ -6127,14 +6281,6 @@ static struct bpf_test tests[] = { {}, { {0x1, 0x42 } }, }, - { - "LD_ABS with helper changing skb data", - { }, - INTERNAL, - { 0x34 }, - { { ETH_HLEN, 42 } }, - .fill_helper = bpf_fill_ld_abs_vlan_push_pop2, - }, /* Checking interpreter vs JIT wrt signed extended imms. */ { "JNE signed compare, test 1", diff --git a/net/core/filter.c b/net/core/filter.c index c33595a8d604..865500f6180d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2181,7 +2181,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, return ret; } -const struct bpf_func_proto bpf_skb_vlan_push_proto = { +static const struct bpf_func_proto bpf_skb_vlan_push_proto = { .func = bpf_skb_vlan_push, .gpl_only = false, .ret_type = RET_INTEGER, @@ -2189,7 +2189,6 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; -EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { @@ -2203,13 +2202,12 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) return ret; } -const struct bpf_func_proto bpf_skb_vlan_pop_proto = { +static const struct bpf_func_proto bpf_skb_vlan_pop_proto = { .func = bpf_skb_vlan_pop, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; -EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) { diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 1acafe26498b..275b4570b5b8 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -47,7 +47,7 @@ # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #endif -#define MAX_INSNS 512 +#define MAX_INSNS BPF_MAXINSNS #define MAX_FIXUPS 8 #define MAX_NR_MAPS 4 #define POINTER_VALUE 0xcafe4all @@ -77,6 +77,8 @@ struct bpf_test { } result, result_unpriv; enum bpf_prog_type prog_type; uint8_t flags; + __u8 data[TEST_DATA_LEN]; + void (*fill_helper)(struct bpf_test *self); }; /* Note we want this to be 64 bit aligned so that the end of our array is @@ -94,6 +96,62 @@ struct other_val { long long bar; }; +static void bpf_fill_ld_abs_vlan_push_pop(struct bpf_test *self) +{ + /* test: {skb->data[0], vlan_push} x 68 + {skb->data[0], vlan_pop} x 68 */ +#define PUSH_CNT 51 + unsigned int len = BPF_MAXINSNS; + struct bpf_insn *insn = self->insns; + int i = 0, j, k = 0; + + insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); +loop: + for (j = 0; j < PUSH_CNT; j++) { + insn[i++] = BPF_LD_ABS(BPF_B, 0); + insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x34, len - i - 2); + i++; + insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + insn[i++] = BPF_MOV64_IMM(BPF_REG_2, 1); + insn[i++] = BPF_MOV64_IMM(BPF_REG_3, 2); + insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_vlan_push), + insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, len - i - 2); + i++; + } + + for (j = 0; j < PUSH_CNT; j++) { + insn[i++] = BPF_LD_ABS(BPF_B, 0); + insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x34, len - i - 2); + i++; + insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_vlan_pop), + insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, len - i - 2); + i++; + } + if (++k < 5) + goto loop; + + for (; i < len - 1; i++) + insn[i] = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 0xbef); + insn[len - 1] = BPF_EXIT_INSN(); +} + +static void bpf_fill_jump_around_ld_abs(struct bpf_test *self) +{ + struct bpf_insn *insn = self->insns; + unsigned int len = BPF_MAXINSNS; + int i = 0; + + insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + insn[i++] = BPF_LD_ABS(BPF_B, 0); + insn[i] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 10, len - i - 2); + i++; + while (i < len - 1) + insn[i++] = BPF_LD_ABS(BPF_B, 1); + insn[i] = BPF_EXIT_INSN(); +} + static struct bpf_test tests[] = { { "add+sub+mul", @@ -11725,6 +11783,197 @@ static struct bpf_test tests[] = { .result = ACCEPT, .prog_type = BPF_PROG_TYPE_TRACEPOINT, }, + { + "ld_abs: invalid op 1", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LD_ABS(BPF_DW, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = REJECT, + .errstr = "unknown opcode", + }, + { + "ld_abs: invalid op 2", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 256), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LD_IND(BPF_DW, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = REJECT, + .errstr = "unknown opcode", + }, + { + "ld_abs: nmap reduced", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LD_ABS(BPF_H, 12), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x806, 28), + BPF_LD_ABS(BPF_H, 12), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x806, 26), + BPF_MOV32_IMM(BPF_REG_0, 18), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -64), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -64), + BPF_LD_IND(BPF_W, BPF_REG_7, 14), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -60), + BPF_MOV32_IMM(BPF_REG_0, 280971478), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -56), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -56), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -60), + BPF_ALU32_REG(BPF_SUB, BPF_REG_0, BPF_REG_7), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 15), + BPF_LD_ABS(BPF_H, 12), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x806, 13), + BPF_MOV32_IMM(BPF_REG_0, 22), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -56), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -56), + BPF_LD_IND(BPF_H, BPF_REG_7, 14), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -52), + BPF_MOV32_IMM(BPF_REG_0, 17366), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -48), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -48), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -52), + BPF_ALU32_REG(BPF_SUB, BPF_REG_0, BPF_REG_7), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV32_IMM(BPF_REG_0, 256), + BPF_EXIT_INSN(), + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .data = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x06, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0x10, 0xbf, 0x48, 0xd6, 0x43, 0xd6, + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 256, + }, + { + "ld_abs: div + abs, test 1", + .insns = { + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1), + BPF_LD_ABS(BPF_B, 3), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_2, 2), + BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_2), + BPF_ALU64_REG(BPF_MOV, BPF_REG_8, BPF_REG_0), + BPF_LD_ABS(BPF_B, 4), + BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_0), + BPF_LD_IND(BPF_B, BPF_REG_8, -70), + BPF_EXIT_INSN(), + }, + .data = { + 10, 20, 30, 40, 50, + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 10, + }, + { + "ld_abs: div + abs, test 2", + .insns = { + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1), + BPF_LD_ABS(BPF_B, 3), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_2, 2), + BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_2), + BPF_ALU64_REG(BPF_MOV, BPF_REG_8, BPF_REG_0), + BPF_LD_ABS(BPF_B, 128), + BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_0), + BPF_LD_IND(BPF_B, BPF_REG_8, -70), + BPF_EXIT_INSN(), + }, + .data = { + 10, 20, 30, 40, 50, + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0, + }, + { + "ld_abs: div + abs, test 3", + .insns = { + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_7, 0), + BPF_LD_ABS(BPF_B, 3), + BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_7), + BPF_EXIT_INSN(), + }, + .data = { + 10, 20, 30, 40, 50, + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0, + }, + { + "ld_abs: div + abs, test 4", + .insns = { + BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1), + BPF_ALU64_IMM(BPF_MOV, BPF_REG_7, 0), + BPF_LD_ABS(BPF_B, 256), + BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_7), + BPF_EXIT_INSN(), + }, + .data = { + 10, 20, 30, 40, 50, + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0, + }, + { + "ld_abs: vlan + abs, test 1", + .insns = { }, + .data = { + 0x34, + }, + .fill_helper = bpf_fill_ld_abs_vlan_push_pop, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 0xbef, + }, + { + "ld_abs: vlan + abs, test 2", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LD_ABS(BPF_B, 0), + BPF_LD_ABS(BPF_H, 0), + BPF_LD_ABS(BPF_W, 0), + BPF_MOV64_REG(BPF_REG_7, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_6, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_MOV64_IMM(BPF_REG_2, 1), + BPF_MOV64_IMM(BPF_REG_3, 2), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_vlan_push), + BPF_MOV64_REG(BPF_REG_6, BPF_REG_7), + BPF_LD_ABS(BPF_B, 0), + BPF_LD_ABS(BPF_H, 0), + BPF_LD_ABS(BPF_W, 0), + BPF_MOV64_IMM(BPF_REG_0, 42), + BPF_EXIT_INSN(), + }, + .data = { + 0x34, + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 42, + }, + { + "ld_abs: jump around ld_abs", + .insns = { }, + .data = { + 10, 11, + }, + .fill_helper = bpf_fill_jump_around_ld_abs, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = ACCEPT, + .retval = 10, + }, }; static int probe_filter_length(const struct bpf_insn *fp) @@ -11828,7 +12077,7 @@ static int create_map_in_map(void) return outer_map_fd; } -static char bpf_vlog[32768]; +static char bpf_vlog[UINT_MAX >> 8]; static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog, int *map_fds) @@ -11839,6 +12088,9 @@ static void do_test_fixup(struct bpf_test *test, struct bpf_insn *prog, int *fixup_prog = test->fixup_prog; int *fixup_map_in_map = test->fixup_map_in_map; + if (test->fill_helper) + test->fill_helper(test); + /* Allocating HTs with 1 elem is fine here, since we only test * for verifier and not do a runtime lookup, so the only thing * that really matters is value size in this case. @@ -11888,10 +12140,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv, int *passes, int *errors) { int fd_prog, expected_ret, reject_from_alignment; + int prog_len, prog_type = test->prog_type; struct bpf_insn *prog = test->insns; - int prog_len = probe_filter_length(prog); - char data_in[TEST_DATA_LEN] = {}; - int prog_type = test->prog_type; int map_fds[MAX_NR_MAPS]; const char *expected_err; uint32_t retval; @@ -11901,6 +12151,7 @@ static void do_test_single(struct bpf_test *test, bool unpriv, map_fds[i] = -1; do_test_fixup(test, prog, map_fds); + prog_len = probe_filter_length(prog); fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER, prog, prog_len, test->flags & F_LOAD_WITH_STRICT_ALIGNMENT, @@ -11940,8 +12191,9 @@ static void do_test_single(struct bpf_test *test, bool unpriv, } if (fd_prog >= 0) { - err = bpf_prog_test_run(fd_prog, 1, data_in, sizeof(data_in), - NULL, NULL, &retval, NULL); + err = bpf_prog_test_run(fd_prog, 1, test->data, + sizeof(test->data), NULL, NULL, + &retval, NULL); if (err && errno != 524/*ENOTSUPP*/ && errno != EPERM) { printf("Unexpected bpf_prog_test_run error\n"); goto fail_log; -- cgit v1.2.3 From e0cea7ce988cf48cc4052235d2ad2550b3bc4fa0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:14 +0200 Subject: bpf: implement ld_abs/ld_ind in native bpf The main part of this work is to finally allow removal of LD_ABS and LD_IND from the BPF core by reimplementing them through native eBPF instead. Both LD_ABS/LD_IND were carried over from cBPF and keeping them around in native eBPF caused way more trouble than actually worth it. To just list some of the security issues in the past: * fdfaf64e7539 ("x86: bpf_jit: support negative offsets") * 35607b02dbef ("sparc: bpf_jit: fix loads from negative offsets") * e0ee9c12157d ("x86: bpf_jit: fix two bugs in eBPF JIT compiler") * 07aee9439454 ("bpf, sparc: fix usage of wrong reg for load_skb_regs after call") * 6d59b7dbf72e ("bpf, s390x: do not reload skb pointers in non-skb context") * 87338c8e2cbb ("bpf, ppc64: do not reload skb pointers in non-skb context") For programs in native eBPF, LD_ABS/LD_IND are pretty much legacy these days due to their limitations and more efficient/flexible alternatives that have been developed over time such as direct packet access. LD_ABS/LD_IND only cover 1/2/4 byte loads into a register, the load happens in host endianness and its exception handling can yield unexpected behavior. The latter is explained in depth in f6b1b3bf0d5f ("bpf: fix subprog verifier bypass by div/mod by 0 exception") with similar cases of exceptions we had. In native eBPF more recent program types will disable LD_ABS/LD_IND altogether through may_access_skb() in verifier, and given the limitations in terms of exception handling, it's also disabled in programs that use BPF to BPF calls. In terms of cBPF, the LD_ABS/LD_IND is used in networking programs to access packet data. It is not used in seccomp-BPF but programs that use it for socket filtering or reuseport for demuxing with cBPF. This is mostly relevant for applications that have not yet migrated to native eBPF. The main complexity and source of bugs in LD_ABS/LD_IND is coming from their implementation in the various JITs. Most of them keep the model around from cBPF times by implementing a fastpath written in asm. They use typically two from the BPF program hidden CPU registers for caching the skb's headlen (skb->len - skb->data_len) and skb->data. Throughout the JIT phase this requires to keep track whether LD_ABS/LD_IND are used and if so, the two registers need to be recached each time a BPF helper would change the underlying packet data in native eBPF case. At least in eBPF case, available CPU registers are rare and the additional exit path out of the asm written JIT helper makes it also inflexible since not all parts of the JITer are in control from plain C. A LD_ABS/LD_IND implementation in eBPF therefore allows to significantly reduce the complexity in JITs with comparable performance results for them, e.g.: test_bpf tcpdump port 22 tcpdump complex x64 - before 15 21 10 14 19 18 - after 7 10 10 7 10 15 arm64 - before 40 91 92 40 91 151 - after 51 64 73 51 62 113 For cBPF we now track any usage of LD_ABS/LD_IND in bpf_convert_filter() and cache the skb's headlen and data in the cBPF prologue. The BPF_REG_TMP gets remapped from R8 to R2 since it's mainly just used as a local temporary variable. This allows to shrink the image on x86_64 also for seccomp programs slightly since mapping to %rsi is not an ereg. In callee-saved R8 and R9 we now track skb data and headlen, respectively. For normal prologue emission in the JITs this does not add any extra instructions since R8, R9 are pushed to stack in any case from eBPF side. cBPF uses the convert_bpf_ld_abs() emitter which probes the fast path inline already and falls back to bpf_skb_load_helper_{8,16,32}() helper relying on the cached skb data and headlen as well. R8 and R9 never need to be reloaded due to bpf_helper_changes_pkt_data() since all skb access in cBPF is read-only. Then, for the case of native eBPF, we use the bpf_gen_ld_abs() emitter, which calls the bpf_skb_load_helper_{8,16,32}_no_cache() helper unconditionally, does neither cache skb data and headlen nor has an inlined fast path. The reason for the latter is that native eBPF does not have any extra registers available anyway, but even if there were, it avoids any reload of skb data and headlen in the first place. Additionally, for the negative offsets, we provide an alternative bpf_skb_load_bytes_relative() helper in eBPF which operates similarly as bpf_skb_load_bytes() and allows for more flexibility. Tested myself on x64, arm64, s390x, from Sandipan on ppc64. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 + include/linux/filter.h | 4 +- kernel/bpf/core.c | 96 ++------------------ kernel/bpf/verifier.c | 24 +++++ net/core/filter.c | 236 ++++++++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 262 insertions(+), 100 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d0e3d7ef36a8..0e00a13ff01b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -235,6 +235,8 @@ struct bpf_verifier_ops { struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); + int (*gen_ld_abs)(const struct bpf_insn *orig, + struct bpf_insn *insn_buf); u32 (*convert_ctx_access)(enum bpf_access_type type, const struct bpf_insn *src, struct bpf_insn *dst, diff --git a/include/linux/filter.h b/include/linux/filter.h index b7f81e3a70cb..da7e16523128 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -47,7 +47,9 @@ struct xdp_buff; /* Additional register mappings for converted user programs. */ #define BPF_REG_A BPF_REG_0 #define BPF_REG_X BPF_REG_7 -#define BPF_REG_TMP BPF_REG_8 +#define BPF_REG_TMP BPF_REG_2 /* scratch reg */ +#define BPF_REG_D BPF_REG_8 /* data, callee-saved */ +#define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */ /* Kernel hidden auxiliary/helper register for hardening step. * Only used by eBPF JITs. It's nothing more than a temporary diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 90feeba3a1a1..1127552c8033 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -634,23 +634,6 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; - case BPF_LD | BPF_ABS | BPF_W: - case BPF_LD | BPF_ABS | BPF_H: - case BPF_LD | BPF_ABS | BPF_B: - *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); - *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); - *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); - break; - - case BPF_LD | BPF_IND | BPF_W: - case BPF_LD | BPF_IND | BPF_H: - case BPF_LD | BPF_IND | BPF_B: - *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); - *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); - *to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg); - *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0); - break; - case BPF_LD | BPF_IMM | BPF_DW: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); @@ -891,14 +874,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(LDX, MEM, W), \ INSN_3(LDX, MEM, DW), \ /* Immediate based. */ \ - INSN_3(LD, IMM, DW), \ - /* Misc (old cBPF carry-over). */ \ - INSN_3(LD, ABS, B), \ - INSN_3(LD, ABS, H), \ - INSN_3(LD, ABS, W), \ - INSN_3(LD, IND, B), \ - INSN_3(LD, IND, H), \ - INSN_3(LD, IND, W) + INSN_3(LD, IMM, DW) bool bpf_opcode_in_insntable(u8 code) { @@ -908,6 +884,13 @@ bool bpf_opcode_in_insntable(u8 code) [0 ... 255] = false, /* Now overwrite non-defaults ... */ BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), + /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */ + [BPF_LD | BPF_ABS | BPF_B] = true, + [BPF_LD | BPF_ABS | BPF_H] = true, + [BPF_LD | BPF_ABS | BPF_W] = true, + [BPF_LD | BPF_IND | BPF_B] = true, + [BPF_LD | BPF_IND | BPF_H] = true, + [BPF_LD | BPF_IND | BPF_W] = true, }; #undef BPF_INSN_3_TBL #undef BPF_INSN_2_TBL @@ -938,8 +921,6 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; - void *ptr; - int off; #define CONT ({ insn++; goto select_insn; }) #define CONT_JMP ({ insn++; goto select_insn; }) @@ -1266,67 +1247,6 @@ out: atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) (DST + insn->off)); CONT; - LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ - off = IMM; -load_word: - /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only - * appearing in the programs where ctx == skb - * (see may_access_skb() in the verifier). All programs - * keep 'ctx' in regs[BPF_REG_CTX] == BPF_R6, - * bpf_convert_filter() saves it in BPF_R6, internal BPF - * verifier will check that BPF_R6 == ctx. - * - * BPF_ABS and BPF_IND are wrappers of function calls, - * so they scratch BPF_R1-BPF_R5 registers, preserve - * BPF_R6-BPF_R9, and store return value into BPF_R0. - * - * Implicit input: - * ctx == skb == BPF_R6 == CTX - * - * Explicit input: - * SRC == any register - * IMM == 32-bit immediate - * - * Output: - * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness - */ - - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be32(ptr); - CONT; - } - - return 0; - LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ - off = IMM; -load_half: - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = get_unaligned_be16(ptr); - CONT; - } - - return 0; - LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ - off = IMM; -load_byte: - ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); - if (likely(ptr != NULL)) { - BPF_R0 = *(u8 *)ptr; - CONT; - } - - return 0; - LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_word; - LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ - off = IMM + SRC; - goto load_half; - LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ - off = IMM + SRC; - goto load_byte; default_label: /* If we ever reach this, we have a bug somewhere. Die hard here diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0d91f18b2eb5..6ba10a83909d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3884,6 +3884,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (!env->ops->gen_ld_abs) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } + if (env->subprog_cnt) { /* when program has LD_ABS insn JITs and interpreter assume * that r1 == ctx == skb which is not the case for callees @@ -5519,6 +5524,25 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + if (BPF_CLASS(insn->code) == BPF_LD && + (BPF_MODE(insn->code) == BPF_ABS || + BPF_MODE(insn->code) == BPF_IND)) { + cnt = env->ops->gen_ld_abs(insn, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; if (insn->src_reg == BPF_PSEUDO_CALL) diff --git a/net/core/filter.c b/net/core/filter.c index 865500f6180d..a49729842b3d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -162,6 +162,87 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; } +BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u8 tmp, *ptr; + const int len = sizeof(tmp); + + if (offset >= 0) { + if (headlen - offset >= len) + return *(u8 *)(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return tmp; + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return *(u8 *)ptr; + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u16 tmp, *ptr; + const int len = sizeof(tmp); + + if (offset >= 0) { + if (headlen - offset >= len) + return get_unaligned_be16(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be16_to_cpu(tmp); + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return get_unaligned_be16(ptr); + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u32 tmp, *ptr; + const int len = sizeof(tmp); + + if (likely(offset >= 0)) { + if (headlen - offset >= len) + return get_unaligned_be32(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be32_to_cpu(tmp); + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return get_unaligned_be32(ptr); + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len, + offset); +} + BPF_CALL_0(bpf_get_raw_cpu_id) { return raw_smp_processor_id(); @@ -354,26 +435,87 @@ static bool convert_bpf_extensions(struct sock_filter *fp, return true; } +static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) +{ + const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS); + int size = bpf_size_to_bytes(BPF_SIZE(fp->code)); + bool endian = BPF_SIZE(fp->code) == BPF_H || + BPF_SIZE(fp->code) == BPF_W; + bool indirect = BPF_MODE(fp->code) == BPF_IND; + const int ip_align = NET_IP_ALIGN; + struct bpf_insn *insn = *insnp; + int offset = fp->k; + + if (!indirect && + ((unaligned_ok && offset >= 0) || + (!unaligned_ok && offset >= 0 && + offset + ip_align >= 0 && + offset + ip_align % size == 0))) { + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); + *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); + *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian); + *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D, + offset); + if (endian) + *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); + *insn++ = BPF_JMP_A(8); + } + + *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); + *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D); + *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H); + if (!indirect) { + *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X); + if (fp->k) + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset); + } + + switch (BPF_SIZE(fp->code)) { + case BPF_B: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8); + break; + case BPF_H: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16); + break; + case BPF_W: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32); + break; + default: + return false; + } + + *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn = BPF_EXIT_INSN(); + + *insnp = insn; + return true; +} + /** * bpf_convert_filter - convert filter program * @prog: the user passed filter program * @len: the length of the user passed filter program * @new_prog: allocated 'struct bpf_prog' or NULL * @new_len: pointer to store length of converted program + * @seen_ld_abs: bool whether we've seen ld_abs/ind * * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' * style extended BPF (eBPF). * Conversion workflow: * * 1) First pass for calculating the new program length: - * bpf_convert_filter(old_prog, old_len, NULL, &new_len) + * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs) * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: - * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); + * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs) */ static int bpf_convert_filter(struct sock_filter *prog, int len, - struct bpf_prog *new_prog, int *new_len) + struct bpf_prog *new_prog, int *new_len, + bool *seen_ld_abs) { int new_flen = 0, pass = 0, target, i, stack_off; struct bpf_insn *new_insn, *first_insn = NULL; @@ -412,12 +554,27 @@ do_pass: * do this ourself. Initial CTX is present in BPF_REG_ARG1. */ *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); + if (*seen_ld_abs) { + /* For packet access in classic BPF, cache skb->data + * in callee-saved BPF R8 and skb->len - skb->data_len + * (headlen) in BPF R9. Since classic BPF is read-only + * on CTX, we only need to cache it once. + */ + *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + BPF_REG_D, BPF_REG_CTX, + offsetof(struct sk_buff, data)); + *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX, + offsetof(struct sk_buff, len)); + *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX, + offsetof(struct sk_buff, data_len)); + *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP); + } } else { new_insn += 3; } for (i = 0; i < len; fp++, i++) { - struct bpf_insn tmp_insns[6] = { }; + struct bpf_insn tmp_insns[32] = { }; struct bpf_insn *insn = tmp_insns; if (addrs) @@ -460,6 +617,11 @@ do_pass: BPF_MODE(fp->code) == BPF_ABS && convert_bpf_extensions(fp, &insn)) break; + if (BPF_CLASS(fp->code) == BPF_LD && + convert_bpf_ld_abs(fp, &insn)) { + *seen_ld_abs = true; + break; + } if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { @@ -562,21 +724,31 @@ jmp_rest: break; /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ - case BPF_LDX | BPF_MSH | BPF_B: - /* tmp = A */ - *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); + case BPF_LDX | BPF_MSH | BPF_B: { + struct sock_filter tmp = { + .code = BPF_LD | BPF_ABS | BPF_B, + .k = fp->k, + }; + + *seen_ld_abs = true; + + /* X = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = BPF_R0 = *(u8 *) (skb->data + K) */ - *insn++ = BPF_LD_ABS(BPF_B, fp->k); + convert_bpf_ld_abs(&tmp, &insn); + insn++; /* A &= 0xf */ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); /* A <<= 2 */ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); + /* tmp = X */ + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X); /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = tmp */ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; - + } /* RET_K is remaped into 2 insns. RET_A case doesn't need an * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. */ @@ -658,6 +830,8 @@ jmp_rest: if (!new_prog) { /* Only calculating new length. */ *new_len = new_insn - first_insn; + if (*seen_ld_abs) + *new_len += 4; /* Prologue bits. */ return 0; } @@ -1019,6 +1193,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) struct sock_filter *old_prog; struct bpf_prog *old_fp; int err, new_len, old_len = fp->len; + bool seen_ld_abs = false; /* We are free to overwrite insns et al right here as it * won't be used at this point in time anymore internally @@ -1040,7 +1215,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) } /* 1st pass: calculate the new program length. */ - err = bpf_convert_filter(old_prog, old_len, NULL, &new_len); + err = bpf_convert_filter(old_prog, old_len, NULL, &new_len, + &seen_ld_abs); if (err) goto out_err_free; @@ -1059,7 +1235,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) fp->len = new_len; /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ - err = bpf_convert_filter(old_prog, old_len, fp, &new_len); + err = bpf_convert_filter(old_prog, old_len, fp, &new_len, + &seen_ld_abs); if (err) /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, @@ -4330,6 +4507,41 @@ static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, return insn - insn_buf; } +static int bpf_gen_ld_abs(const struct bpf_insn *orig, + struct bpf_insn *insn_buf) +{ + bool indirect = BPF_MODE(orig->code) == BPF_IND; + struct bpf_insn *insn = insn_buf; + + /* We're guaranteed here that CTX is in R6. */ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); + if (!indirect) { + *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg); + if (orig->imm) + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm); + } + + switch (BPF_SIZE(orig->code)) { + case BPF_B: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache); + break; + case BPF_H: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache); + break; + case BPF_W: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache); + break; + } + + *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); + *insn++ = BPF_EXIT_INSN(); + + return insn - insn_buf; +} + static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { @@ -5599,6 +5811,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, + .gen_ld_abs = bpf_gen_ld_abs, }; const struct bpf_prog_ops sk_filter_prog_ops = { @@ -5610,6 +5823,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, + .gen_ld_abs = bpf_gen_ld_abs, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { -- cgit v1.2.3 From 4e1ec56cdc59746943b2acfab3c171b930187bbe Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 May 2018 01:08:15 +0200 Subject: bpf: add skb_load_bytes_relative helper This adds a small BPF helper similar to bpf_skb_load_bytes() that is able to load relative to mac/net header offset from the skb's linear data. Compared to bpf_skb_load_bytes(), it takes a fifth argument namely start_header, which is either BPF_HDR_START_MAC or BPF_HDR_START_NET. This allows for a more flexible alternative compared to LD_ABS/LD_IND with negative offset. It's enabled for tc BPF programs as well as sock filter program types where it's mainly useful in reuseport programs to ease access to lower header data. Reference: https://lists.iovisor.org/pipermail/iovisor-dev/2017-March/000698.html Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 33 ++++++++++++++++++++++++++++++++- net/core/filter.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a3a495052511..93d5a4eeec2a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1802,6 +1802,30 @@ union bpf_attr { * Return * a non-negative value equal to or less than size on success, or * a negative error in case of failure. + * + * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) + * Description + * This helper is similar to **bpf_skb_load_bytes**\ () in that + * it provides an easy way to load *len* bytes from *offset* + * from the packet associated to *skb*, into the buffer pointed + * by *to*. The difference to **bpf_skb_load_bytes**\ () is that + * a fifth argument *start_header* exists in order to select a + * base offset to start from. *start_header* can be one of: + * + * **BPF_HDR_START_MAC** + * Base offset to load data from is *skb*'s mac header. + * **BPF_HDR_START_NET** + * Base offset to load data from is *skb*'s network header. + * + * In general, "direct packet access" is the preferred method to + * access packet data, however, this helper is in particular useful + * in socket filters where *skb*\ **->data** does not always point + * to the start of the mac header and where "direct packet access" + * is not available. + * + * Return + * 0 on success, or a negative error in case of failure. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1871,7 +1895,8 @@ union bpf_attr { FN(bind), \ FN(xdp_adjust_tail), \ FN(skb_get_xfrm_state), \ - FN(get_stack), + FN(get_stack), \ + FN(skb_load_bytes_relative), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -1932,6 +1957,12 @@ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, }; +/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ +enum bpf_hdr_start_off { + BPF_HDR_START_MAC, + BPF_HDR_START_NET, +}; + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ diff --git a/net/core/filter.c b/net/core/filter.c index a49729842b3d..6877426c23a6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1684,6 +1684,47 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, + u32, offset, void *, to, u32, len, u32, start_header) +{ + u8 *ptr; + + if (unlikely(offset > 0xffff || len > skb_headlen(skb))) + goto err_clear; + + switch (start_header) { + case BPF_HDR_START_MAC: + ptr = skb_mac_header(skb) + offset; + break; + case BPF_HDR_START_NET: + ptr = skb_network_header(skb) + offset; + break; + default: + goto err_clear; + } + + if (likely(ptr >= skb_mac_header(skb) && + ptr + len <= skb_tail_pointer(skb))) { + memcpy(to, ptr, len); + return 0; + } + +err_clear: + memset(to, 0, len); + return -EFAULT; +} + +static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = { + .func = bpf_skb_load_bytes_relative, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) { /* Idea is the following: should the needed direct read/write @@ -4061,6 +4102,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: @@ -4078,6 +4121,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_load_bytes_relative: + return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_skb_pull_data: return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: -- cgit v1.2.3