author	Jakub Kicinski	2020-12-04 07:48:11 -0800
committer	Jakub Kicinski	2020-12-04 07:48:12 -0800
commit	a1dd1d86973182458da7798a95f26cfcbea599b4 (patch)
tree	1adda22ea30ccfac7651a7eed7b7c90356f8243a /net/core
parent	55fd59b003f6e8fd88cf16590e79823d7ccf3026 (diff)
parent	eceae70bdeaeb6b8ceb662983cf663ff352fbc96 (diff)
Merge https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Alexei Starovoitov says:

====================
pull-request: bpf-next 2020-12-03

The main changes are:

1) Support BTF in kernel modules, from Andrii.

2) Introduce preferred busy-polling, from Björn.

3) bpf_ima_inode_hash() and bpf_bprm_opts_set() helpers, from KP Singh.

4) Memcg-based memory accounting for bpf objects, from Roman.

5) Allow bpf_{s,g}etsockopt from cgroup bind{4,6} hooks, from Stanislav.

* https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (118 commits)
  selftests/bpf: Fix invalid use of strncat in test_sockmap
  libbpf: Use memcpy instead of strncpy to please GCC
  selftests/bpf: Add fentry/fexit/fmod_ret selftest for kernel module
  selftests/bpf: Add tp_btf CO-RE reloc test for modules
  libbpf: Support attachment of BPF tracing programs to kernel modules
  libbpf: Factor out low-level BPF program loading helper
  bpf: Allow to specify kernel module BTFs when attaching BPF programs
  bpf: Remove hard-coded btf_vmlinux assumption from BPF verifier
  selftests/bpf: Add CO-RE relocs selftest relying on kernel module BTF
  selftests/bpf: Add support for marking sub-tests as skipped
  selftests/bpf: Add bpf_testmod kernel module for testing
  libbpf: Add kernel module BTF support for CO-RE relocations
  libbpf: Refactor CO-RE relocs to not assume a single BTF object
  libbpf: Add internal helper to load BTF data by FD
  bpf: Keep module's btf_data_size intact after load
  bpf: Fix bpf_put_raw_tracepoint()'s use of __module_address()
  selftests/bpf: Add Userspace tests for TCP_WINDOW_CLAMP
  bpf: Adds support for setting window clamp
  samples/bpf: Fix spelling mistake "recieving" -> "receiving"
  bpf: Fix cold build of test_progs-no_alu32
  ...
====================

Link: https://lore.kernel.org/r/20201204021936.85653-1-alexei.starovoitov@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
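For item 2 above, the user-visible knobs are two new socket options, SO_PREFER_BUSY_POLL and SO_BUSY_POLL_BUDGET. Below is a minimal userspace sketch, not part of this series: it assumes uapi/libc headers that already expose the new options (the fallback values mirror asm-generic/socket.h from this series, so verify them against your headers), and, per the sock.c hunk further down, enabling the preference or raising the budget requires CAP_NET_ADMIN.

#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_PREFER_BUSY_POLL
#define SO_PREFER_BUSY_POLL 69		/* assumed value, check asm-generic/socket.h */
#endif
#ifndef SO_BUSY_POLL_BUDGET
#define SO_BUSY_POLL_BUDGET 70		/* assumed value, check asm-generic/socket.h */
#endif

/* Opt a socket into preferred busy polling with an explicit budget. */
static int enable_preferred_busy_poll(int fd)
{
	int usecs = 50;		/* classic SO_BUSY_POLL timeout, in microseconds */
	int one = 1;		/* enabling the preference requires CAP_NET_ADMIN */
	int budget = 64;	/* raising it above the current value requires CAP_NET_ADMIN */

	if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usecs, sizeof(usecs)) ||
	    setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &one, sizeof(one)) ||
	    setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET, &budget, sizeof(budget))) {
		perror("setsockopt");
		return -1;
	}
	return 0;
}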
Diffstat (limited to 'net/core')
-rw-r--r--	net/core/bpf_sk_storage.c	  4
-rw-r--r--	net/core/dev.c	 89
-rw-r--r--	net/core/filter.c	  7
-rw-r--r--	net/core/sock.c	 19
-rw-r--r--	net/core/sock_map.c	 42
-rw-r--r--	net/core/xdp.c	  3
6 files changed, 109 insertions, 55 deletions
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 359908a7d3c1..a32037daa933 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -415,7 +415,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
void *, value, u64, flags)
{
- if (!in_serving_softirq() && !in_task())
+ if (in_irq() || in_nmi())
return (unsigned long)NULL;
return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags);
@@ -424,7 +424,7 @@ BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,
struct sock *, sk)
{
- if (!in_serving_softirq() && !in_task())
+ if (in_irq() || in_nmi())
return -EPERM;
return ____bpf_sk_storage_delete(map, sk);
diff --git a/net/core/dev.c b/net/core/dev.c
index e3f998d5c15c..ce8fea2e2788 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6448,7 +6448,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
- new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
+ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
+ NAPIF_STATE_PREFER_BUSY_POLL);
/* If STATE_MISSED was set, leave STATE_SCHED set,
* because we will call napi->poll() one more time.
@@ -6485,10 +6486,30 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
#if defined(CONFIG_NET_RX_BUSY_POLL)
-#define BUSY_POLL_BUDGET 8
+static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
+{
+ if (!skip_schedule) {
+ gro_normal_list(napi);
+ __napi_schedule(napi);
+ return;
+ }
+
+ if (napi->gro_bitmask) {
+ /* flush too old packets
+ * If HZ < 1000, flush all packets.
+ */
+ napi_gro_flush(napi, HZ >= 1000);
+ }
-static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
+ gro_normal_list(napi);
+ clear_bit(NAPI_STATE_SCHED, &napi->state);
+}
+
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
+ u16 budget)
{
+ bool skip_schedule = false;
+ unsigned long timeout;
int rc;
/* Busy polling means there is a high chance device driver hard irq
@@ -6505,29 +6526,33 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
local_bh_disable();
+ if (prefer_busy_poll) {
+ napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
+ timeout = READ_ONCE(napi->dev->gro_flush_timeout);
+ if (napi->defer_hard_irqs_count && timeout) {
+ hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
+ skip_schedule = true;
+ }
+ }
+
/* All we really want here is to re-enable device interrupts.
* Ideally, a new ndo_busy_poll_stop() could avoid another round.
*/
- rc = napi->poll(napi, BUSY_POLL_BUDGET);
+ rc = napi->poll(napi, budget);
/* We can't gro_normal_list() here, because napi->poll() might have
* rearmed the napi (napi_complete_done()) in which case it could
* already be running on another CPU.
*/
- trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
+ trace_napi_poll(napi, rc, budget);
netpoll_poll_unlock(have_poll_lock);
- if (rc == BUSY_POLL_BUDGET) {
- /* As the whole budget was spent, we still own the napi so can
- * safely handle the rx_list.
- */
- gro_normal_list(napi);
- __napi_schedule(napi);
- }
+ if (rc == budget)
+ __busy_poll_stop(napi, skip_schedule);
local_bh_enable();
}
void napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long),
- void *loop_end_arg)
+ void *loop_end_arg, bool prefer_busy_poll, u16 budget)
{
unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
int (*napi_poll)(struct napi_struct *napi, int budget);
@@ -6555,17 +6580,23 @@ restart:
* we avoid dirtying napi->state as much as we can.
*/
if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
- NAPIF_STATE_IN_BUSY_POLL))
+ NAPIF_STATE_IN_BUSY_POLL)) {
+ if (prefer_busy_poll)
+ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
+ }
if (cmpxchg(&napi->state, val,
val | NAPIF_STATE_IN_BUSY_POLL |
- NAPIF_STATE_SCHED) != val)
+ NAPIF_STATE_SCHED) != val) {
+ if (prefer_busy_poll)
+ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
+ }
have_poll_lock = netpoll_poll_lock(napi);
napi_poll = napi->poll;
}
- work = napi_poll(napi, BUSY_POLL_BUDGET);
- trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
+ work = napi_poll(napi, budget);
+ trace_napi_poll(napi, work, budget);
gro_normal_list(napi);
count:
if (work > 0)
@@ -6578,7 +6609,7 @@ count:
if (unlikely(need_resched())) {
if (napi_poll)
- busy_poll_stop(napi, have_poll_lock);
+ busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
preempt_enable();
rcu_read_unlock();
cond_resched();
@@ -6589,7 +6620,7 @@ count:
cpu_relax();
}
if (napi_poll)
- busy_poll_stop(napi, have_poll_lock);
+ busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
preempt_enable();
out:
rcu_read_unlock();
@@ -6640,8 +6671,10 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
* NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/
if (!napi_disable_pending(napi) &&
- !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
+ !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
+ clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
__napi_schedule_irqoff(napi);
+ }
return HRTIMER_NORESTART;
}
@@ -6699,6 +6732,7 @@ void napi_disable(struct napi_struct *n)
hrtimer_cancel(&n->timer);
+ clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
@@ -6771,6 +6805,19 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
goto out_unlock;
}
+ /* The NAPI context has more processing work, but busy-polling
+ * is preferred. Exit early.
+ */
+ if (napi_prefer_busy_poll(n)) {
+ if (napi_complete_done(n, work)) {
+ /* If timeout is not set, we need to make sure
+ * that the NAPI is re-scheduled.
+ */
+ napi_schedule(n);
+ }
+ goto out_unlock;
+ }
+
if (n->gro_bitmask) {
/* flush too old packets
* If HZ < 1000, flush all packets.
@@ -9753,7 +9800,7 @@ static int netif_alloc_rx_queues(struct net_device *dev)
rx[i].dev = dev;
/* XDP RX-queue setup */
- err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
+ err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
if (err < 0)
goto err_rxq_info;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index 2ca5eecebacf..77001a35768f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4910,6 +4910,9 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
tp->notsent_lowat = val;
sk->sk_write_space(sk);
break;
+ case TCP_WINDOW_CLAMP:
+ ret = tcp_set_window_clamp(sk, val);
+ break;
default:
ret = -EINVAL;
}
@@ -6995,6 +6998,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_storage_delete_proto;
case BPF_FUNC_setsockopt:
switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
return &bpf_sock_addr_setsockopt_proto;
@@ -7003,6 +7008,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
case BPF_FUNC_getsockopt:
switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
return &bpf_sock_addr_getsockopt_proto;
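The filter.c hunks above do two things: _bpf_setsockopt() learns TCP_WINDOW_CLAMP, and the setsockopt/getsockopt helpers become callable from cgroup bind4/bind6 programs. As a hedged illustration only (the section name and attach flow assume a reasonably recent libbpf, and the clamp value is arbitrary), a bind4 program could now look like this:

/* Sketch only: clamp the TCP receive window from a cgroup/bind4 hook. */
#include <linux/bpf.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <bpf/bpf_helpers.h>

#ifndef SOL_TCP
#define SOL_TCP IPPROTO_TCP	/* both are 6 */
#endif

SEC("cgroup/bind4")
int clamp_on_bind(struct bpf_sock_addr *ctx)
{
	int clamp = 65535;	/* arbitrary example value */

	if (ctx->protocol == IPPROTO_TCP)
		bpf_setsockopt(ctx, SOL_TCP, TCP_WINDOW_CLAMP,
			       &clamp, sizeof(clamp));
	return 1;	/* let the bind() proceed */
}

char _license[] SEC("license") = "GPL";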
diff --git a/net/core/sock.c b/net/core/sock.c
index f0f096852876..4fd7e785f177 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1159,6 +1159,22 @@ set_sndbuf:
sk->sk_ll_usec = val;
}
break;
+ case SO_PREFER_BUSY_POLL:
+ if (valbool && !capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ else
+ WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
+ break;
+ case SO_BUSY_POLL_BUDGET:
+ if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ } else {
+ if (val < 0 || val > U16_MAX)
+ ret = -EINVAL;
+ else
+ WRITE_ONCE(sk->sk_busy_poll_budget, val);
+ }
+ break;
#endif
case SO_MAX_PACING_RATE:
@@ -1523,6 +1539,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
case SO_BUSY_POLL:
v.val = sk->sk_ll_usec;
break;
+ case SO_PREFER_BUSY_POLL:
+ v.val = READ_ONCE(sk->sk_prefer_busy_poll);
+ break;
#endif
case SO_MAX_PACING_RATE:
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index ddc899e83313..64b5ec14ff50 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -27,8 +27,6 @@ struct bpf_stab {
static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
{
struct bpf_stab *stab;
- u64 cost;
- int err;
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
@@ -39,29 +37,22 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
- stab = kzalloc(sizeof(*stab), GFP_USER);
+ stab = kzalloc(sizeof(*stab), GFP_USER | __GFP_ACCOUNT);
if (!stab)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&stab->map, attr);
raw_spin_lock_init(&stab->lock);
- /* Make sure page count doesn't overflow. */
- cost = (u64) stab->map.max_entries * sizeof(struct sock *);
- err = bpf_map_charge_init(&stab->map.memory, cost);
- if (err)
- goto free_stab;
-
stab->sks = bpf_map_area_alloc(stab->map.max_entries *
sizeof(struct sock *),
stab->map.numa_node);
- if (stab->sks)
- return &stab->map;
- err = -ENOMEM;
- bpf_map_charge_finish(&stab->map.memory);
-free_stab:
- kfree(stab);
- return ERR_PTR(err);
+ if (!stab->sks) {
+ kfree(stab);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return &stab->map;
}
int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
@@ -975,8 +966,9 @@ static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab,
}
}
- new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
- htab->map.numa_node);
+ new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
+ GFP_ATOMIC | __GFP_NOWARN,
+ htab->map.numa_node);
if (!new) {
atomic_dec(&htab->count);
return ERR_PTR(-ENOMEM);
@@ -1103,7 +1095,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
{
struct bpf_shtab *htab;
int i, err;
- u64 cost;
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
@@ -1116,7 +1107,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
if (attr->key_size > MAX_BPF_STACK)
return ERR_PTR(-E2BIG);
- htab = kzalloc(sizeof(*htab), GFP_USER);
+ htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT);
if (!htab)
return ERR_PTR(-ENOMEM);
@@ -1131,21 +1122,10 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
goto free_htab;
}
- cost = (u64) htab->buckets_num * sizeof(struct bpf_shtab_bucket) +
- (u64) htab->elem_size * htab->map.max_entries;
- if (cost >= U32_MAX - PAGE_SIZE) {
- err = -EINVAL;
- goto free_htab;
- }
- err = bpf_map_charge_init(&htab->map.memory, cost);
- if (err)
- goto free_htab;
-
htab->buckets = bpf_map_area_alloc(htab->buckets_num *
sizeof(struct bpf_shtab_bucket),
htab->map.numa_node);
if (!htab->buckets) {
- bpf_map_charge_finish(&htab->map.memory);
err = -ENOMEM;
goto free_htab;
}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 3d330ebda893..17ffd33c6b18 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -158,7 +158,7 @@ static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)
/* Returns 0 on success, negative on failure */
int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
- struct net_device *dev, u32 queue_index)
+ struct net_device *dev, u32 queue_index, unsigned int napi_id)
{
if (xdp_rxq->reg_state == REG_STATE_UNUSED) {
WARN(1, "Driver promised not to register this");
@@ -179,6 +179,7 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
xdp_rxq_info_init(xdp_rxq);
xdp_rxq->dev = dev;
xdp_rxq->queue_index = queue_index;
+ xdp_rxq->napi_id = napi_id;
xdp_rxq->reg_state = REG_STATE_REGISTERED;
return 0;
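The xdp.c change extends xdp_rxq_info_reg() with a napi_id argument so an RX queue's XDP info can point back at its NAPI instance for busy polling; netif_alloc_rx_queues() in the dev.c hunk above passes 0. A hypothetical driver-side sketch of the updated call follows; the ring structure and its field names are illustrative, not taken from any real driver:

#include <linux/netdevice.h>
#include <net/xdp.h>

/* Illustrative only: a minimal ring holding the fields the call needs. */
struct example_ring {
	struct net_device *netdev;
	struct napi_struct napi;
	struct xdp_rxq_info xdp_rxq;
	u16 queue_index;
};

static int example_setup_rx_ring(struct example_ring *ring)
{
	int err;

	/* Pass the queue's NAPI id so busy polling can find the right
	 * napi_struct; drivers without a NAPI context can pass 0.
	 */
	err = xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
			       ring->queue_index, ring->napi.napi_id);
	if (err)
		return err;

	return xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
					  MEM_TYPE_PAGE_SHARED, NULL);
}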