diff options
Diffstat (limited to 'kernel/bpf')
-rw-r--r-- | kernel/bpf/Makefile | 3 | ||||
-rw-r--r-- | kernel/bpf/arraymap.c | 12 | ||||
-rw-r--r-- | kernel/bpf/bloom_filter.c | 29 | ||||
-rw-r--r-- | kernel/bpf/bpf_cgrp_storage.c | 23 | ||||
-rw-r--r-- | kernel/bpf/bpf_inode_storage.c | 22 | ||||
-rw-r--r-- | kernel/bpf/bpf_iter.c | 70 | ||||
-rw-r--r-- | kernel/bpf/bpf_local_storage.c | 382 | ||||
-rw-r--r-- | kernel/bpf/bpf_struct_ops.c | 260 | ||||
-rw-r--r-- | kernel/bpf/bpf_task_storage.c | 27 | ||||
-rw-r--r-- | kernel/bpf/btf.c | 279 | ||||
-rw-r--r-- | kernel/bpf/cpumap.c | 8 | ||||
-rw-r--r-- | kernel/bpf/cpumask.c | 43 | ||||
-rw-r--r-- | kernel/bpf/devmap.c | 24 | ||||
-rw-r--r-- | kernel/bpf/hashtab.c | 38 | ||||
-rw-r--r-- | kernel/bpf/helpers.c | 139 | ||||
-rw-r--r-- | kernel/bpf/local_storage.c | 6 | ||||
-rw-r--r-- | kernel/bpf/log.c | 330 | ||||
-rw-r--r-- | kernel/bpf/lpm_trie.c | 6 | ||||
-rw-r--r-- | kernel/bpf/memalloc.c | 59 | ||||
-rw-r--r-- | kernel/bpf/queue_stack_maps.c | 22 | ||||
-rw-r--r-- | kernel/bpf/reuseport_array.c | 2 | ||||
-rw-r--r-- | kernel/bpf/ringbuf.c | 6 | ||||
-rw-r--r-- | kernel/bpf/stackmap.c | 6 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 112 | ||||
-rw-r--r-- | kernel/bpf/trampoline.c | 28 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 1096 |
26 files changed, 2246 insertions, 786 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 02242614dcc7..1d3892168d32 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,8 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 1588c793a715..2058e89b5ddd 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -307,8 +307,8 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key } /* Called from syscall or from eBPF program */ -static int array_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long array_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; @@ -386,7 +386,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, } /* Called from syscall or from eBPF program */ -static int array_map_delete_elem(struct bpf_map *map, void *key) +static long array_map_delete_elem(struct bpf_map *map, void *key) { return -EINVAL; } @@ -686,8 +686,8 @@ static const struct bpf_iter_seq_info iter_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info), }; -static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn, - void *callback_ctx, u64 flags) +static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn, + void *callback_ctx, u64 flags) { u32 i, key, num_elems = 0; struct bpf_array *array; @@ -871,7 +871,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, return 0; } -static int fd_array_map_delete_elem(struct bpf_map *map, void *key) +static long fd_array_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *old_ptr; diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index 6350c5d35a9b..540331b610a9 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -16,13 +16,6 @@ struct bpf_bloom_filter { struct bpf_map map; u32 bitset_mask; u32 hash_seed; - /* If the size of the values in the bloom filter is u32 aligned, - * then it is more performant to use jhash2 as the underlying hash - * function, else we use jhash. This tracks the number of u32s - * in an u32-aligned value size. If the value size is not u32 aligned, - * this will be 0. - */ - u32 aligned_u32_count; u32 nr_hash_funcs; unsigned long bitset[]; }; @@ -32,16 +25,15 @@ static u32 hash(struct bpf_bloom_filter *bloom, void *value, { u32 h; - if (bloom->aligned_u32_count) - h = jhash2(value, bloom->aligned_u32_count, - bloom->hash_seed + index); + if (likely(value_size % 4 == 0)) + h = jhash2(value, value_size / 4, bloom->hash_seed + index); else h = jhash(value, value_size, bloom->hash_seed + index); return h & bloom->bitset_mask; } -static int bloom_map_peek_elem(struct bpf_map *map, void *value) +static long bloom_map_peek_elem(struct bpf_map *map, void *value) { struct bpf_bloom_filter *bloom = container_of(map, struct bpf_bloom_filter, map); @@ -56,7 +48,7 @@ static int bloom_map_peek_elem(struct bpf_map *map, void *value) return 0; } -static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags) +static long bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags) { struct bpf_bloom_filter *bloom = container_of(map, struct bpf_bloom_filter, map); @@ -73,12 +65,12 @@ static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags) return 0; } -static int bloom_map_pop_elem(struct bpf_map *map, void *value) +static long bloom_map_pop_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; } -static int bloom_map_delete_elem(struct bpf_map *map, void *value) +static long bloom_map_delete_elem(struct bpf_map *map, void *value) { return -EOPNOTSUPP; } @@ -152,11 +144,6 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) bloom->nr_hash_funcs = nr_hash_funcs; bloom->bitset_mask = bitset_mask; - /* Check whether the value size is u32-aligned */ - if ((attr->value_size & (sizeof(u32) - 1)) == 0) - bloom->aligned_u32_count = - attr->value_size / sizeof(u32); - if (!(attr->map_flags & BPF_F_ZERO_SEED)) bloom->hash_seed = get_random_u32(); @@ -177,8 +164,8 @@ static void *bloom_map_lookup_elem(struct bpf_map *map, void *key) return ERR_PTR(-EINVAL); } -static int bloom_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) +static long bloom_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) { /* The eBPF program should use map_push_elem instead */ return -EINVAL; diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 9ae07aedaf23..d44fe8dd9732 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -46,8 +46,6 @@ static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner) void bpf_cgrp_storage_free(struct cgroup *cgroup) { struct bpf_local_storage *local_storage; - bool free_cgroup_storage = false; - unsigned long flags; rcu_read_lock(); local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); @@ -57,14 +55,9 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup) } bpf_cgrp_storage_lock(); - raw_spin_lock_irqsave(&local_storage->lock, flags); - free_cgroup_storage = bpf_local_storage_unlink_nolock(local_storage); - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_local_storage_destroy(local_storage); bpf_cgrp_storage_unlock(); rcu_read_unlock(); - - if (free_cgroup_storage) - kfree_rcu(local_storage, rcu); } static struct bpf_local_storage_data * @@ -100,8 +93,8 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) return sdata ? sdata->data : NULL; } -static int bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct cgroup *cgroup; @@ -128,11 +121,11 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), true); + bpf_selem_unlink(SELEM(sdata), false); return 0; } -static int bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) +static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) { struct cgroup *cgroup; int err, fd; @@ -156,7 +149,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { - return bpf_local_storage_map_alloc(attr, &cgroup_cache); + return bpf_local_storage_map_alloc(attr, &cgroup_cache, true); } static void cgroup_storage_map_free(struct bpf_map *map) @@ -231,7 +224,7 @@ const struct bpf_func_proto bpf_cgrp_storage_get_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_cgroup_btf_id[0], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, @@ -242,6 +235,6 @@ const struct bpf_func_proto bpf_cgrp_storage_delete_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_cgroup_btf_id[0], }; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 43e2619c8167..a4d93df78c75 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -57,7 +57,6 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode, void bpf_inode_storage_free(struct inode *inode) { struct bpf_local_storage *local_storage; - bool free_inode_storage = false; struct bpf_storage_blob *bsb; bsb = bpf_inode(inode); @@ -72,13 +71,8 @@ void bpf_inode_storage_free(struct inode *inode) return; } - raw_spin_lock_bh(&local_storage->lock); - free_inode_storage = bpf_local_storage_unlink_nolock(local_storage); - raw_spin_unlock_bh(&local_storage->lock); + bpf_local_storage_destroy(local_storage); rcu_read_unlock(); - - if (free_inode_storage) - kfree_rcu(local_storage, rcu); } static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) @@ -97,8 +91,8 @@ static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key) return sdata ? sdata->data : NULL; } -static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct file *f; @@ -128,12 +122,12 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map) if (!sdata) return -ENOENT; - bpf_selem_unlink(SELEM(sdata), true); + bpf_selem_unlink(SELEM(sdata), false); return 0; } -static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) +static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) { struct file *f; int fd, err; @@ -205,7 +199,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) { - return bpf_local_storage_map_alloc(attr, &inode_cache); + return bpf_local_storage_map_alloc(attr, &inode_cache, false); } static void inode_storage_map_free(struct bpf_map *map) @@ -235,7 +229,7 @@ const struct bpf_func_proto bpf_inode_storage_get_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_inode_storage_btf_ids[0], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, @@ -246,6 +240,6 @@ const struct bpf_func_proto bpf_inode_storage_delete_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_inode_storage_btf_ids[0], }; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 5dc307bdeaeb..96856f130cbf 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -776,3 +776,73 @@ const struct bpf_func_proto bpf_loop_proto = { .arg3_type = ARG_PTR_TO_STACK_OR_NULL, .arg4_type = ARG_ANYTHING, }; + +struct bpf_iter_num_kern { + int cur; /* current value, inclusive */ + int end; /* final value, exclusive */ +} __aligned(8); + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); + +__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) +{ + struct bpf_iter_num_kern *s = (void *)it; + + BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num)); + BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num)); + + BTF_TYPE_EMIT(struct btf_iter_num); + + /* start == end is legit, it's an empty range and we'll just get NULL + * on first (and any subsequent) bpf_iter_num_next() call + */ + if (start > end) { + s->cur = s->end = 0; + return -EINVAL; + } + + /* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */ + if ((s64)end - (s64)start > BPF_MAX_LOOPS) { + s->cur = s->end = 0; + return -E2BIG; + } + + /* user will call bpf_iter_num_next() first, + * which will set s->cur to exactly start value; + * underflow shouldn't matter + */ + s->cur = start - 1; + s->end = end; + + return 0; +} + +__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num* it) +{ + struct bpf_iter_num_kern *s = (void *)it; + + /* check failed initialization or if we are done (same behavior); + * need to be careful about overflow, so convert to s64 for checks, + * e.g., if s->cur == s->end == INT_MAX, we can't just do + * s->cur + 1 >= s->end + */ + if ((s64)(s->cur + 1) >= s->end) { + s->cur = s->end = 0; + return NULL; + } + + s->cur++; + + return &s->cur; +} + +__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it) +{ + struct bpf_iter_num_kern *s = (void *)it; + + s->cur = s->end = 0; +} + +__diag_pop(); diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index d3ba3f2db640..47d9948d768f 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -80,8 +80,24 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, if (charge_mem && mem_charge(smap, owner, smap->elem_size)) return NULL; - selem = bpf_map_kzalloc(&smap->map, smap->elem_size, - gfp_flags | __GFP_NOWARN); + if (smap->bpf_ma) { + migrate_disable(); + selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags); + migrate_enable(); + if (selem) + /* Keep the original bpf_map_kzalloc behavior + * before started using the bpf_mem_cache_alloc. + * + * No need to use zero_map_value. The bpf_selem_free() + * only does bpf_mem_cache_free when there is + * no other bpf prog is using the selem. + */ + memset(SDATA(selem)->data, 0, smap->map.value_size); + } else { + selem = bpf_map_kzalloc(&smap->map, smap->elem_size, + gfp_flags | __GFP_NOWARN); + } + if (selem) { if (value) copy_map_value(&smap->map, SDATA(selem)->data, value); @@ -95,7 +111,8 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, return NULL; } -void bpf_local_storage_free_rcu(struct rcu_head *rcu) +/* rcu tasks trace callback for bpf_ma == false */ +static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; @@ -109,28 +126,66 @@ void bpf_local_storage_free_rcu(struct rcu_head *rcu) kfree_rcu(local_storage, rcu); } -static void bpf_selem_free_fields_rcu(struct rcu_head *rcu) +static void bpf_local_storage_free_rcu(struct rcu_head *rcu) { - struct bpf_local_storage_elem *selem; - struct bpf_local_storage_map *smap; + struct bpf_local_storage *local_storage; - selem = container_of(rcu, struct bpf_local_storage_elem, rcu); - /* protected by the rcu_barrier*() */ - smap = rcu_dereference_protected(SDATA(selem)->smap, true); - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - kfree(selem); + local_storage = container_of(rcu, struct bpf_local_storage, rcu); + bpf_mem_cache_raw_free(local_storage); } -static void bpf_selem_free_fields_trace_rcu(struct rcu_head *rcu) +static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { - /* Free directly if Tasks Trace RCU GP also implies RCU GP */ if (rcu_trace_implies_rcu_gp()) - bpf_selem_free_fields_rcu(rcu); + bpf_local_storage_free_rcu(rcu); else - call_rcu(rcu, bpf_selem_free_fields_rcu); + call_rcu(rcu, bpf_local_storage_free_rcu); } -static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) +/* Handle bpf_ma == false */ +static void __bpf_local_storage_free(struct bpf_local_storage *local_storage, + bool vanilla_rcu) +{ + if (vanilla_rcu) + kfree_rcu(local_storage, rcu); + else + call_rcu_tasks_trace(&local_storage->rcu, + __bpf_local_storage_free_trace_rcu); +} + +static void bpf_local_storage_free(struct bpf_local_storage *local_storage, + struct bpf_local_storage_map *smap, + bool bpf_ma, bool reuse_now) +{ + if (!local_storage) + return; + + if (!bpf_ma) { + __bpf_local_storage_free(local_storage, reuse_now); + return; + } + + if (!reuse_now) { + call_rcu_tasks_trace(&local_storage->rcu, + bpf_local_storage_free_trace_rcu); + return; + } + + if (smap) { + migrate_disable(); + bpf_mem_cache_free(&smap->storage_ma, local_storage); + migrate_enable(); + } else { + /* smap could be NULL if the selem that triggered + * this 'local_storage' creation had been long gone. + * In this case, directly do call_rcu(). + */ + call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); + } +} + +/* rcu tasks trace callback for bpf_ma == false */ +static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage_elem *selem; @@ -141,17 +196,66 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) kfree_rcu(selem, rcu); } +/* Handle bpf_ma == false */ +static void __bpf_selem_free(struct bpf_local_storage_elem *selem, + bool vanilla_rcu) +{ + if (vanilla_rcu) + kfree_rcu(selem, rcu); + else + call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu); +} + +static void bpf_selem_free_rcu(struct rcu_head *rcu) +{ + struct bpf_local_storage_elem *selem; + + selem = container_of(rcu, struct bpf_local_storage_elem, rcu); + bpf_mem_cache_raw_free(selem); +} + +static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) +{ + if (rcu_trace_implies_rcu_gp()) + bpf_selem_free_rcu(rcu); + else + call_rcu(rcu, bpf_selem_free_rcu); +} + +void bpf_selem_free(struct bpf_local_storage_elem *selem, + struct bpf_local_storage_map *smap, + bool reuse_now) +{ + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + + if (!smap->bpf_ma) { + __bpf_selem_free(selem, reuse_now); + return; + } + + if (!reuse_now) { + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); + } else { + /* Instead of using the vanilla call_rcu(), + * bpf_mem_cache_free will be able to reuse selem + * immediately. + */ + migrate_disable(); + bpf_mem_cache_free(&smap->selem_ma, selem); + migrate_enable(); + } +} + /* local_storage->lock must be held and selem->local_storage == local_storage. * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. */ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem, - bool uncharge_mem, bool use_trace_rcu) + bool uncharge_mem, bool reuse_now) { struct bpf_local_storage_map *smap; bool free_local_storage; - struct btf_record *rec; void *owner; smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); @@ -192,35 +296,55 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); - /* A different RCU callback is chosen whenever we need to free - * additional fields in selem data before freeing selem. - * bpf_local_storage_map_free only executes rcu_barrier to wait for RCU - * callbacks when it has special fields, hence we can only conditionally - * dereference smap, as by this time the map might have already been - * freed without waiting for our call_rcu callback if it did not have - * any special fields. + bpf_selem_free(selem, smap, reuse_now); + + if (rcu_access_pointer(local_storage->smap) == smap) + RCU_INIT_POINTER(local_storage->smap, NULL); + + return free_local_storage; +} + +static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage, + struct bpf_local_storage_map *storage_smap, + struct bpf_local_storage_elem *selem) +{ + + struct bpf_local_storage_map *selem_smap; + + /* local_storage->smap may be NULL. If it is, get the bpf_ma + * from any selem in the local_storage->list. The bpf_ma of all + * local_storage and selem should have the same value + * for the same map type. + * + * If the local_storage->list is already empty, the caller will not + * care about the bpf_ma value also because the caller is not + * responsibile to free the local_storage. */ - rec = smap->map.record; - if (use_trace_rcu) { - if (!IS_ERR_OR_NULL(rec)) - call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_fields_trace_rcu); - else - call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); - } else { - if (!IS_ERR_OR_NULL(rec)) - call_rcu(&selem->rcu, bpf_selem_free_fields_rcu); - else - kfree_rcu(selem, rcu); + + if (storage_smap) + return storage_smap->bpf_ma; + + if (!selem) { + struct hlist_node *n; + + n = rcu_dereference_check(hlist_first_rcu(&local_storage->list), + bpf_rcu_lock_held()); + if (!n) + return false; + + selem = hlist_entry(n, struct bpf_local_storage_elem, snode); } + selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - return free_local_storage; + return selem_smap->bpf_ma; } -static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, - bool use_trace_rcu) +static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, + bool reuse_now) { + struct bpf_local_storage_map *storage_smap; struct bpf_local_storage *local_storage; - bool free_local_storage = false; + bool bpf_ma, free_local_storage = false; unsigned long flags; if (unlikely(!selem_linked_to_storage_lockless(selem))) @@ -229,19 +353,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, local_storage = rcu_dereference_check(selem->local_storage, bpf_rcu_lock_held()); + storage_smap = rcu_dereference_check(local_storage->smap, + bpf_rcu_lock_held()); + bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem); + raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, true, use_trace_rcu); + local_storage, selem, true, reuse_now); raw_spin_unlock_irqrestore(&local_storage->lock, flags); - if (free_local_storage) { - if (use_trace_rcu) - call_rcu_tasks_trace(&local_storage->rcu, - bpf_local_storage_free_rcu); - else - kfree_rcu(local_storage, rcu); - } + if (free_local_storage) + bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now); } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, @@ -251,7 +374,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, hlist_add_head_rcu(&selem->snode, &local_storage->list); } -void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) +static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; @@ -281,14 +404,14 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, raw_spin_unlock_irqrestore(&b->lock, flags); } -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu) +void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now) { /* Always unlink from map before unlinking from local_storage * because selem will be freed after successfully unlinked from * the local_storage. */ bpf_selem_unlink_map(selem); - __bpf_selem_unlink_storage(selem, use_trace_rcu); + bpf_selem_unlink_storage(selem, reuse_now); } /* If cacheit_lockit is false, this lookup function is lockless */ @@ -361,13 +484,21 @@ int bpf_local_storage_alloc(void *owner, if (err) return err; - storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), - gfp_flags | __GFP_NOWARN); + if (smap->bpf_ma) { + migrate_disable(); + storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags); + migrate_enable(); + } else { + storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), + gfp_flags | __GFP_NOWARN); + } + if (!storage) { err = -ENOMEM; goto uncharge; } + RCU_INIT_POINTER(storage->smap, smap); INIT_HLIST_HEAD(&storage->list); raw_spin_lock_init(&storage->lock); storage->owner = owner; @@ -407,7 +538,7 @@ int bpf_local_storage_alloc(void *owner, return 0; uncharge: - kfree(storage); + bpf_local_storage_free(storage, smap, smap->bpf_ma, true); mem_uncharge(smap, owner, sizeof(*storage)); return err; } @@ -451,7 +582,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags); if (err) { - kfree(selem); + bpf_selem_free(selem, smap, true); mem_uncharge(smap, owner, smap->elem_size); return ERR_PTR(err); } @@ -534,7 +665,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (old_sdata) { bpf_selem_unlink_map(SELEM(old_sdata)); bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata), - false, true); + false, false); } unlock: @@ -545,7 +676,7 @@ unlock_err: raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (selem) { mem_uncharge(smap, owner, smap->elem_size); - kfree(selem); + bpf_selem_free(selem, smap, true); } return ERR_PTR(err); } @@ -601,40 +732,6 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr) return 0; } -static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_attr *attr) -{ - struct bpf_local_storage_map *smap; - unsigned int i; - u32 nbuckets; - - smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE); - if (!smap) - return ERR_PTR(-ENOMEM); - bpf_map_init_from_attr(&smap->map, attr); - - nbuckets = roundup_pow_of_two(num_possible_cpus()); - /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ - nbuckets = max_t(u32, 2, nbuckets); - smap->bucket_log = ilog2(nbuckets); - - smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), - nbuckets, GFP_USER | __GFP_NOWARN); - if (!smap->buckets) { - bpf_map_area_free(smap); - return ERR_PTR(-ENOMEM); - } - - for (i = 0; i < nbuckets; i++) { - INIT_HLIST_HEAD(&smap->buckets[i].list); - raw_spin_lock_init(&smap->buckets[i].lock); - } - - smap->elem_size = offsetof(struct bpf_local_storage_elem, - sdata.data[attr->value_size]); - - return smap; -} - int bpf_local_storage_map_check_btf(const struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, @@ -652,11 +749,16 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, return 0; } -bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage) +void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) { + struct bpf_local_storage_map *storage_smap; struct bpf_local_storage_elem *selem; - bool free_storage = false; + bool bpf_ma, free_storage = false; struct hlist_node *n; + unsigned long flags; + + storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held()); + bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL); /* Neither the bpf_prog nor the bpf_map's syscall * could be modifying the local_storage->list now. @@ -667,6 +769,7 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ + raw_spin_lock_irqsave(&local_storage->lock, flags); hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { /* Always unlink from map before unlinking from * local_storage. @@ -679,10 +782,12 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage) * of the loop will set the free_cgroup_storage to true. */ free_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false, false); + local_storage, selem, false, true); } + raw_spin_unlock_irqrestore(&local_storage->lock, flags); - return free_storage; + if (free_storage) + bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true); } u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) @@ -695,18 +800,71 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) return usage; } +/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory. + * A deadlock free allocator is useful for storage that the bpf prog can easily + * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf. + * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses + * memory immediately. To be reuse-immediate safe, the owner destruction + * code path needs to go through a rcu grace period before calling + * bpf_local_storage_destroy(). + * + * When bpf_ma == false, the kmalloc and kfree are used. + */ struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, - struct bpf_local_storage_cache *cache) + struct bpf_local_storage_cache *cache, + bool bpf_ma) { struct bpf_local_storage_map *smap; + unsigned int i; + u32 nbuckets; + int err; + + smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE); + if (!smap) + return ERR_PTR(-ENOMEM); + bpf_map_init_from_attr(&smap->map, attr); + + nbuckets = roundup_pow_of_two(num_possible_cpus()); + /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ + nbuckets = max_t(u32, 2, nbuckets); + smap->bucket_log = ilog2(nbuckets); - smap = __bpf_local_storage_map_alloc(attr); - if (IS_ERR(smap)) - return ERR_CAST(smap); + smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), + nbuckets, GFP_USER | __GFP_NOWARN); + if (!smap->buckets) { + err = -ENOMEM; + goto free_smap; + } + + for (i = 0; i < nbuckets; i++) { + INIT_HLIST_HEAD(&smap->buckets[i].list); + raw_spin_lock_init(&smap->buckets[i].lock); + } + + smap->elem_size = offsetof(struct bpf_local_storage_elem, + sdata.data[attr->value_size]); + + smap->bpf_ma = bpf_ma; + if (bpf_ma) { + err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false); + if (err) + goto free_smap; + + err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false); + if (err) { + bpf_mem_alloc_destroy(&smap->selem_ma); + goto free_smap; + } + } smap->cache_idx = bpf_local_storage_cache_idx_get(cache); return &smap->map; + +free_smap: + kvfree(smap->buckets); + bpf_map_area_free(smap); + return ERR_PTR(err); } void bpf_local_storage_map_free(struct bpf_map *map, @@ -748,7 +906,7 @@ void bpf_local_storage_map_free(struct bpf_map *map, migrate_disable(); this_cpu_inc(*busy_counter); } - bpf_selem_unlink(selem, false); + bpf_selem_unlink(selem, true); if (busy_counter) { this_cpu_dec(*busy_counter); migrate_enable(); @@ -772,26 +930,10 @@ void bpf_local_storage_map_free(struct bpf_map *map, */ synchronize_rcu(); - /* Only delay freeing of smap, buckets are not needed anymore */ - kvfree(smap->buckets); - - /* When local storage has special fields, callbacks for - * bpf_selem_free_fields_rcu and bpf_selem_free_fields_trace_rcu will - * keep using the map BTF record, we need to execute an RCU barrier to - * wait for them as the record will be freed right after our map_free - * callback. - */ - if (!IS_ERR_OR_NULL(smap->map.record)) { - rcu_barrier_tasks_trace(); - /* We cannot skip rcu_barrier() when rcu_trace_implies_rcu_gp() - * is true, because while call_rcu invocation is skipped in that - * case in bpf_selem_free_fields_trace_rcu (and all local - * storage maps pass use_trace_rcu = true), there can be - * call_rcu callbacks based on use_trace_rcu = false in the - * while ((selem = ...)) loop above or when owner's free path - * calls bpf_local_storage_unlink_nolock. - */ - rcu_barrier(); + if (smap->bpf_ma) { + bpf_mem_alloc_destroy(&smap->selem_ma); + bpf_mem_alloc_destroy(&smap->storage_ma); } + kvfree(smap->buckets); bpf_map_area_free(smap); } diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 38903fb52f98..d3f0a4825fa6 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -11,11 +11,13 @@ #include <linux/refcount.h> #include <linux/mutex.h> #include <linux/btf_ids.h> +#include <linux/rcupdate_wait.h> enum bpf_struct_ops_state { BPF_STRUCT_OPS_STATE_INIT, BPF_STRUCT_OPS_STATE_INUSE, BPF_STRUCT_OPS_STATE_TOBEFREE, + BPF_STRUCT_OPS_STATE_READY, }; #define BPF_STRUCT_OPS_COMMON_VALUE \ @@ -58,6 +60,13 @@ struct bpf_struct_ops_map { struct bpf_struct_ops_value kvalue; }; +struct bpf_struct_ops_link { + struct bpf_link link; + struct bpf_map __rcu *map; +}; + +static DEFINE_MUTEX(update_mutex); + #define VALUE_PREFIX "bpf_struct_ops_" #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1) @@ -249,6 +258,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; struct bpf_struct_ops_value *uvalue, *kvalue; enum bpf_struct_ops_state state; + s64 refcnt; if (unlikely(*(u32 *)key != 0)) return -ENOENT; @@ -267,7 +277,14 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, uvalue = value; memcpy(uvalue, st_map->uvalue, map->value_size); uvalue->state = state; - refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt)); + + /* This value offers the user space a general estimate of how + * many sockets are still utilizing this struct_ops for TCP + * congestion control. The number might not be exact, but it + * should sufficiently meet our present goals. + */ + refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt); + refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0)); return 0; } @@ -349,8 +366,8 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, model, flags, tlinks, NULL); } -static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) +static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; const struct bpf_struct_ops *st_ops = st_map->st_ops; @@ -491,12 +508,29 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, *(unsigned long *)(udata + moff) = prog->aux->id; } - refcount_set(&kvalue->refcnt, 1); - bpf_map_inc(map); + if (st_map->map.map_flags & BPF_F_LINK) { + err = st_ops->validate(kdata); + if (err) + goto reset_unlock; + set_memory_rox((long)st_map->image, 1); + /* Let bpf_link handle registration & unregistration. + * + * Pair with smp_load_acquire() during lookup_elem(). + */ + smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY); + goto unlock; + } set_memory_rox((long)st_map->image, 1); err = st_ops->reg(kdata); if (likely(!err)) { + /* This refcnt increment on the map here after + * 'st_ops->reg()' is secure since the state of the + * map must be set to INIT at this moment, and thus + * bpf_struct_ops_map_delete_elem() can't unregister + * or transition it to TOBEFREE concurrently. + */ + bpf_map_inc(map); /* Pair with smp_load_acquire() during lookup_elem(). * It ensures the above udata updates (e.g. prog->aux->id) * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set. @@ -512,7 +546,6 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, */ set_memory_nx((long)st_map->image, 1); set_memory_rw((long)st_map->image, 1); - bpf_map_put(map); reset_unlock: bpf_struct_ops_map_put_progs(st_map); @@ -524,20 +557,22 @@ unlock: return err; } -static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) +static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) { enum bpf_struct_ops_state prev_state; struct bpf_struct_ops_map *st_map; st_map = (struct bpf_struct_ops_map *)map; + if (st_map->map.map_flags & BPF_F_LINK) + return -EOPNOTSUPP; + prev_state = cmpxchg(&st_map->kvalue.state, BPF_STRUCT_OPS_STATE_INUSE, BPF_STRUCT_OPS_STATE_TOBEFREE); switch (prev_state) { case BPF_STRUCT_OPS_STATE_INUSE: st_map->st_ops->unreg(&st_map->kvalue.data); - if (refcount_dec_and_test(&st_map->kvalue.refcnt)) - bpf_map_put(map); + bpf_map_put(map); return 0; case BPF_STRUCT_OPS_STATE_TOBEFREE: return -EINPROGRESS; @@ -570,7 +605,7 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, kfree(value); } -static void bpf_struct_ops_map_free(struct bpf_map *map) +static void __bpf_struct_ops_map_free(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; @@ -582,10 +617,32 @@ static void bpf_struct_ops_map_free(struct bpf_map *map) bpf_map_area_free(st_map); } +static void bpf_struct_ops_map_free(struct bpf_map *map) +{ + /* The struct_ops's function may switch to another struct_ops. + * + * For example, bpf_tcp_cc_x->init() may switch to + * another tcp_cc_y by calling + * setsockopt(TCP_CONGESTION, "tcp_cc_y"). + * During the switch, bpf_struct_ops_put(tcp_cc_x) is called + * and its refcount may reach 0 which then free its + * trampoline image while tcp_cc_x is still running. + * + * A vanilla rcu gp is to wait for all bpf-tcp-cc prog + * to finish. bpf-tcp-cc prog is non sleepable. + * A rcu_tasks gp is to wait for the last few insn + * in the tramopline image to finish before releasing + * the trampoline image. + */ + synchronize_rcu_mult(call_rcu, call_rcu_tasks); + + __bpf_struct_ops_map_free(map); +} + static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) { if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 || - attr->map_flags || !attr->btf_vmlinux_value_type_id) + (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id) return -EINVAL; return 0; } @@ -609,6 +666,9 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) if (attr->value_size != vt->size) return ERR_PTR(-EINVAL); + if (attr->map_flags & BPF_F_LINK && (!st_ops->validate || !st_ops->update)) + return ERR_PTR(-EOPNOTSUPP); + t = st_ops->type; st_map_size = sizeof(*st_map) + @@ -630,7 +690,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) NUMA_NO_NODE); st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); if (!st_map->uvalue || !st_map->links || !st_map->image) { - bpf_struct_ops_map_free(map); + __bpf_struct_ops_map_free(map); return ERR_PTR(-ENOMEM); } @@ -676,41 +736,175 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = { bool bpf_struct_ops_get(const void *kdata) { struct bpf_struct_ops_value *kvalue; + struct bpf_struct_ops_map *st_map; + struct bpf_map *map; kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); - return refcount_inc_not_zero(&kvalue->refcnt); + map = __bpf_map_inc_not_zero(&st_map->map, false); + return !IS_ERR(map); } -static void bpf_struct_ops_put_rcu(struct rcu_head *head) +void bpf_struct_ops_put(const void *kdata) { + struct bpf_struct_ops_value *kvalue; struct bpf_struct_ops_map *st_map; - st_map = container_of(head, struct bpf_struct_ops_map, rcu); + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); + bpf_map_put(&st_map->map); } -void bpf_struct_ops_put(const void *kdata) +static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { - struct bpf_struct_ops_value *kvalue; + struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; - kvalue = container_of(kdata, struct bpf_struct_ops_value, data); - if (refcount_dec_and_test(&kvalue->refcnt)) { - struct bpf_struct_ops_map *st_map; + return map->map_type == BPF_MAP_TYPE_STRUCT_OPS && + map->map_flags & BPF_F_LINK && + /* Pair with smp_store_release() during map_update */ + smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY; +} - st_map = container_of(kvalue, struct bpf_struct_ops_map, - kvalue); - /* The struct_ops's function may switch to another struct_ops. - * - * For example, bpf_tcp_cc_x->init() may switch to - * another tcp_cc_y by calling - * setsockopt(TCP_CONGESTION, "tcp_cc_y"). - * During the switch, bpf_struct_ops_put(tcp_cc_x) is called - * and its map->refcnt may reach 0 which then free its - * trampoline image while tcp_cc_x is still running. - * - * Thus, a rcu grace period is needed here. +static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) +{ + struct bpf_struct_ops_link *st_link; + struct bpf_struct_ops_map *st_map; + + st_link = container_of(link, struct bpf_struct_ops_link, link); + st_map = (struct bpf_struct_ops_map *) + rcu_dereference_protected(st_link->map, true); + if (st_map) { + /* st_link->map can be NULL if + * bpf_struct_ops_link_create() fails to register. */ - call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu); + st_map->st_ops->unreg(&st_map->kvalue.data); + bpf_map_put(&st_map->map); + } + kfree(st_link); +} + +static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_struct_ops_link *st_link; + struct bpf_map *map; + + st_link = container_of(link, struct bpf_struct_ops_link, link); + rcu_read_lock(); + map = rcu_dereference(st_link->map); + seq_printf(seq, "map_id:\t%d\n", map->id); + rcu_read_unlock(); +} + +static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_struct_ops_link *st_link; + struct bpf_map *map; + + st_link = container_of(link, struct bpf_struct_ops_link, link); + rcu_read_lock(); + map = rcu_dereference(st_link->map); + info->struct_ops.map_id = map->id; + rcu_read_unlock(); + return 0; +} + +static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map, + struct bpf_map *expected_old_map) +{ + struct bpf_struct_ops_map *st_map, *old_st_map; + struct bpf_map *old_map; + struct bpf_struct_ops_link *st_link; + int err = 0; + + st_link = container_of(link, struct bpf_struct_ops_link, link); + st_map = container_of(new_map, struct bpf_struct_ops_map, map); + + if (!bpf_struct_ops_valid_to_reg(new_map)) + return -EINVAL; + + mutex_lock(&update_mutex); + + old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); + if (expected_old_map && old_map != expected_old_map) { + err = -EPERM; + goto err_out; + } + + old_st_map = container_of(old_map, struct bpf_struct_ops_map, map); + /* The new and old struct_ops must be the same type. */ + if (st_map->st_ops != old_st_map->st_ops) { + err = -EINVAL; + goto err_out; } + + err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data); + if (err) + goto err_out; + + bpf_map_inc(new_map); + rcu_assign_pointer(st_link->map, new_map); + bpf_map_put(old_map); + +err_out: + mutex_unlock(&update_mutex); + + return err; } + +static const struct bpf_link_ops bpf_struct_ops_map_lops = { + .dealloc = bpf_struct_ops_map_link_dealloc, + .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, + .fill_link_info = bpf_struct_ops_map_link_fill_link_info, + .update_map = bpf_struct_ops_map_link_update, +}; + +int bpf_struct_ops_link_create(union bpf_attr *attr) +{ + struct bpf_struct_ops_link *link = NULL; + struct bpf_link_primer link_primer; + struct bpf_struct_ops_map *st_map; + struct bpf_map *map; + int err; + + map = bpf_map_get(attr->link_create.map_fd); + if (IS_ERR(map)) + return PTR_ERR(map); + + st_map = (struct bpf_struct_ops_map *)map; + + if (!bpf_struct_ops_valid_to_reg(map)) { + err = -EINVAL; + goto err_out; + } + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto err_out; + } + bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL); + + err = bpf_link_prime(&link->link, &link_primer); + if (err) + goto err_out; + + err = st_map->st_ops->reg(st_map->kvalue.data); + if (err) { + bpf_link_cleanup(&link_primer); + link = NULL; + goto err_out; + } + RCU_INIT_POINTER(link->map, map); + + return bpf_link_settle(&link_primer); + +err_out: + bpf_map_put(map); + kfree(link); + return err; +} + diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 20f942229f3c..adf6dfe0ba68 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -72,8 +72,6 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map, void bpf_task_storage_free(struct task_struct *task) { struct bpf_local_storage *local_storage; - bool free_task_storage = false; - unsigned long flags; rcu_read_lock(); @@ -84,14 +82,9 @@ void bpf_task_storage_free(struct task_struct *task) } bpf_task_storage_lock(); - raw_spin_lock_irqsave(&local_storage->lock, flags); - free_task_storage = bpf_local_storage_unlink_nolock(local_storage); - raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_local_storage_destroy(local_storage); bpf_task_storage_unlock(); rcu_read_unlock(); - - if (free_task_storage) - kfree_rcu(local_storage, rcu); } static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) @@ -127,8 +120,8 @@ out: return ERR_PTR(err); } -static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct task_struct *task; @@ -175,12 +168,12 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map, if (!nobusy) return -EBUSY; - bpf_selem_unlink(SELEM(sdata), true); + bpf_selem_unlink(SELEM(sdata), false); return 0; } -static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) +static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) { struct task_struct *task; unsigned int f_flags; @@ -316,7 +309,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) { - return bpf_local_storage_map_alloc(attr, &task_cache); + return bpf_local_storage_map_alloc(attr, &task_cache, true); } static void task_storage_map_free(struct bpf_map *map) @@ -345,7 +338,7 @@ const struct bpf_func_proto bpf_task_storage_get_recur_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, @@ -356,7 +349,7 @@ const struct bpf_func_proto bpf_task_storage_get_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, @@ -367,7 +360,7 @@ const struct bpf_func_proto bpf_task_storage_delete_recur_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], }; @@ -376,6 +369,6 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 1853beaed4be..2c2d1fb9f410 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3231,12 +3231,6 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } -enum btf_field_info_type { - BTF_FIELD_SPIN_LOCK, - BTF_FIELD_TIMER, - BTF_FIELD_KPTR, -}; - enum { BTF_FIELD_IGNORE = 0, BTF_FIELD_FOUND = 1, @@ -3562,7 +3556,10 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, { struct module *mod = NULL; const struct btf_type *t; - struct btf *kernel_btf; + /* If a matching btf type is found in kernel or module BTFs, kptr_ref + * is that BTF, otherwise it's program BTF + */ + struct btf *kptr_btf; int ret; s32 id; @@ -3571,7 +3568,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, */ t = btf_type_by_id(btf, info->kptr.type_id); id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), - &kernel_btf); + &kptr_btf); + if (id == -ENOENT) { + /* btf_parse_kptr should only be called w/ btf = program BTF */ + WARN_ON_ONCE(btf_is_kernel(btf)); + + /* Type exists only in program BTF. Assume that it's a MEM_ALLOC + * kptr allocated via bpf_obj_new + */ + field->kptr.dtor = NULL; + id = info->kptr.type_id; + kptr_btf = (struct btf *)btf; + btf_get(kptr_btf); + goto found_dtor; + } if (id < 0) return id; @@ -3588,20 +3598,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, * can be used as a referenced pointer and be stored in a map at * the same time. */ - dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); + dtor_btf_id = btf_find_dtor_kfunc(kptr_btf, id); if (dtor_btf_id < 0) { ret = dtor_btf_id; goto end_btf; } - dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); + dtor_func = btf_type_by_id(kptr_btf, dtor_btf_id); if (!dtor_func) { ret = -ENOENT; goto end_btf; } - if (btf_is_module(kernel_btf)) { - mod = btf_try_get_module(kernel_btf); + if (btf_is_module(kptr_btf)) { + mod = btf_try_get_module(kptr_btf); if (!mod) { ret = -ENXIO; goto end_btf; @@ -3611,7 +3621,7 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, /* We already verified dtor_func to be btf_type_is_func * in register_btf_id_dtor_kfuncs. */ - dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); + dtor_func_name = __btf_name_by_offset(kptr_btf, dtor_func->name_off); addr = kallsyms_lookup_name(dtor_func_name); if (!addr) { ret = -EINVAL; @@ -3620,14 +3630,15 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, field->kptr.dtor = (void *)addr; } +found_dtor: field->kptr.btf_id = id; - field->kptr.btf = kernel_btf; + field->kptr.btf = kptr_btf; field->kptr.module = mod; return 0; end_mod: module_put(mod); end_btf: - btf_put(kernel_btf); + btf_put(kptr_btf); return ret; } @@ -5494,38 +5505,45 @@ static int btf_check_type_tags(struct btf_verifier_env *env, return 0; } -static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, - u32 log_level, char __user *log_ubuf, u32 log_size) +static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size) { + u32 log_true_size; + int err; + + err = bpf_vlog_finalize(log, &log_true_size); + + if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) && + copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size), + &log_true_size, sizeof(log_true_size))) + err = -EFAULT; + + return err; +} + +static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) +{ + bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel); + char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf); struct btf_struct_metas *struct_meta_tab; struct btf_verifier_env *env = NULL; - struct bpf_verifier_log *log; struct btf *btf = NULL; u8 *data; - int err; + int err, ret; - if (btf_data_size > BTF_MAX_SIZE) + if (attr->btf_size > BTF_MAX_SIZE) return ERR_PTR(-E2BIG); env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); if (!env) return ERR_PTR(-ENOMEM); - log = &env->log; - if (log_level || log_ubuf || log_size) { - /* user requested verbose verifier output - * and supplied buffer to store the verification trace - */ - log->level = log_level; - log->ubuf = log_ubuf; - log->len_total = log_size; - - /* log attributes have to be sane */ - if (!bpf_verifier_log_attr_valid(log)) { - err = -EINVAL; - goto errout; - } - } + /* user could have requested verbose verifier output + * and supplied buffer to store the verification trace + */ + err = bpf_vlog_init(&env->log, attr->btf_log_level, + log_ubuf, attr->btf_log_size); + if (err) + goto errout_free; btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); if (!btf) { @@ -5534,16 +5552,16 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, } env->btf = btf; - data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); + data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN); if (!data) { err = -ENOMEM; goto errout; } btf->data = data; - btf->data_size = btf_data_size; + btf->data_size = attr->btf_size; - if (copy_from_bpfptr(data, btf_data, btf_data_size)) { + if (copy_from_bpfptr(data, btf_data, attr->btf_size)) { err = -EFAULT; goto errout; } @@ -5566,7 +5584,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, if (err) goto errout; - struct_meta_tab = btf_parse_struct_metas(log, btf); + struct_meta_tab = btf_parse_struct_metas(&env->log, btf); if (IS_ERR(struct_meta_tab)) { err = PTR_ERR(struct_meta_tab); goto errout; @@ -5583,10 +5601,9 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, } } - if (log->level && bpf_verifier_log_full(log)) { - err = -ENOSPC; - goto errout_meta; - } + err = finalize_log(&env->log, uattr, uattr_size); + if (err) + goto errout_free; btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); @@ -5595,6 +5612,11 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, errout_meta: btf_free_struct_meta_tab(btf); errout: + /* overwrite err with -ENOSPC or -EFAULT */ + ret = finalize_log(&env->log, uattr, uattr_size); + if (ret) + err = ret; +errout_free: btf_verifier_env_free(env); if (btf) btf_free(btf); @@ -5900,12 +5922,8 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) static bool is_int_ptr(struct btf *btf, const struct btf_type *t) { - /* t comes in already as a pointer */ - t = btf_type_by_id(btf, t->type); - - /* allow const */ - if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST) - t = btf_type_by_id(btf, t->type); + /* skip modifiers */ + t = btf_type_skip_modifiers(btf, t->type, NULL); return btf_type_is_int(t); } @@ -6156,7 +6174,8 @@ enum bpf_struct_walk_result { static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, - u32 *next_btf_id, enum bpf_type_flag *flag) + u32 *next_btf_id, enum bpf_type_flag *flag, + const char **field_name) { u32 i, moff, mtrue_end, msize = 0, total_nelems = 0; const struct btf_type *mtype, *elem_type = NULL; @@ -6385,6 +6404,8 @@ error: if (btf_type_is_struct(stype)) { *next_btf_id = id; *flag |= tmp_flag; + if (field_name) + *field_name = mname; return WALK_PTR; } } @@ -6411,7 +6432,8 @@ error: int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype __maybe_unused, - u32 *next_btf_id, enum bpf_type_flag *flag) + u32 *next_btf_id, enum bpf_type_flag *flag, + const char **field_name) { const struct btf *btf = reg->btf; enum bpf_type_flag tmp_flag = 0; @@ -6443,7 +6465,7 @@ int btf_struct_access(struct bpf_verifier_log *log, t = btf_type_by_id(btf, id); do { - err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag); + err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag, field_name); switch (err) { case WALK_PTR: @@ -6518,7 +6540,7 @@ again: type = btf_type_by_id(btf, id); if (!type) return false; - err = btf_struct_walk(log, btf, type, off, 1, &id, &flag); + err = btf_struct_walk(log, btf, type, off, 1, &id, &flag, NULL); if (err != WALK_STRUCT) return false; @@ -7199,15 +7221,12 @@ static int __btf_new_fd(struct btf *btf) return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC); } -int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr) +int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { struct btf *btf; int ret; - btf = btf_parse(make_bpfptr(attr->btf, uattr.is_kernel), - attr->btf_size, attr->btf_log_level, - u64_to_user_ptr(attr->btf_log_buf), - attr->btf_log_size); + btf = btf_parse(attr, uattr, uattr_size); if (IS_ERR(btf)) return PTR_ERR(btf); @@ -7597,6 +7616,108 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE) BTF_TRACING_TYPE_xxx #undef BTF_TRACING_TYPE +static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name, + const struct btf_type *func, u32 func_flags) +{ + u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY); + const char *name, *sfx, *iter_name; + const struct btf_param *arg; + const struct btf_type *t; + char exp_name[128]; + u32 nr_args; + + /* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */ + if (!flags || (flags & (flags - 1))) + return -EINVAL; + + /* any BPF iter kfunc should have `struct bpf_iter_<type> *` first arg */ + nr_args = btf_type_vlen(func); + if (nr_args < 1) + return -EINVAL; + + arg = &btf_params(func)[0]; + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!t || !btf_type_is_ptr(t)) + return -EINVAL; + t = btf_type_skip_modifiers(btf, t->type, NULL); + if (!t || !__btf_type_is_struct(t)) + return -EINVAL; + + name = btf_name_by_offset(btf, t->name_off); + if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1)) + return -EINVAL; + + /* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to + * fit nicely in stack slots + */ + if (t->size == 0 || (t->size % 8)) + return -EINVAL; + + /* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *) + * naming pattern + */ + iter_name = name + sizeof(ITER_PREFIX) - 1; + if (flags & KF_ITER_NEW) + sfx = "new"; + else if (flags & KF_ITER_NEXT) + sfx = "next"; + else /* (flags & KF_ITER_DESTROY) */ + sfx = "destroy"; + + snprintf(exp_name, sizeof(exp_name), "bpf_iter_%s_%s", iter_name, sfx); + if (strcmp(func_name, exp_name)) + return -EINVAL; + + /* only iter constructor should have extra arguments */ + if (!(flags & KF_ITER_NEW) && nr_args != 1) + return -EINVAL; + + if (flags & KF_ITER_NEXT) { + /* bpf_iter_<type>_next() should return pointer */ + t = btf_type_skip_modifiers(btf, func->type, NULL); + if (!t || !btf_type_is_ptr(t)) + return -EINVAL; + } + + if (flags & KF_ITER_DESTROY) { + /* bpf_iter_<type>_destroy() should return void */ + t = btf_type_by_id(btf, func->type); + if (!t || !btf_type_is_void(t)) + return -EINVAL; + } + + return 0; +} + +static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags) +{ + const struct btf_type *func; + const char *func_name; + int err; + + /* any kfunc should be FUNC -> FUNC_PROTO */ + func = btf_type_by_id(btf, func_id); + if (!func || !btf_type_is_func(func)) + return -EINVAL; + + /* sanity check kfunc name */ + func_name = btf_name_by_offset(btf, func->name_off); + if (!func_name || !func_name[0]) + return -EINVAL; + + func = btf_type_by_id(btf, func->type); + if (!func || !btf_type_is_func_proto(func)) + return -EINVAL; + + if (func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY)) { + err = btf_check_iter_kfuncs(btf, func_name, func, func_flags); + if (err) + return err; + } + + return 0; +} + /* Kernel Function (kfunc) BTF ID set registration API */ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, @@ -7773,7 +7894,7 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, const struct btf_kfunc_id_set *kset) { struct btf *btf; - int ret; + int ret, i; btf = btf_get_module_btf(kset->owner); if (!btf) { @@ -7790,7 +7911,15 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, if (IS_ERR(btf)) return PTR_ERR(btf); + for (i = 0; i < kset->set->cnt; i++) { + ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id, + kset->set->pairs[i].flags); + if (ret) + goto err_out; + } + ret = btf_populate_kfunc_set(btf, hook, kset->set); +err_out: btf_put(btf); return ret; } @@ -8368,16 +8497,15 @@ out: bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, const char *suffix) + const char *field_name, u32 btf_id, const char *suffix) { struct btf *btf = reg->btf; const struct btf_type *walk_type, *safe_type; const char *tname; char safe_tname[64]; long ret, safe_id; - const struct btf_member *member, *m_walk = NULL; + const struct btf_member *member; u32 i; - const char *walk_name; walk_type = btf_type_by_id(btf, reg->btf_id); if (!walk_type) @@ -8397,30 +8525,17 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, if (!safe_type) return false; - for_each_member(i, walk_type, member) { - u32 moff; - - /* We're looking for the PTR_TO_BTF_ID member in the struct - * type we're walking which matches the specified offset. - * Below, we'll iterate over the fields in the safe variant of - * the struct and see if any of them has a matching type / - * name. - */ - moff = __btf_member_bit_offset(walk_type, member) / 8; - if (off == moff) { - m_walk = member; - break; - } - } - if (m_walk == NULL) - return false; - - walk_name = __btf_name_by_offset(btf, m_walk->name_off); for_each_member(i, safe_type, member) { const char *m_name = __btf_name_by_offset(btf, member->name_off); + const struct btf_type *mtype = btf_type_by_id(btf, member->type); + u32 id; + + if (!btf_type_is_ptr(mtype)) + continue; + btf_type_skip_modifiers(btf, mtype->type, &id); /* If we match on both type and name, the field is considered trusted. */ - if (m_walk->type == member->type && !strcmp(walk_name, m_name)) + if (btf_id == id && !strcmp(field_name, m_name)) return true; } diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 871809e71b4e..8ec18faa74ac 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -540,7 +540,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, } } -static int cpu_map_delete_elem(struct bpf_map *map, void *key) +static long cpu_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); u32 key_cpu = *(u32 *)key; @@ -553,8 +553,8 @@ static int cpu_map_delete_elem(struct bpf_map *map, void *key) return 0; } -static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpumap_val cpumap_value = {}; @@ -667,7 +667,7 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags) +static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags) { return __bpf_xdp_redirect_map(map, index, flags, 0, __cpu_map_lookup_elem); diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index b6587ec40f1b..7efdf5d770ca 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -9,6 +9,7 @@ /** * struct bpf_cpumask - refcounted BPF cpumask wrapper structure * @cpumask: The actual cpumask embedded in the struct. + * @rcu: The RCU head used to free the cpumask with RCU safety. * @usage: Object reference counter. When the refcount goes to 0, the * memory is released back to the BPF allocator, which provides * RCU safety. @@ -24,6 +25,7 @@ */ struct bpf_cpumask { cpumask_t cpumask; + struct rcu_head rcu; refcount_t usage; }; @@ -80,32 +82,14 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) return cpumask; } -/** - * bpf_cpumask_kptr_get() - Attempt to acquire a reference to a BPF cpumask - * stored in a map. - * @cpumaskp: A pointer to a BPF cpumask map value. - * - * Attempts to acquire a reference to a BPF cpumask stored in a map value. The - * cpumask returned by this function must either be embedded in a map as a - * kptr, or freed with bpf_cpumask_release(). This function may return NULL if - * no BPF cpumask was found in the specified map value. - */ -__bpf_kfunc struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumaskp) +static void cpumask_free_cb(struct rcu_head *head) { struct bpf_cpumask *cpumask; - /* The BPF memory allocator frees memory backing its caches in an RCU - * callback. Thus, we can safely use RCU to ensure that the cpumask is - * safe to read. - */ - rcu_read_lock(); - - cpumask = READ_ONCE(*cpumaskp); - if (cpumask && !refcount_inc_not_zero(&cpumask->usage)) - cpumask = NULL; - - rcu_read_unlock(); - return cpumask; + cpumask = container_of(head, struct bpf_cpumask, rcu); + migrate_disable(); + bpf_mem_cache_free(&bpf_cpumask_ma, cpumask); + migrate_enable(); } /** @@ -118,14 +102,8 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumas */ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask) { - if (!cpumask) - return; - - if (refcount_dec_and_test(&cpumask->usage)) { - migrate_disable(); - bpf_mem_cache_free(&bpf_cpumask_ma, cpumask); - migrate_enable(); - } + if (refcount_dec_and_test(&cpumask->usage)) + call_rcu(&cpumask->rcu, cpumask_free_cb); } /** @@ -424,9 +402,8 @@ __diag_pop(); BTF_SET8_START(cpumask_kfunc_btf_ids) BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cpumask_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 19b036a228f7..802692fa3905 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -809,7 +809,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu) kfree(dev); } -static int dev_map_delete_elem(struct bpf_map *map, void *key) +static long dev_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; @@ -826,7 +826,7 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key) return 0; } -static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) +static long dev_map_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; @@ -897,8 +897,8 @@ err_out: return ERR_PTR(-EINVAL); } -static int __dev_map_update_elem(struct net *net, struct bpf_map *map, - void *key, void *value, u64 map_flags) +static long __dev_map_update_elem(struct net *net, struct bpf_map *map, + void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; @@ -939,15 +939,15 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, return 0; } -static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long dev_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { return __dev_map_update_elem(current->nsproxy->net_ns, map, key, value, map_flags); } -static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, - void *key, void *value, u64 map_flags) +static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, + void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; @@ -999,21 +999,21 @@ out_err: return err; } -static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { return __dev_map_hash_update_elem(current->nsproxy->net_ns, map, key, value, map_flags); } -static int dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) +static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_lookup_elem); } -static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) +static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0df4b0c10f59..00c253b84bf5 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -607,6 +607,8 @@ free_htab: static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd) { + if (likely(key_len % 4 == 0)) + return jhash2(key, key_len / 4, hashrnd); return jhash(key, key_len, hashrnd); } @@ -1073,8 +1075,8 @@ static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, } /* Called from syscall or from eBPF program */ -static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; @@ -1175,8 +1177,8 @@ static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) bpf_lru_push_free(&htab->lru, &elem->lru_node); } -static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new, *l_old = NULL; @@ -1242,9 +1244,9 @@ err: return ret; } -static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags, - bool onallcpus) +static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; @@ -1297,9 +1299,9 @@ err: return ret; } -static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags, - bool onallcpus) +static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new = NULL, *l_old; @@ -1364,21 +1366,21 @@ err: return ret; } -static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static long htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { return __htab_percpu_map_update_elem(map, key, value, map_flags, false); } -static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags) +static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) { return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, false); } /* Called from syscall or from eBPF program */ -static int htab_map_delete_elem(struct bpf_map *map, void *key) +static long htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; @@ -1414,7 +1416,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) return ret; } -static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) +static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; @@ -2134,8 +2136,8 @@ static const struct bpf_iter_seq_info iter_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), }; -static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn, - void *callback_ctx, u64 flags) +static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn, + void *callback_ctx, u64 flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_nulls_head *head; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 637ac4e92e75..f04e60a4847f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -18,6 +18,7 @@ #include <linux/pid_namespace.h> #include <linux/poison.h> #include <linux/proc_ns.h> +#include <linux/sched/task.h> #include <linux/security.h> #include <linux/btf_ids.h> #include <linux/bpf_mem_alloc.h> @@ -257,7 +258,7 @@ BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) goto err_clear; /* Verifier guarantees that size > 0 */ - strscpy(buf, task->comm, size); + strscpy_pad(buf, task->comm, size); return 0; err_clear: memset(buf, 0, size); @@ -571,7 +572,7 @@ static const struct bpf_func_proto bpf_strncmp_proto = { .func = bpf_strncmp, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_PTR_TO_CONST_STR, }; @@ -1896,14 +1897,19 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) return p; } +void __bpf_obj_drop_impl(void *p, const struct btf_record *rec) +{ + if (rec) + bpf_obj_free_fields(rec, p); + bpf_mem_free(&bpf_global_ma, p); +} + __bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) { struct btf_struct_meta *meta = meta__ign; void *p = p__alloc; - if (meta) - bpf_obj_free_fields(meta->record, p); - bpf_mem_free(&bpf_global_ma, p); + __bpf_obj_drop_impl(p, meta ? meta->record : NULL); } static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail) @@ -2008,73 +2014,8 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) */ __bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p) { - return get_task_struct(p); -} - -/** - * bpf_task_acquire_not_zero - Acquire a reference to a rcu task object. A task - * acquired by this kfunc which is not stored in a map as a kptr, must be - * released by calling bpf_task_release(). - * @p: The task on which a reference is being acquired. - */ -__bpf_kfunc struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p) -{ - /* For the time being this function returns NULL, as it's not currently - * possible to safely acquire a reference to a task with RCU protection - * using get_task_struct() and put_task_struct(). This is due to the - * slightly odd mechanics of p->rcu_users, and how task RCU protection - * works. - * - * A struct task_struct is refcounted by two different refcount_t - * fields: - * - * 1. p->usage: The "true" refcount field which tracks a task's - * lifetime. The task is freed as soon as this - * refcount drops to 0. - * - * 2. p->rcu_users: An "RCU users" refcount field which is statically - * initialized to 2, and is co-located in a union with - * a struct rcu_head field (p->rcu). p->rcu_users - * essentially encapsulates a single p->usage - * refcount, and when p->rcu_users goes to 0, an RCU - * callback is scheduled on the struct rcu_head which - * decrements the p->usage refcount. - * - * There are two important implications to this task refcounting logic - * described above. The first is that - * refcount_inc_not_zero(&p->rcu_users) cannot be used anywhere, as - * after the refcount goes to 0, the RCU callback being scheduled will - * cause the memory backing the refcount to again be nonzero due to the - * fields sharing a union. The other is that we can't rely on RCU to - * guarantee that a task is valid in a BPF program. This is because a - * task could have already transitioned to being in the TASK_DEAD - * state, had its rcu_users refcount go to 0, and its rcu callback - * invoked in which it drops its single p->usage reference. At this - * point the task will be freed as soon as the last p->usage reference - * goes to 0, without waiting for another RCU gp to elapse. The only - * way that a BPF program can guarantee that a task is valid is in this - * scenario is to hold a p->usage refcount itself. - * - * Until we're able to resolve this issue, either by pulling - * p->rcu_users and p->rcu out of the union, or by getting rid of - * p->usage and just using p->rcu_users for refcounting, we'll just - * return NULL here. - */ - return NULL; -} - -/** - * bpf_task_kptr_get - Acquire a reference on a struct task_struct kptr. A task - * kptr acquired by this kfunc which is not subsequently stored in a map, must - * be released by calling bpf_task_release(). - * @pp: A pointer to a task kptr on which a reference is being acquired. - */ -__bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp) -{ - /* We must return NULL here until we have clarity on how to properly - * leverage RCU for ensuring a task's lifetime. See the comment above - * in bpf_task_acquire_not_zero() for more details. - */ + if (refcount_inc_not_zero(&p->rcu_users)) + return p; return NULL; } @@ -2084,10 +2025,7 @@ __bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp) */ __bpf_kfunc void bpf_task_release(struct task_struct *p) { - if (!p) - return; - - put_task_struct(p); + put_task_struct_rcu_user(p); } #ifdef CONFIG_CGROUPS @@ -2099,39 +2037,7 @@ __bpf_kfunc void bpf_task_release(struct task_struct *p) */ __bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp) { - cgroup_get(cgrp); - return cgrp; -} - -/** - * bpf_cgroup_kptr_get - Acquire a reference on a struct cgroup kptr. A cgroup - * kptr acquired by this kfunc which is not subsequently stored in a map, must - * be released by calling bpf_cgroup_release(). - * @cgrpp: A pointer to a cgroup kptr on which a reference is being acquired. - */ -__bpf_kfunc struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp) -{ - struct cgroup *cgrp; - - rcu_read_lock(); - /* Another context could remove the cgroup from the map and release it - * at any time, including after we've done the lookup above. This is - * safe because we're in an RCU read region, so the cgroup is - * guaranteed to remain valid until at least the rcu_read_unlock() - * below. - */ - cgrp = READ_ONCE(*cgrpp); - - if (cgrp && !cgroup_tryget(cgrp)) - /* If the cgroup had been removed from the map and freed as - * described above, cgroup_tryget() will return false. The - * cgroup will be freed at some point after the current RCU gp - * has ended, so just return NULL to the user. - */ - cgrp = NULL; - rcu_read_unlock(); - - return cgrp; + return cgroup_tryget(cgrp) ? cgrp : NULL; } /** @@ -2143,9 +2049,6 @@ __bpf_kfunc struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp) */ __bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp) { - if (!cgrp) - return; - cgroup_put(cgrp); } @@ -2200,7 +2103,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid) rcu_read_lock(); p = find_task_by_pid_ns(pid, &init_pid_ns); if (p) - bpf_task_acquire(p); + p = bpf_task_acquire(p); rcu_read_unlock(); return p; @@ -2372,17 +2275,14 @@ BTF_ID_FLAGS(func, bpf_list_push_front) BTF_ID_FLAGS(func, bpf_list_push_back) BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE) BTF_ID_FLAGS(func, bpf_rbtree_add) BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) #ifdef CONFIG_CGROUPS -BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) @@ -2411,6 +2311,9 @@ BTF_ID_FLAGS(func, bpf_rcu_read_lock) BTF_ID_FLAGS(func, bpf_rcu_read_unlock) BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL) BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW) +BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY) BTF_SET8_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index a993560f200a..4c7bbec4a9e4 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -141,8 +141,8 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key) return &READ_ONCE(storage->buf)->data[0]; } -static int cgroup_storage_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) +static long cgroup_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) { struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; @@ -348,7 +348,7 @@ static void cgroup_storage_map_free(struct bpf_map *_map) bpf_map_area_free(map); } -static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) +static long cgroup_storage_delete_elem(struct bpf_map *map, void *key) { return -EINVAL; } diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c new file mode 100644 index 000000000000..046ddff37a76 --- /dev/null +++ b/kernel/bpf/log.c @@ -0,0 +1,330 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io + */ +#include <uapi/linux/btf.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/bpf.h> +#include <linux/bpf_verifier.h> +#include <linux/math64.h> + +static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) +{ + /* ubuf and len_total should both be specified (or not) together */ + if (!!log->ubuf != !!log->len_total) + return false; + /* log buf without log_level is meaningless */ + if (log->ubuf && log->level == 0) + return false; + if (log->level & ~BPF_LOG_MASK) + return false; + if (log->len_total > UINT_MAX >> 2) + return false; + return true; +} + +int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, + char __user *log_buf, u32 log_size) +{ + log->level = log_level; + log->ubuf = log_buf; + log->len_total = log_size; + + /* log attributes have to be sane */ + if (!bpf_verifier_log_attr_valid(log)) + return -EINVAL; + + return 0; +} + +static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len) +{ + /* add_len includes terminal \0, so no need for +1. */ + u64 len = log->end_pos + add_len; + + /* log->len_max could be larger than our current len due to + * bpf_vlog_reset() calls, so we maintain the max of any length at any + * previous point + */ + if (len > UINT_MAX) + log->len_max = UINT_MAX; + else if (len > log->len_max) + log->len_max = len; +} + +void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, + va_list args) +{ + u64 cur_pos; + u32 new_n, n; + + n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); + + WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, + "verifier log line truncated - local buffer too short\n"); + + if (log->level == BPF_LOG_KERNEL) { + bool newline = n > 0 && log->kbuf[n - 1] == '\n'; + + pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n"); + return; + } + + n += 1; /* include terminating zero */ + bpf_vlog_update_len_max(log, n); + + if (log->level & BPF_LOG_FIXED) { + /* check if we have at least something to put into user buf */ + new_n = 0; + if (log->end_pos < log->len_total) { + new_n = min_t(u32, log->len_total - log->end_pos, n); + log->kbuf[new_n - 1] = '\0'; + } + + cur_pos = log->end_pos; + log->end_pos += n - 1; /* don't count terminating '\0' */ + + if (log->ubuf && new_n && + copy_to_user(log->ubuf + cur_pos, log->kbuf, new_n)) + goto fail; + } else { + u64 new_end, new_start; + u32 buf_start, buf_end, new_n; + + new_end = log->end_pos + n; + if (new_end - log->start_pos >= log->len_total) + new_start = new_end - log->len_total; + else + new_start = log->start_pos; + + log->start_pos = new_start; + log->end_pos = new_end - 1; /* don't count terminating '\0' */ + + if (!log->ubuf) + return; + + new_n = min(n, log->len_total); + cur_pos = new_end - new_n; + div_u64_rem(cur_pos, log->len_total, &buf_start); + div_u64_rem(new_end, log->len_total, &buf_end); + /* new_end and buf_end are exclusive indices, so if buf_end is + * exactly zero, then it actually points right to the end of + * ubuf and there is no wrap around + */ + if (buf_end == 0) + buf_end = log->len_total; + + /* if buf_start > buf_end, we wrapped around; + * if buf_start == buf_end, then we fill ubuf completely; we + * can't have buf_start == buf_end to mean that there is + * nothing to write, because we always write at least + * something, even if terminal '\0' + */ + if (buf_start < buf_end) { + /* message fits within contiguous chunk of ubuf */ + if (copy_to_user(log->ubuf + buf_start, + log->kbuf + n - new_n, + buf_end - buf_start)) + goto fail; + } else { + /* message wraps around the end of ubuf, copy in two chunks */ + if (copy_to_user(log->ubuf + buf_start, + log->kbuf + n - new_n, + log->len_total - buf_start)) + goto fail; + if (copy_to_user(log->ubuf, + log->kbuf + n - buf_end, + buf_end)) + goto fail; + } + } + + return; +fail: + log->ubuf = NULL; +} + +void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos) +{ + char zero = 0; + u32 pos; + + if (WARN_ON_ONCE(new_pos > log->end_pos)) + return; + + if (!bpf_verifier_log_needed(log) || log->level == BPF_LOG_KERNEL) + return; + + /* if position to which we reset is beyond current log window, + * then we didn't preserve any useful content and should adjust + * start_pos to end up with an empty log (start_pos == end_pos) + */ + log->end_pos = new_pos; + if (log->end_pos < log->start_pos) + log->start_pos = log->end_pos; + + if (!log->ubuf) + return; + + if (log->level & BPF_LOG_FIXED) + pos = log->end_pos + 1; + else + div_u64_rem(new_pos, log->len_total, &pos); + + if (pos < log->len_total && put_user(zero, log->ubuf + pos)) + log->ubuf = NULL; +} + +static void bpf_vlog_reverse_kbuf(char *buf, int len) +{ + int i, j; + + for (i = 0, j = len - 1; i < j; i++, j--) + swap(buf[i], buf[j]); +} + +static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int end) +{ + /* we split log->kbuf into two equal parts for both ends of array */ + int n = sizeof(log->kbuf) / 2, nn; + char *lbuf = log->kbuf, *rbuf = log->kbuf + n; + + /* Read ubuf's section [start, end) two chunks at a time, from left + * and right side; within each chunk, swap all the bytes; after that + * reverse the order of lbuf and rbuf and write result back to ubuf. + * This way we'll end up with swapped contents of specified + * [start, end) ubuf segment. + */ + while (end - start > 1) { + nn = min(n, (end - start ) / 2); + + if (copy_from_user(lbuf, log->ubuf + start, nn)) + return -EFAULT; + if (copy_from_user(rbuf, log->ubuf + end - nn, nn)) + return -EFAULT; + + bpf_vlog_reverse_kbuf(lbuf, nn); + bpf_vlog_reverse_kbuf(rbuf, nn); + + /* we write lbuf to the right end of ubuf, while rbuf to the + * left one to end up with properly reversed overall ubuf + */ + if (copy_to_user(log->ubuf + start, rbuf, nn)) + return -EFAULT; + if (copy_to_user(log->ubuf + end - nn, lbuf, nn)) + return -EFAULT; + + start += nn; + end -= nn; + } + + return 0; +} + +int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual) +{ + u32 sublen; + int err; + + *log_size_actual = 0; + if (!log || log->level == 0 || log->level == BPF_LOG_KERNEL) + return 0; + + if (!log->ubuf) + goto skip_log_rotate; + /* If we never truncated log, there is nothing to move around. */ + if (log->start_pos == 0) + goto skip_log_rotate; + + /* Otherwise we need to rotate log contents to make it start from the + * buffer beginning and be a continuous zero-terminated string. Note + * that if log->start_pos != 0 then we definitely filled up entire log + * buffer with no gaps, and we just need to shift buffer contents to + * the left by (log->start_pos % log->len_total) bytes. + * + * Unfortunately, user buffer could be huge and we don't want to + * allocate temporary kernel memory of the same size just to shift + * contents in a straightforward fashion. Instead, we'll be clever and + * do in-place array rotation. This is a leetcode-style problem, which + * could be solved by three rotations. + * + * Let's say we have log buffer that has to be shifted left by 7 bytes + * (spaces and vertical bar is just for demonstrative purposes): + * E F G H I J K | A B C D + * + * First, we reverse entire array: + * D C B A | K J I H G F E + * + * Then we rotate first 4 bytes (DCBA) and separately last 7 bytes + * (KJIHGFE), resulting in a properly rotated array: + * A B C D | E F G H I J K + * + * We'll utilize log->kbuf to read user memory chunk by chunk, swap + * bytes, and write them back. Doing it byte-by-byte would be + * unnecessarily inefficient. Altogether we are going to read and + * write each byte twice, for total 4 memory copies between kernel and + * user space. + */ + + /* length of the chopped off part that will be the beginning; + * len(ABCD) in the example above + */ + div_u64_rem(log->start_pos, log->len_total, &sublen); + sublen = log->len_total - sublen; + + err = bpf_vlog_reverse_ubuf(log, 0, log->len_total); + err = err ?: bpf_vlog_reverse_ubuf(log, 0, sublen); + err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total); + if (err) + log->ubuf = NULL; + +skip_log_rotate: + *log_size_actual = log->len_max; + + /* properly initialized log has either both ubuf!=NULL and len_total>0 + * or ubuf==NULL and len_total==0, so if this condition doesn't hold, + * we got a fault somewhere along the way, so report it back + */ + if (!!log->ubuf != !!log->len_total) + return -EFAULT; + + /* did truncation actually happen? */ + if (log->ubuf && log->len_max > log->len_total) + return -ENOSPC; + + return 0; +} + +/* log_level controls verbosity level of eBPF verifier. + * bpf_verifier_log_write() is used to dump the verification trace to the log, + * so the user can figure out what's wrong with the program + */ +__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, + const char *fmt, ...) +{ + va_list args; + + if (!bpf_verifier_log_needed(&env->log)) + return; + + va_start(args, fmt); + bpf_verifier_vlog(&env->log, fmt, args); + va_end(args); +} +EXPORT_SYMBOL_GPL(bpf_verifier_log_write); + +__printf(2, 3) void bpf_log(struct bpf_verifier_log *log, + const char *fmt, ...) +{ + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); +} +EXPORT_SYMBOL_GPL(bpf_log); diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index dc23f2ac9cde..e0d3ddf2037a 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -300,8 +300,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, } /* Called from syscall or from eBPF program */ -static int trie_update_elem(struct bpf_map *map, - void *_key, void *value, u64 flags) +static long trie_update_elem(struct bpf_map *map, + void *_key, void *value, u64 flags) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; @@ -431,7 +431,7 @@ out: } /* Called from syscall or from eBPF program */ -static int trie_delete_elem(struct bpf_map *map, void *_key) +static long trie_delete_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct bpf_lpm_trie_key *key = _key; diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 5fcdacbb8439..410637c225fb 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -121,15 +121,8 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head) return entry; } -static void *__alloc(struct bpf_mem_cache *c, int node) +static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags) { - /* Allocate, but don't deplete atomic reserves that typical - * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc - * will allocate from the current numa node which is what we - * want here. - */ - gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT; - if (c->percpu_size) { void **obj = kmalloc_node(c->percpu_size, flags, node); void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); @@ -185,7 +178,12 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) */ obj = __llist_del_first(&c->free_by_rcu); if (!obj) { - obj = __alloc(c, node); + /* Allocate, but don't deplete atomic reserves that typical + * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc + * will allocate from the current numa node which is what we + * want here. + */ + obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT); if (!obj) break; } @@ -676,3 +674,46 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) unit_free(this_cpu_ptr(ma->cache), ptr); } + +/* Directly does a kfree() without putting 'ptr' back to the free_llist + * for reuse and without waiting for a rcu_tasks_trace gp. + * The caller must first go through the rcu_tasks_trace gp for 'ptr' + * before calling bpf_mem_cache_raw_free(). + * It could be used when the rcu_tasks_trace callback does not have + * a hold on the original bpf_mem_alloc object that allocated the + * 'ptr'. This should only be used in the uncommon code path. + * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled + * and may affect performance. + */ +void bpf_mem_cache_raw_free(void *ptr) +{ + if (!ptr) + return; + + kfree(ptr - LLIST_NODE_SZ); +} + +/* When flags == GFP_KERNEL, it signals that the caller will not cause + * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use + * kmalloc if the free_llist is empty. + */ +void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) +{ + struct bpf_mem_cache *c; + void *ret; + + c = this_cpu_ptr(ma->cache); + + ret = unit_alloc(c); + if (!ret && flags == GFP_KERNEL) { + struct mem_cgroup *memcg, *old_memcg; + + memcg = get_memcg(c); + old_memcg = set_active_memcg(memcg); + ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT); + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); + } + + return !ret ? NULL : ret + LLIST_NODE_SZ; +} diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 63ecbbcb349d..601609164ef3 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -95,7 +95,7 @@ static void queue_stack_map_free(struct bpf_map *map) bpf_map_area_free(qs); } -static int __queue_map_get(struct bpf_map *map, void *value, bool delete) +static long __queue_map_get(struct bpf_map *map, void *value, bool delete) { struct bpf_queue_stack *qs = bpf_queue_stack(map); unsigned long flags; @@ -124,7 +124,7 @@ out: } -static int __stack_map_get(struct bpf_map *map, void *value, bool delete) +static long __stack_map_get(struct bpf_map *map, void *value, bool delete) { struct bpf_queue_stack *qs = bpf_queue_stack(map); unsigned long flags; @@ -156,32 +156,32 @@ out: } /* Called from syscall or from eBPF program */ -static int queue_map_peek_elem(struct bpf_map *map, void *value) +static long queue_map_peek_elem(struct bpf_map *map, void *value) { return __queue_map_get(map, value, false); } /* Called from syscall or from eBPF program */ -static int stack_map_peek_elem(struct bpf_map *map, void *value) +static long stack_map_peek_elem(struct bpf_map *map, void *value) { return __stack_map_get(map, value, false); } /* Called from syscall or from eBPF program */ -static int queue_map_pop_elem(struct bpf_map *map, void *value) +static long queue_map_pop_elem(struct bpf_map *map, void *value) { return __queue_map_get(map, value, true); } /* Called from syscall or from eBPF program */ -static int stack_map_pop_elem(struct bpf_map *map, void *value) +static long stack_map_pop_elem(struct bpf_map *map, void *value) { return __stack_map_get(map, value, true); } /* Called from syscall or from eBPF program */ -static int queue_stack_map_push_elem(struct bpf_map *map, void *value, - u64 flags) +static long queue_stack_map_push_elem(struct bpf_map *map, void *value, + u64 flags) { struct bpf_queue_stack *qs = bpf_queue_stack(map); unsigned long irq_flags; @@ -227,14 +227,14 @@ static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key) } /* Called from syscall or from eBPF program */ -static int queue_stack_map_update_elem(struct bpf_map *map, void *key, - void *value, u64 flags) +static long queue_stack_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) { return -EINVAL; } /* Called from syscall or from eBPF program */ -static int queue_stack_map_delete_elem(struct bpf_map *map, void *key) +static long queue_stack_map_delete_elem(struct bpf_map *map, void *key) { return -EINVAL; } diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 71cb72f5b733..cbf2d8d784b8 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -59,7 +59,7 @@ static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key) } /* Called from syscall only */ -static int reuseport_array_delete_elem(struct bpf_map *map, void *key) +static long reuseport_array_delete_elem(struct bpf_map *map, void *key) { struct reuseport_array *array = reuseport_array(map); u32 index = *(u32 *)key; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 0d2a45ff83f1..875ac9b698d9 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -242,13 +242,13 @@ static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) return ERR_PTR(-ENOTSUPP); } -static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 flags) +static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 flags) { return -ENOTSUPP; } -static int ringbuf_map_delete_elem(struct bpf_map *map, void *key) +static long ringbuf_map_delete_elem(struct bpf_map *map, void *key) { return -ENOTSUPP; } diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 0f1d8dced933..b25fce425b2c 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -618,14 +618,14 @@ static int stack_map_get_next_key(struct bpf_map *map, void *key, return 0; } -static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static long stack_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { return -EINVAL; } /* Called from syscall or from eBPF program */ -static int stack_map_delete_elem(struct bpf_map *map, void *key) +static long stack_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *old_bucket; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f406dfa13792..6d575505f89c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -520,14 +520,14 @@ static int btf_field_cmp(const void *a, const void *b) } struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, - enum btf_field_type type) + u32 field_mask) { struct btf_field *field; - if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type)) + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask)) return NULL; field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp); - if (!field || !(field->type & type)) + if (!field || !(field->type & field_mask)) return NULL; return field; } @@ -650,6 +650,8 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj) bpf_timer_cancel_and_free(obj + rec->timer_off); } +extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec); + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -659,8 +661,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) return; fields = rec->fields; for (i = 0; i < rec->cnt; i++) { + struct btf_struct_meta *pointee_struct_meta; const struct btf_field *field = &fields[i]; void *field_ptr = obj + field->offset; + void *xchgd_field; switch (fields[i].type) { case BPF_SPIN_LOCK: @@ -672,7 +676,22 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) WRITE_ONCE(*(u64 *)field_ptr, 0); break; case BPF_KPTR_REF: - field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0)); + xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0); + if (!xchgd_field) + break; + + if (!btf_is_kernel(field->kptr.btf)) { + pointee_struct_meta = btf_find_struct_meta(field->kptr.btf, + field->kptr.btf_id); + WARN_ON_ONCE(!pointee_struct_meta); + migrate_disable(); + __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ? + pointee_struct_meta->record : + NULL); + migrate_enable(); + } else { + field->kptr.dtor(xchgd_field); + } break; case BPF_LIST_HEAD: if (WARN_ON_ONCE(rec->spin_lock_off < 0)) @@ -1287,8 +1306,10 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd) return map; } -/* map_idr_lock should have been held */ -static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) +/* map_idr_lock should have been held or the map should have been + * protected by rcu read lock. + */ +struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) { int refold; @@ -2051,6 +2072,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) { bpf_prog_kallsyms_del_all(prog); btf_put(prog->aux->btf); + module_put(prog->aux->mod); kvfree(prog->aux->jited_linfo); kvfree(prog->aux->linfo); kfree(prog->aux->kfunc_tab); @@ -2479,9 +2501,9 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size +#define BPF_PROG_LOAD_LAST_FIELD log_true_size -static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) +static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog, *dst_prog = NULL; @@ -2631,7 +2653,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) goto free_prog_sec; /* run eBPF verifier */ - err = bpf_check(&prog, attr, uattr); + err = bpf_check(&prog, attr, uattr, uattr_size); if (err < 0) goto free_used_maps; @@ -2806,16 +2828,19 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) const struct bpf_prog *prog = link->prog; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "link_type:\t%s\n" - "link_id:\t%u\n" - "prog_tag:\t%s\n" - "prog_id:\t%u\n", + "link_id:\t%u\n", bpf_link_type_strs[link->type], - link->id, - prog_tag, - prog->aux->id); + link->id); + if (prog) { + bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); + seq_printf(m, + "prog_tag:\t%s\n" + "prog_id:\t%u\n", + prog_tag, + prog->aux->id); + } if (link->ops->show_fdinfo) link->ops->show_fdinfo(link, m); } @@ -3097,6 +3122,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, if (err) goto out_unlock; + if (tgt_info.tgt_mod) { + module_put(prog->aux->mod); + prog->aux->mod = tgt_info.tgt_mod; + } + tr = bpf_trampoline_get(key, &tgt_info); if (!tr) { err = -ENOMEM; @@ -4290,7 +4320,8 @@ static int bpf_link_get_info_by_fd(struct file *file, info.type = link->type; info.id = link->id; - info.prog_id = link->prog->aux->id; + if (link->prog) + info.prog_id = link->prog->aux->id; if (link->ops->fill_link_info) { err = link->ops->fill_link_info(link, &info); @@ -4340,9 +4371,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return err; } -#define BPF_BTF_LOAD_LAST_FIELD btf_log_level +#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size -static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr) +static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; @@ -4350,7 +4381,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr) if (!bpf_capable()) return -EPERM; - return btf_new_fd(attr, uattr); + return btf_new_fd(attr, uattr, uattr_size); } #define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id @@ -4553,6 +4584,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; + if (attr->link_create.attach_type == BPF_STRUCT_OPS) + return bpf_struct_ops_link_create(attr); + prog = bpf_prog_get(attr->link_create.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); @@ -4651,6 +4685,35 @@ out: return ret; } +static int link_update_map(struct bpf_link *link, union bpf_attr *attr) +{ + struct bpf_map *new_map, *old_map = NULL; + int ret; + + new_map = bpf_map_get(attr->link_update.new_map_fd); + if (IS_ERR(new_map)) + return PTR_ERR(new_map); + + if (attr->link_update.flags & BPF_F_REPLACE) { + old_map = bpf_map_get(attr->link_update.old_map_fd); + if (IS_ERR(old_map)) { + ret = PTR_ERR(old_map); + goto out_put; + } + } else if (attr->link_update.old_map_fd) { + ret = -EINVAL; + goto out_put; + } + + ret = link->ops->update_map(link, new_map, old_map); + + if (old_map) + bpf_map_put(old_map); +out_put: + bpf_map_put(new_map); + return ret; +} + #define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd static int link_update(union bpf_attr *attr) @@ -4671,6 +4734,11 @@ static int link_update(union bpf_attr *attr) if (IS_ERR(link)) return PTR_ERR(link); + if (link->ops->update_map) { + ret = link_update_map(link, attr); + goto out_put_link; + } + new_prog = bpf_prog_get(attr->link_update.new_prog_fd); if (IS_ERR(new_prog)) { ret = PTR_ERR(new_prog); @@ -4991,7 +5059,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) err = map_freeze(&attr); break; case BPF_PROG_LOAD: - err = bpf_prog_load(&attr, uattr); + err = bpf_prog_load(&attr, uattr, size); break; case BPF_OBJ_PIN: err = bpf_obj_pin(&attr); @@ -5036,7 +5104,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) err = bpf_raw_tracepoint_open(&attr); break; case BPF_BTF_LOAD: - err = bpf_btf_load(&attr, uattr); + err = bpf_btf_load(&attr, uattr, size); break; case BPF_BTF_GET_FD_BY_ID: err = bpf_btf_get_fd_by_id(&attr); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index d0ed7d6f5eec..f61d5138b12b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -9,7 +9,6 @@ #include <linux/btf.h> #include <linux/rcupdate_trace.h> #include <linux/rcupdate_wait.h> -#include <linux/module.h> #include <linux/static_call.h> #include <linux/bpf_verifier.h> #include <linux/bpf_lsm.h> @@ -172,26 +171,6 @@ out: return tr; } -static int bpf_trampoline_module_get(struct bpf_trampoline *tr) -{ - struct module *mod; - int err = 0; - - preempt_disable(); - mod = __module_text_address((unsigned long) tr->func.addr); - if (mod && !try_module_get(mod)) - err = -ENOENT; - preempt_enable(); - tr->mod = mod; - return err; -} - -static void bpf_trampoline_module_put(struct bpf_trampoline *tr) -{ - module_put(tr->mod); - tr->mod = NULL; -} - static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) { void *ip = tr->func.addr; @@ -202,8 +181,6 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) else ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); - if (!ret) - bpf_trampoline_module_put(tr); return ret; } @@ -238,9 +215,6 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) tr->func.ftrace_managed = true; } - if (bpf_trampoline_module_get(tr)) - return -ENOENT; - if (tr->func.ftrace_managed) { ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); ret = register_ftrace_direct_multi(tr->fops, (long)new_addr); @@ -248,8 +222,6 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); } - if (ret) - bpf_trampoline_module_put(tr); return ret; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b2116ca78d9a..d6db6de3e9ea 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -24,6 +24,7 @@ #include <linux/bpf_lsm.h> #include <linux/btf_ids.h> #include <linux/poison.h> +#include <linux/module.h> #include "disasm.h" @@ -302,6 +303,10 @@ struct bpf_kfunc_call_arg_meta { enum bpf_dynptr_type type; u32 id; } initialized_dynptr; + struct { + u8 spi; + u8 frameno; + } iter; u64 mem_size; }; @@ -330,61 +335,6 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off) return &linfo[i - 1]; } -void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, - va_list args) -{ - unsigned int n; - - n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); - - WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, - "verifier log line truncated - local buffer too short\n"); - - if (log->level == BPF_LOG_KERNEL) { - bool newline = n > 0 && log->kbuf[n - 1] == '\n'; - - pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n"); - return; - } - - n = min(log->len_total - log->len_used - 1, n); - log->kbuf[n] = '\0'; - if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) - log->len_used += n; - else - log->ubuf = NULL; -} - -static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos) -{ - char zero = 0; - - if (!bpf_verifier_log_needed(log)) - return; - - log->len_used = new_pos; - if (put_user(zero, log->ubuf + new_pos)) - log->ubuf = NULL; -} - -/* log_level controls verbosity level of eBPF verifier. - * bpf_verifier_log_write() is used to dump the verification trace to the log, - * so the user can figure out what's wrong with the program - */ -__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, - const char *fmt, ...) -{ - va_list args; - - if (!bpf_verifier_log_needed(&env->log)) - return; - - va_start(args, fmt); - bpf_verifier_vlog(&env->log, fmt, args); - va_end(args); -} -EXPORT_SYMBOL_GPL(bpf_verifier_log_write); - __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) { struct bpf_verifier_env *env = private_data; @@ -398,20 +348,6 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) va_end(args); } -__printf(2, 3) void bpf_log(struct bpf_verifier_log *log, - const char *fmt, ...) -{ - va_list args; - - if (!bpf_verifier_log_needed(log)) - return; - - va_start(args, fmt); - bpf_verifier_vlog(log, fmt, args); - va_end(args); -} -EXPORT_SYMBOL_GPL(bpf_log); - static const char *ltrim(const char *s) { while (isspace(*s)) @@ -481,8 +417,17 @@ static bool type_is_sk_pointer(enum bpf_reg_type type) type == PTR_TO_XDP_SOCK; } +static bool type_may_be_null(u32 type) +{ + return type & PTR_MAYBE_NULL; +} + static bool reg_type_not_null(enum bpf_reg_type type) { + if (type_may_be_null(type)) + return false; + + type = base_type(type); return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || type == PTR_TO_MAP_VALUE || @@ -526,11 +471,6 @@ static bool type_is_rdonly_mem(u32 type) return type & MEM_RDONLY; } -static bool type_may_be_null(u32 type) -{ - return type & PTR_MAYBE_NULL; -} - static bool is_acquire_function(enum bpf_func_id func_id, const struct bpf_map *map) { @@ -668,6 +608,7 @@ static char slot_type_char[] = { [STACK_MISC] = 'm', [STACK_ZERO] = '0', [STACK_DYNPTR] = 'd', + [STACK_ITER] = 'i', }; static void print_liveness(struct bpf_verifier_env *env, @@ -742,7 +683,12 @@ static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *re return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS); } -static const char *kernel_type_name(const struct btf* btf, u32 id) +static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots) +{ + return stack_slot_obj_get_spi(env, reg, "iter", nr_slots); +} + +static const char *btf_type_name(const struct btf *btf, u32 id) { return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); } @@ -766,6 +712,30 @@ static const char *dynptr_type_str(enum bpf_dynptr_type type) } } +static const char *iter_type_str(const struct btf *btf, u32 btf_id) +{ + if (!btf || btf_id == 0) + return "<invalid>"; + + /* we already validated that type is valid and has conforming name */ + return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1; +} + +static const char *iter_state_str(enum bpf_iter_state state) +{ + switch (state) { + case BPF_ITER_STATE_ACTIVE: + return "active"; + case BPF_ITER_STATE_DRAINED: + return "drained"; + case BPF_ITER_STATE_INVALID: + return "<invalid>"; + default: + WARN_ONCE(1, "unknown iter state %d\n", state); + return "<unknown>"; + } +} + static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) { env->scratched_regs |= 1U << regno; @@ -1118,6 +1088,157 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg } } +static void __mark_reg_known_zero(struct bpf_reg_state *reg); + +static int mark_stack_slots_iter(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, int insn_idx, + struct btf *btf, u32 btf_id, int nr_slots) +{ + struct bpf_func_state *state = func(env, reg); + int spi, i, j, id; + + spi = iter_get_spi(env, reg, nr_slots); + if (spi < 0) + return spi; + + id = acquire_reference_state(env, insn_idx); + if (id < 0) + return id; + + for (i = 0; i < nr_slots; i++) { + struct bpf_stack_state *slot = &state->stack[spi - i]; + struct bpf_reg_state *st = &slot->spilled_ptr; + + __mark_reg_known_zero(st); + st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ + st->live |= REG_LIVE_WRITTEN; + st->ref_obj_id = i == 0 ? id : 0; + st->iter.btf = btf; + st->iter.btf_id = btf_id; + st->iter.state = BPF_ITER_STATE_ACTIVE; + st->iter.depth = 0; + + for (j = 0; j < BPF_REG_SIZE; j++) + slot->slot_type[j] = STACK_ITER; + + mark_stack_slot_scratched(env, spi - i); + } + + return 0; +} + +static int unmark_stack_slots_iter(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, int nr_slots) +{ + struct bpf_func_state *state = func(env, reg); + int spi, i, j; + + spi = iter_get_spi(env, reg, nr_slots); + if (spi < 0) + return spi; + + for (i = 0; i < nr_slots; i++) { + struct bpf_stack_state *slot = &state->stack[spi - i]; + struct bpf_reg_state *st = &slot->spilled_ptr; + + if (i == 0) + WARN_ON_ONCE(release_reference(env, st->ref_obj_id)); + + __mark_reg_not_init(env, st); + + /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */ + st->live |= REG_LIVE_WRITTEN; + + for (j = 0; j < BPF_REG_SIZE; j++) + slot->slot_type[j] = STACK_INVALID; + + mark_stack_slot_scratched(env, spi - i); + } + + return 0; +} + +static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, int nr_slots) +{ + struct bpf_func_state *state = func(env, reg); + int spi, i, j; + + /* For -ERANGE (i.e. spi not falling into allocated stack slots), we + * will do check_mem_access to check and update stack bounds later, so + * return true for that case. + */ + spi = iter_get_spi(env, reg, nr_slots); + if (spi == -ERANGE) + return true; + if (spi < 0) + return false; + + for (i = 0; i < nr_slots; i++) { + struct bpf_stack_state *slot = &state->stack[spi - i]; + + for (j = 0; j < BPF_REG_SIZE; j++) + if (slot->slot_type[j] == STACK_ITER) + return false; + } + + return true; +} + +static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + struct btf *btf, u32 btf_id, int nr_slots) +{ + struct bpf_func_state *state = func(env, reg); + int spi, i, j; + + spi = iter_get_spi(env, reg, nr_slots); + if (spi < 0) + return false; + + for (i = 0; i < nr_slots; i++) { + struct bpf_stack_state *slot = &state->stack[spi - i]; + struct bpf_reg_state *st = &slot->spilled_ptr; + + /* only main (first) slot has ref_obj_id set */ + if (i == 0 && !st->ref_obj_id) + return false; + if (i != 0 && st->ref_obj_id) + return false; + if (st->iter.btf != btf || st->iter.btf_id != btf_id) + return false; + + for (j = 0; j < BPF_REG_SIZE; j++) + if (slot->slot_type[j] != STACK_ITER) + return false; + } + + return true; +} + +/* Check if given stack slot is "special": + * - spilled register state (STACK_SPILL); + * - dynptr state (STACK_DYNPTR); + * - iter state (STACK_ITER). + */ +static bool is_stack_slot_special(const struct bpf_stack_state *stack) +{ + enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1]; + + switch (type) { + case STACK_SPILL: + case STACK_DYNPTR: + case STACK_ITER: + return true; + case STACK_INVALID: + case STACK_MISC: + case STACK_ZERO: + return false; + default: + WARN_ONCE(1, "unknown stack slot type %d\n", type); + return true; + } +} + /* The reg state of a pointer or a bounded scalar was saved when * it was spilled to the stack. */ @@ -1164,7 +1285,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, "%s", reg_type_str(env, t)); if (base_type(t) == PTR_TO_BTF_ID) - verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id)); + verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id)); verbose(env, "("); /* * _a stands for append, was shortened to avoid multiline statements below. @@ -1267,6 +1388,19 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (reg->ref_obj_id) verbose(env, "(ref_id=%d)", reg->ref_obj_id); break; + case STACK_ITER: + /* only main slot has ref_obj_id set; skip others */ + reg = &state->stack[i].spilled_ptr; + if (!reg->ref_obj_id) + continue; + + verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); + print_liveness(env, reg->live); + verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)", + iter_type_str(reg->iter.btf, reg->iter.btf_id), + reg->ref_obj_id, iter_state_str(reg->iter.state), + reg->iter.depth); + break; case STACK_MISC: case STACK_ZERO: default: @@ -1305,10 +1439,10 @@ static inline u32 vlog_alignment(u32 pos) static void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state) { - if (env->prev_log_len && env->prev_log_len == env->log.len_used) { + if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) { /* remove new line character */ - bpf_vlog_reset(&env->log, env->prev_log_len - 1); - verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_len), ' '); + bpf_vlog_reset(&env->log, env->prev_log_pos - 1); + verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' '); } else { verbose(env, "%d:", env->insn_idx); } @@ -1616,7 +1750,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; - elem->log_pos = env->log.len_used; + elem->log_pos = env->log.end_pos; env->head = elem; env->stack_size++; err = copy_verifier_state(&elem->st, cur); @@ -1946,9 +2080,9 @@ static void __reg_bound_offset(struct bpf_reg_state *reg) struct tnum var64_off = tnum_intersect(reg->var_off, tnum_range(reg->umin_value, reg->umax_value)); - struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off), - tnum_range(reg->u32_min_value, - reg->u32_max_value)); + struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off), + tnum_range(reg->u32_min_value, + reg->u32_max_value)); reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off); } @@ -2152,7 +2286,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; - elem->log_pos = env->log.len_used; + elem->log_pos = env->log.end_pos; env->head = elem; env->stack_size++; if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { @@ -2710,6 +2844,25 @@ static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state * state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64); } +static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + int spi, int nr_slots) +{ + struct bpf_func_state *state = func(env, reg); + int err, i; + + for (i = 0; i < nr_slots; i++) { + struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr; + + err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64); + if (err) + return err; + + mark_stack_slot_scratched(env, spi - i); + } + + return 0; +} + /* This function is supposed to be used by the following 32-bit optimization * code only. It returns TRUE if the source or destination register operates * on 64-bit, otherwise return FALSE. @@ -3691,8 +3844,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, /* regular write of data into stack destroys any spilled ptr */ state->stack[spi].spilled_ptr.type = NOT_INIT; - /* Mark slots as STACK_MISC if they belonged to spilled ptr. */ - if (is_spilled_reg(&state->stack[spi])) + /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */ + if (is_stack_slot_special(&state->stack[spi])) for (i = 0; i < BPF_REG_SIZE; i++) scrub_spilled_slot(&state->stack[spi].slot_type[i]); @@ -4085,17 +4238,13 @@ static int check_stack_read(struct bpf_verifier_env *env, } /* Variable offset is prohibited for unprivileged mode for simplicity * since it requires corresponding support in Spectre masking for stack - * ALU. See also retrieve_ptr_limit(). + * ALU. See also retrieve_ptr_limit(). The check in + * check_stack_access_for_ptr_arithmetic() called by + * adjust_ptr_min_max_vals() prevents users from creating stack pointers + * with variable offsets, therefore no check is required here. Further, + * just checking it here would be insufficient as speculative stack + * writes could still lead to unsafe speculative behaviour. */ - if (!env->bypass_spec_v1 && var_off) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n", - ptr_regno, tn_buf); - return -EACCES; - } - if (!var_off) { off += reg->var_off.value; err = check_stack_read_fixed_off(env, state, off, size, @@ -4301,7 +4450,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, struct btf_field *kptr_field, struct bpf_reg_state *reg, u32 regno) { - const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); + const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU; const char *reg_name = ""; @@ -4317,7 +4466,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, return -EINVAL; } /* We need to verify reg->type and reg->btf, before accessing reg->btf */ - reg_name = kernel_type_name(reg->btf, reg->btf_id); + reg_name = btf_type_name(reg->btf, reg->btf_id); /* For ref_ptr case, release function check should ensure we get one * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the @@ -4381,6 +4530,8 @@ static bool in_rcu_cs(struct bpf_verifier_env *env) BTF_SET_START(rcu_protected_types) BTF_ID(struct, prog_test_ref_kfunc) BTF_ID(struct, cgroup) +BTF_ID(struct, bpf_cpumask) +BTF_ID(struct, task_struct) BTF_SET_END(rcu_protected_types) static bool rcu_protected_object(const struct btf *btf, u32 btf_id) @@ -4754,6 +4905,11 @@ static bool is_rcu_reg(const struct bpf_reg_state *reg) return reg->type & MEM_RCU; } +static void clear_trusted_flags(enum bpf_type_flag *flag) +{ + *flag &= ~(BPF_REG_TRUSTED_MODIFIERS | MEM_RCU); +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -5158,6 +5314,7 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) } #define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu) +#define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null) #define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted) /* @@ -5174,18 +5331,39 @@ BTF_TYPE_SAFE_RCU(struct task_struct) { struct task_struct *group_leader; }; +BTF_TYPE_SAFE_RCU(struct cgroup) { + /* cgrp->kn is always accessible as documented in kernel/cgroup/cgroup.c */ + struct kernfs_node *kn; +}; + BTF_TYPE_SAFE_RCU(struct css_set) { struct cgroup *dfl_cgrp; }; +/* RCU trusted: these fields are trusted in RCU CS and can be NULL */ +BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) { + struct file __rcu *exe_file; +}; + +/* skb->sk, req->sk are not RCU protected, but we mark them as such + * because bpf prog accessible sockets are SOCK_RCU_FREE. + */ +BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff) { + struct sock *sk; +}; + +BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock) { + struct sock *sk; +}; + /* full trusted: these fields are trusted even outside of RCU CS and never NULL */ BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) { - __bpf_md_ptr(struct seq_file *, seq); + struct seq_file *seq; }; BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) { - __bpf_md_ptr(struct bpf_iter_meta *, meta); - __bpf_md_ptr(struct task_struct *, task); + struct bpf_iter_meta *meta; + struct task_struct *task; }; BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) { @@ -5207,17 +5385,29 @@ BTF_TYPE_SAFE_TRUSTED(struct socket) { static bool type_is_rcu(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int off) + const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set)); - return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_rcu"); + return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu"); +} + +static bool type_is_rcu_or_null(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + const char *field_name, u32 btf_id) +{ + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock)); + + return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu_or_null"); } static bool type_is_trusted(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - int off) + const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task)); @@ -5226,7 +5416,7 @@ static bool type_is_trusted(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket)); - return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_trusted"); + return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted"); } static int check_ptr_to_btf_access(struct bpf_verifier_env *env, @@ -5238,8 +5428,9 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg = regs + regno; const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); const char *tname = btf_name_by_offset(reg->btf, t->name_off); + const char *field_name = NULL; enum bpf_type_flag flag = 0; - u32 btf_id; + u32 btf_id = 0; int ret; if (!env->allow_ptr_leaks) { @@ -5284,12 +5475,12 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } - if (env->ops->btf_struct_access && !type_is_alloc(reg->type)) { + if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) { if (!btf_is_kernel(reg->btf)) { verbose(env, "verifier internal error: reg->btf must be kernel btf\n"); return -EFAULT; } - ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag); + ret = env->ops->btf_struct_access(&env->log, reg, off, size); } else { /* Writes are permitted with default btf_struct_access for * program allocated objects (which always have ref_obj_id > 0), @@ -5306,7 +5497,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EFAULT; } - ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag); + ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag, &field_name); } if (ret < 0) @@ -5334,20 +5525,21 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, * A regular RCU-protected pointer with __rcu tag can also be deemed * trusted if we are in an RCU CS. Such pointer can be NULL. */ - if (type_is_trusted(env, reg, off)) { + if (type_is_trusted(env, reg, field_name, btf_id)) { flag |= PTR_TRUSTED; } else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) { - if (type_is_rcu(env, reg, off)) { + if (type_is_rcu(env, reg, field_name, btf_id)) { /* ignore __rcu tag and mark it MEM_RCU */ flag |= MEM_RCU; - } else if (flag & MEM_RCU) { + } else if (flag & MEM_RCU || + type_is_rcu_or_null(env, reg, field_name, btf_id)) { /* __rcu tagged pointers can be NULL */ - flag |= PTR_MAYBE_NULL; + flag |= MEM_RCU | PTR_MAYBE_NULL; } else if (flag & (MEM_PERCPU | MEM_USER)) { /* keep as-is */ } else { - /* walking unknown pointers yields untrusted pointer */ - flag = PTR_UNTRUSTED; + /* walking unknown pointers yields old deprecated PTR_TO_BTF_ID */ + clear_trusted_flags(&flag); } } else { /* @@ -5361,7 +5553,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } } else { /* Old compat. Deprecated */ - flag &= ~PTR_TRUSTED; + clear_trusted_flags(&flag); } if (atype == BPF_READ && value_regno >= 0) @@ -5420,7 +5612,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, /* Simulate access to a PTR_TO_BTF_ID */ memset(&map_reg, 0, sizeof(map_reg)); mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0); - ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag); + ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL); if (ret < 0) return ret; @@ -6086,6 +6278,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, env, regno, reg->off, access_size, zero_size_allowed, ACCESS_HELPER, meta); + case PTR_TO_BTF_ID: + return check_ptr_to_btf_access(env, regs, regno, reg->off, + access_size, BPF_READ, -1); case PTR_TO_CTX: /* in case the function doesn't know how to access the context, * (because we are in a program of type SYSCALL for example), we @@ -6506,6 +6701,203 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn return err; } +static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi) +{ + struct bpf_func_state *state = func(env, reg); + + return state->stack[spi].spilled_ptr.ref_obj_id; +} + +static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY); +} + +static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_ITER_NEW; +} + +static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_ITER_NEXT; +} + +static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_ITER_DESTROY; +} + +static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg) +{ + /* btf_check_iter_kfuncs() guarantees that first argument of any iter + * kfunc is iter state pointer + */ + return arg == 0 && is_iter_kfunc(meta); +} + +static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + const struct btf_type *t; + const struct btf_param *arg; + int spi, err, i, nr_slots; + u32 btf_id; + + /* btf_check_iter_kfuncs() ensures we don't need to validate anything here */ + arg = &btf_params(meta->func_proto)[0]; + t = btf_type_skip_modifiers(meta->btf, arg->type, NULL); /* PTR */ + t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id); /* STRUCT */ + nr_slots = t->size / BPF_REG_SIZE; + + if (is_iter_new_kfunc(meta)) { + /* bpf_iter_<type>_new() expects pointer to uninit iter state */ + if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) { + verbose(env, "expected uninitialized iter_%s as arg #%d\n", + iter_type_str(meta->btf, btf_id), regno); + return -EINVAL; + } + + for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) { + err = check_mem_access(env, insn_idx, regno, + i, BPF_DW, BPF_WRITE, -1, false); + if (err) + return err; + } + + err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots); + if (err) + return err; + } else { + /* iter_next() or iter_destroy() expect initialized iter state*/ + if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) { + verbose(env, "expected an initialized iter_%s as arg #%d\n", + iter_type_str(meta->btf, btf_id), regno); + return -EINVAL; + } + + spi = iter_get_spi(env, reg, nr_slots); + if (spi < 0) + return spi; + + err = mark_iter_read(env, reg, spi, nr_slots); + if (err) + return err; + + /* remember meta->iter info for process_iter_next_call() */ + meta->iter.spi = spi; + meta->iter.frameno = reg->frameno; + meta->ref_obj_id = iter_ref_obj_id(env, reg, spi); + + if (is_iter_destroy_kfunc(meta)) { + err = unmark_stack_slots_iter(env, reg, nr_slots); + if (err) + return err; + } + } + + return 0; +} + +/* process_iter_next_call() is called when verifier gets to iterator's next + * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer + * to it as just "iter_next()" in comments below. + * + * BPF verifier relies on a crucial contract for any iter_next() + * implementation: it should *eventually* return NULL, and once that happens + * it should keep returning NULL. That is, once iterator exhausts elements to + * iterate, it should never reset or spuriously return new elements. + * + * With the assumption of such contract, process_iter_next_call() simulates + * a fork in the verifier state to validate loop logic correctness and safety + * without having to simulate infinite amount of iterations. + * + * In current state, we first assume that iter_next() returned NULL and + * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such + * conditions we should not form an infinite loop and should eventually reach + * exit. + * + * Besides that, we also fork current state and enqueue it for later + * verification. In a forked state we keep iterator state as ACTIVE + * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We + * also bump iteration depth to prevent erroneous infinite loop detection + * later on (see iter_active_depths_differ() comment for details). In this + * state we assume that we'll eventually loop back to another iter_next() + * calls (it could be in exactly same location or in some other instruction, + * it doesn't matter, we don't make any unnecessary assumptions about this, + * everything revolves around iterator state in a stack slot, not which + * instruction is calling iter_next()). When that happens, we either will come + * to iter_next() with equivalent state and can conclude that next iteration + * will proceed in exactly the same way as we just verified, so it's safe to + * assume that loop converges. If not, we'll go on another iteration + * simulation with a different input state, until all possible starting states + * are validated or we reach maximum number of instructions limit. + * + * This way, we will either exhaustively discover all possible input states + * that iterator loop can start with and eventually will converge, or we'll + * effectively regress into bounded loop simulation logic and either reach + * maximum number of instructions if loop is not provably convergent, or there + * is some statically known limit on number of iterations (e.g., if there is + * an explicit `if n > 100 then break;` statement somewhere in the loop). + * + * One very subtle but very important aspect is that we *always* simulate NULL + * condition first (as the current state) before we simulate non-NULL case. + * This has to do with intricacies of scalar precision tracking. By simulating + * "exit condition" of iter_next() returning NULL first, we make sure all the + * relevant precision marks *that will be set **after** we exit iterator loop* + * are propagated backwards to common parent state of NULL and non-NULL + * branches. Thanks to that, state equivalence checks done later in forked + * state, when reaching iter_next() for ACTIVE iterator, can assume that + * precision marks are finalized and won't change. Because simulating another + * ACTIVE iterator iteration won't change them (because given same input + * states we'll end up with exactly same output states which we are currently + * comparing; and verification after the loop already propagated back what + * needs to be **additionally** tracked as precise). It's subtle, grok + * precision tracking for more intuitive understanding. + */ +static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_verifier_state *cur_st = env->cur_state, *queued_st; + struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr; + struct bpf_reg_state *cur_iter, *queued_iter; + int iter_frameno = meta->iter.frameno; + int iter_spi = meta->iter.spi; + + BTF_TYPE_EMIT(struct bpf_iter); + + cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr; + + if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE && + cur_iter->iter.state != BPF_ITER_STATE_DRAINED) { + verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n", + cur_iter->iter.state, iter_state_str(cur_iter->iter.state)); + return -EFAULT; + } + + if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) { + /* branch out active iter state */ + queued_st = push_stack(env, insn_idx + 1, insn_idx, false); + if (!queued_st) + return -ENOMEM; + + queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr; + queued_iter->iter.state = BPF_ITER_STATE_ACTIVE; + queued_iter->iter.depth++; + + queued_fr = queued_st->frame[queued_st->curframe]; + mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]); + } + + /* switch to DRAINED state, but keep the depth unchanged */ + /* mark current iter state as drained and assume returned NULL */ + cur_iter->iter.state = BPF_ITER_STATE_DRAINED; + __mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]); + + return 0; +} + static bool arg_type_is_mem_size(enum bpf_arg_type type) { return type == ARG_CONST_SIZE || @@ -6600,6 +6992,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_MEM, PTR_TO_MEM | MEM_RINGBUF, PTR_TO_BUF, + PTR_TO_BTF_ID | PTR_TRUSTED, }, }; @@ -6709,6 +7102,9 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, if (arg_type & PTR_MAYBE_NULL) type &= ~PTR_MAYBE_NULL; + if (meta->func_id == BPF_FUNC_kptr_xchg && type & MEM_ALLOC) + type &= ~MEM_ALLOC; + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { expected = compatible->types[i]; if (expected == NOT_INIT) @@ -6728,10 +7124,23 @@ found: if (base_type(reg->type) != PTR_TO_BTF_ID) return 0; + if (compatible == &mem_types) { + if (!(arg_type & MEM_RDONLY)) { + verbose(env, + "%s() may write into memory pointed by R%d type=%s\n", + func_id_name(meta->func_id), + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + return 0; + } + switch ((int)reg->type) { case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | PTR_TRUSTED: case PTR_TO_BTF_ID | MEM_RCU: + case PTR_TO_BTF_ID | PTR_MAYBE_NULL: + case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU: { /* For bpf_sk_release, it needs to match against first member * 'struct sock_common', hence make an exception for it. This @@ -6740,6 +7149,12 @@ found: bool strict_type_match = arg_type_is_release(arg_type) && meta->func_id != BPF_FUNC_sk_release; + if (type_may_be_null(reg->type) && + (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) { + verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno); + return -EACCES; + } + if (!arg_btf_id) { if (!compatible->btf_id) { verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); @@ -6763,15 +7178,16 @@ found: btf_vmlinux, *arg_btf_id, strict_type_match)) { verbose(env, "R%d is of type %s but %s is expected\n", - regno, kernel_type_name(reg->btf, reg->btf_id), - kernel_type_name(btf_vmlinux, *arg_btf_id)); + regno, btf_type_name(reg->btf, reg->btf_id), + btf_type_name(btf_vmlinux, *arg_btf_id)); return -EACCES; } } break; } case PTR_TO_BTF_ID | MEM_ALLOC: - if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock) { + if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock && + meta->func_id != BPF_FUNC_kptr_xchg) { verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); return -EFAULT; } @@ -6834,7 +7250,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, verbose(env, "R%d must have zero offset when passed to release func\n", regno); verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno, - kernel_type_name(reg->btf, reg->btf_id), reg->off); + btf_type_name(reg->btf, reg->btf_id), reg->off); return -EINVAL; } @@ -8737,6 +9153,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (func_id == BPF_FUNC_kptr_xchg) { ret_btf = meta.kptr_field->kptr.btf; ret_btf_id = meta.kptr_field->kptr.btf_id; + if (!btf_is_kernel(ret_btf)) + regs[BPF_REG_0].type |= MEM_ALLOC; } else { if (fn->ret_btf_id == BPF_PTR_POISON) { verbose(env, "verifier internal error:"); @@ -8870,7 +9288,7 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta) { - return meta->kfunc_flags & KF_TRUSTED_ARGS; + return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta); } static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta) @@ -9099,6 +9517,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */ KF_ARG_PTR_TO_KPTR, /* PTR_TO_KPTR but type specific */ KF_ARG_PTR_TO_DYNPTR, + KF_ARG_PTR_TO_ITER, KF_ARG_PTR_TO_LIST_HEAD, KF_ARG_PTR_TO_LIST_NODE, KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */ @@ -9220,6 +9639,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_dynptr(meta->btf, &args[argno])) return KF_ARG_PTR_TO_DYNPTR; + if (is_kfunc_arg_iter(meta, argno)) + return KF_ARG_PTR_TO_ITER; + if (is_kfunc_arg_list_head(meta->btf, &args[argno])) return KF_ARG_PTR_TO_LIST_HEAD; @@ -9848,6 +10270,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_KPTR: case KF_ARG_PTR_TO_DYNPTR: + case KF_ARG_PTR_TO_ITER: case KF_ARG_PTR_TO_LIST_HEAD: case KF_ARG_PTR_TO_LIST_NODE: case KF_ARG_PTR_TO_RB_ROOT: @@ -9944,6 +10367,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; } + case KF_ARG_PTR_TO_ITER: + ret = process_iter_arg(env, regno, insn_idx, meta); + if (ret < 0) + return ret; + break; case KF_ARG_PTR_TO_LIST_HEAD: if (reg->type != PTR_TO_MAP_VALUE && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { @@ -10079,24 +10507,21 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return 0; } -static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, - int *insn_idx_p) +static int fetch_kfunc_meta(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct bpf_kfunc_call_arg_meta *meta, + const char **kfunc_name) { - const struct btf_type *t, *func, *func_proto, *ptr_type; - u32 i, nargs, func_id, ptr_type_id, release_ref_obj_id; - struct bpf_reg_state *regs = cur_regs(env); - const char *func_name, *ptr_type_name; - bool sleepable, rcu_lock, rcu_unlock; - struct bpf_kfunc_call_arg_meta meta; - int err, insn_idx = *insn_idx_p; - const struct btf_param *args; - const struct btf_type *ret_t; + const struct btf_type *func, *func_proto; + u32 func_id, *kfunc_flags; + const char *func_name; struct btf *desc_btf; - u32 *kfunc_flags; - /* skip for now, but return error when we find this in fixup_kfunc_call */ + if (kfunc_name) + *kfunc_name = NULL; + if (!insn->imm) - return 0; + return -EINVAL; desc_btf = find_kfunc_desc_btf(env, insn->off); if (IS_ERR(desc_btf)) @@ -10105,22 +10530,53 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, func_id = insn->imm; func = btf_type_by_id(desc_btf, func_id); func_name = btf_name_by_offset(desc_btf, func->name_off); + if (kfunc_name) + *kfunc_name = func_name; func_proto = btf_type_by_id(desc_btf, func->type); kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id); if (!kfunc_flags) { - verbose(env, "calling kernel function %s is not allowed\n", - func_name); return -EACCES; } - /* Prepare kfunc call metadata */ - memset(&meta, 0, sizeof(meta)); - meta.btf = desc_btf; - meta.func_id = func_id; - meta.kfunc_flags = *kfunc_flags; - meta.func_proto = func_proto; - meta.func_name = func_name; + memset(meta, 0, sizeof(*meta)); + meta->btf = desc_btf; + meta->func_id = func_id; + meta->kfunc_flags = *kfunc_flags; + meta->func_proto = func_proto; + meta->func_name = func_name; + + return 0; +} + +static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx_p) +{ + const struct btf_type *t, *ptr_type; + u32 i, nargs, ptr_type_id, release_ref_obj_id; + struct bpf_reg_state *regs = cur_regs(env); + const char *func_name, *ptr_type_name; + bool sleepable, rcu_lock, rcu_unlock; + struct bpf_kfunc_call_arg_meta meta; + struct bpf_insn_aux_data *insn_aux; + int err, insn_idx = *insn_idx_p; + const struct btf_param *args; + const struct btf_type *ret_t; + struct btf *desc_btf; + + /* skip for now, but return error when we find this in fixup_kfunc_call */ + if (!insn->imm) + return 0; + + err = fetch_kfunc_meta(env, insn, &meta, &func_name); + if (err == -EACCES && func_name) + verbose(env, "calling kernel function %s is not allowed\n", func_name); + if (err) + return err; + desc_btf = meta.btf; + insn_aux = &env->insn_aux_data[insn_idx]; + + insn_aux->is_iter_next = is_iter_next_kfunc(&meta); if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n"); @@ -10173,7 +10629,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, err = release_reference(env, regs[meta.release_regno].ref_obj_id); if (err) { verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, func_id); + func_name, meta.func_id); return err; } } @@ -10185,14 +10641,14 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, err = ref_convert_owning_non_owning(env, release_ref_obj_id); if (err) { verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n", - func_name, func_id); + func_name, meta.func_id); return err; } err = release_reference(env, release_ref_obj_id); if (err) { verbose(env, "kfunc %s#%d reference has not been acquired before\n", - func_name, func_id); + func_name, meta.func_id); return err; } } @@ -10202,7 +10658,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, set_rbtree_add_callback_state); if (err) { verbose(env, "kfunc %s#%d failed callback verification\n", - func_name, func_id); + func_name, meta.func_id); return err; } } @@ -10211,7 +10667,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, mark_reg_not_init(env, regs, caller_saved[i]); /* Check return type */ - t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL); + t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL); if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) { /* Only exception is bpf_obj_new_impl */ @@ -10260,13 +10716,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].btf = ret_btf; regs[BPF_REG_0].btf_id = ret_btf_id; - env->insn_aux_data[insn_idx].obj_new_size = ret_t->size; - env->insn_aux_data[insn_idx].kptr_struct_meta = + insn_aux->obj_new_size = ret_t->size; + insn_aux->kptr_struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id); - } else if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { - env->insn_aux_data[insn_idx].kptr_struct_meta = - btf_find_struct_meta(meta.arg_obj_drop.btf, - meta.arg_obj_drop.btf_id); } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] || meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) { struct btf_field *field = meta.arg_list_head.field; @@ -10395,10 +10847,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (reg_may_point_to_spin_lock(®s[BPF_REG_0]) && !regs[BPF_REG_0].id) regs[BPF_REG_0].id = ++env->id_gen; - } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ + } else if (btf_type_is_void(t)) { + if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { + if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { + insn_aux->kptr_struct_meta = + btf_find_struct_meta(meta.arg_obj_drop.btf, + meta.arg_obj_drop.btf_id); + } + } + } - nargs = btf_type_vlen(func_proto); - args = (const struct btf_param *)(func_proto + 1); + nargs = btf_type_vlen(meta.func_proto); + args = (const struct btf_param *)(meta.func_proto + 1); for (i = 0; i < nargs; i++) { u32 regno = i + 1; @@ -10410,6 +10870,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, mark_btf_func_reg_size(env, regno, t->size); } + if (is_iter_next_kfunc(&meta)) { + err = process_iter_next_call(env, insn_idx, &meta); + if (err) + return err; + } + return 0; } @@ -12116,10 +12582,14 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) case BPF_JEQ: if (tnum_is_const(subreg)) return !!tnum_equals_const(subreg, val); + else if (val < reg->u32_min_value || val > reg->u32_max_value) + return 0; break; case BPF_JNE: if (tnum_is_const(subreg)) return !tnum_equals_const(subreg, val); + else if (val < reg->u32_min_value || val > reg->u32_max_value) + return 1; break; case BPF_JSET: if ((~subreg.mask & subreg.value) & val) @@ -12189,10 +12659,14 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) case BPF_JEQ: if (tnum_is_const(reg->var_off)) return !!tnum_equals_const(reg->var_off, val); + else if (val < reg->umin_value || val > reg->umax_value) + return 0; break; case BPF_JNE: if (tnum_is_const(reg->var_off)) return !tnum_equals_const(reg->var_off, val); + else if (val < reg->umin_value || val > reg->umax_value) + return 1; break; case BPF_JSET: if ((~reg->var_off.mask & reg->var_off.value) & val) @@ -12813,6 +13287,18 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, src_reg->var_off.value, opcode, is_jmp32); + } else if (dst_reg->type == SCALAR_VALUE && + is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off))) { + pred = is_branch_taken(src_reg, + tnum_subreg(dst_reg->var_off).value, + flip_opcode(opcode), + is_jmp32); + } else if (dst_reg->type == SCALAR_VALUE && + !is_jmp32 && tnum_is_const(dst_reg->var_off)) { + pred = is_branch_taken(src_reg, + dst_reg->var_off.value, + flip_opcode(opcode), + is_jmp32); } else if (reg_is_pkt_pointer_any(dst_reg) && reg_is_pkt_pointer_any(src_reg) && !is_jmp32) { @@ -13407,6 +13893,17 @@ static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx) return env->insn_aux_data[insn_idx].prune_point; } +static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].force_checkpoint = true; +} + +static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].force_checkpoint; +} + + enum { DONE_EXPLORING = 0, KEEP_EXPLORING = 1, @@ -13522,6 +14019,26 @@ static int visit_insn(int t, struct bpf_verifier_env *env) * async state will be pushed for further exploration. */ mark_prune_point(env, t); + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + struct bpf_kfunc_call_arg_meta meta; + + ret = fetch_kfunc_meta(env, insn, &meta, NULL); + if (ret == 0 && is_iter_next_kfunc(&meta)) { + mark_prune_point(env, t); + /* Checking and saving state checkpoints at iter_next() call + * is crucial for fast convergence of open-coded iterator loop + * logic, so we need to force it. If we don't do that, + * is_state_visited() might skip saving a checkpoint, causing + * unnecessarily long sequence of not checkpointed + * instructions and jumps, leading to exhaustion of jump + * history buffer, and potentially other undesired outcomes. + * It is expected that with correct open-coded iterators + * convergence will happen quickly, so we don't run a risk of + * exhausting memory. + */ + mark_force_checkpoint(env, t); + } + } return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); case BPF_JA: @@ -14275,6 +14792,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, * didn't use them */ for (i = 0; i < old->allocated_stack; i++) { + struct bpf_reg_state *old_reg, *cur_reg; + spi = i / BPF_REG_SIZE; if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) { @@ -14331,9 +14850,6 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, return false; break; case STACK_DYNPTR: - { - const struct bpf_reg_state *old_reg, *cur_reg; - old_reg = &old->stack[spi].spilled_ptr; cur_reg = &cur->stack[spi].spilled_ptr; if (old_reg->dynptr.type != cur_reg->dynptr.type || @@ -14341,7 +14857,22 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) return false; break; - } + case STACK_ITER: + old_reg = &old->stack[spi].spilled_ptr; + cur_reg = &cur->stack[spi].spilled_ptr; + /* iter.depth is not compared between states as it + * doesn't matter for correctness and would otherwise + * prevent convergence; we maintain it only to prevent + * infinite loop check triggering, see + * iter_active_depths_differ() + */ + if (old_reg->iter.btf != cur_reg->iter.btf || + old_reg->iter.btf_id != cur_reg->iter.btf_id || + old_reg->iter.state != cur_reg->iter.state || + /* ignore {old_reg,cur_reg}->iter.depth, see above */ + !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap)) + return false; + break; case STACK_MISC: case STACK_ZERO: case STACK_INVALID: @@ -14555,10 +15086,11 @@ static int propagate_precision(struct bpf_verifier_env *env, state_reg = state->regs; for (i = 0; i < BPF_REG_FP; i++, state_reg++) { if (state_reg->type != SCALAR_VALUE || - !state_reg->precise) + !state_reg->precise || + !(state_reg->live & REG_LIVE_READ)) continue; if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "frame %d: propagating r%d\n", i, fr); + verbose(env, "frame %d: propagating r%d\n", fr, i); err = mark_chain_precision_frame(env, fr, i); if (err < 0) return err; @@ -14569,11 +15101,12 @@ static int propagate_precision(struct bpf_verifier_env *env, continue; state_reg = &state->stack[i].spilled_ptr; if (state_reg->type != SCALAR_VALUE || - !state_reg->precise) + !state_reg->precise || + !(state_reg->live & REG_LIVE_READ)) continue; if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "frame %d: propagating fp%d\n", - (-i - 1) * BPF_REG_SIZE, fr); + fr, (-i - 1) * BPF_REG_SIZE); err = mark_chain_precision_stack_frame(env, fr, i); if (err < 0) return err; @@ -14600,6 +15133,92 @@ static bool states_maybe_looping(struct bpf_verifier_state *old, return true; } +static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].is_iter_next; +} + +/* is_state_visited() handles iter_next() (see process_iter_next_call() for + * terminology) calls specially: as opposed to bounded BPF loops, it *expects* + * states to match, which otherwise would look like an infinite loop. So while + * iter_next() calls are taken care of, we still need to be careful and + * prevent erroneous and too eager declaration of "ininite loop", when + * iterators are involved. + * + * Here's a situation in pseudo-BPF assembly form: + * + * 0: again: ; set up iter_next() call args + * 1: r1 = &it ; <CHECKPOINT HERE> + * 2: call bpf_iter_num_next ; this is iter_next() call + * 3: if r0 == 0 goto done + * 4: ... something useful here ... + * 5: goto again ; another iteration + * 6: done: + * 7: r1 = &it + * 8: call bpf_iter_num_destroy ; clean up iter state + * 9: exit + * + * This is a typical loop. Let's assume that we have a prune point at 1:, + * before we get to `call bpf_iter_num_next` (e.g., because of that `goto + * again`, assuming other heuristics don't get in a way). + * + * When we first time come to 1:, let's say we have some state X. We proceed + * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit. + * Now we come back to validate that forked ACTIVE state. We proceed through + * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we + * are converging. But the problem is that we don't know that yet, as this + * convergence has to happen at iter_next() call site only. So if nothing is + * done, at 1: verifier will use bounded loop logic and declare infinite + * looping (and would be *technically* correct, if not for iterator's + * "eventual sticky NULL" contract, see process_iter_next_call()). But we + * don't want that. So what we do in process_iter_next_call() when we go on + * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's + * a different iteration. So when we suspect an infinite loop, we additionally + * check if any of the *ACTIVE* iterator states depths differ. If yes, we + * pretend we are not looping and wait for next iter_next() call. + * + * This only applies to ACTIVE state. In DRAINED state we don't expect to + * loop, because that would actually mean infinite loop, as DRAINED state is + * "sticky", and so we'll keep returning into the same instruction with the + * same state (at least in one of possible code paths). + * + * This approach allows to keep infinite loop heuristic even in the face of + * active iterator. E.g., C snippet below is and will be detected as + * inifintely looping: + * + * struct bpf_iter_num it; + * int *p, x; + * + * bpf_iter_num_new(&it, 0, 10); + * while ((p = bpf_iter_num_next(&t))) { + * x = p; + * while (x--) {} // <<-- infinite loop here + * } + * + */ +static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur) +{ + struct bpf_reg_state *slot, *cur_slot; + struct bpf_func_state *state; + int i, fr; + + for (fr = old->curframe; fr >= 0; fr--) { + state = old->frame[fr]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_ITER) + continue; + + slot = &state->stack[i].spilled_ptr; + if (slot->iter.state != BPF_ITER_STATE_ACTIVE) + continue; + + cur_slot = &cur->frame[fr]->stack[i].spilled_ptr; + if (cur_slot->iter.depth != slot->iter.depth) + return true; + } + } + return false; +} static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { @@ -14607,7 +15226,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; - bool add_new_state = env->test_state_freq ? true : false; + bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx); + bool add_new_state = force_new_state; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -14647,8 +15267,46 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * Since the verifier still needs to catch infinite loops * inside async callbacks. */ - } else if (states_maybe_looping(&sl->state, cur) && - states_equal(env, &sl->state, cur)) { + goto skip_inf_loop_check; + } + /* BPF open-coded iterators loop detection is special. + * states_maybe_looping() logic is too simplistic in detecting + * states that *might* be equivalent, because it doesn't know + * about ID remapping, so don't even perform it. + * See process_iter_next_call() and iter_active_depths_differ() + * for overview of the logic. When current and one of parent + * states are detected as equivalent, it's a good thing: we prove + * convergence and can stop simulating further iterations. + * It's safe to assume that iterator loop will finish, taking into + * account iter_next() contract of eventually returning + * sticky NULL result. + */ + if (is_iter_next_insn(env, insn_idx)) { + if (states_equal(env, &sl->state, cur)) { + struct bpf_func_state *cur_frame; + struct bpf_reg_state *iter_state, *iter_reg; + int spi; + + cur_frame = cur->frame[cur->curframe]; + /* btf_check_iter_kfuncs() enforces that + * iter state pointer is always the first arg + */ + iter_reg = &cur_frame->regs[BPF_REG_1]; + /* current state is valid due to states_equal(), + * so we can assume valid iter and reg state, + * no need for extra (re-)validations + */ + spi = __get_spi(iter_reg->off + iter_reg->var_off.value); + iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; + if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) + goto hit; + } + goto skip_inf_loop_check; + } + /* attempt to detect infinite loop to avoid unnecessary doomed work */ + if (states_maybe_looping(&sl->state, cur) && + states_equal(env, &sl->state, cur) && + !iter_active_depths_differ(&sl->state, cur)) { verbose_linfo(env, insn_idx, "; "); verbose(env, "infinite loop detected at insn %d\n", insn_idx); return -EINVAL; @@ -14665,13 +15323,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * This threshold shouldn't be too high either, since states * at the end of the loop are likely to be useful in pruning. */ - if (!env->test_state_freq && +skip_inf_loop_check: + if (!force_new_state && env->jmps_processed - env->prev_jmps_processed < 20 && env->insn_processed - env->prev_insn_processed < 100) add_new_state = false; goto miss; } if (states_equal(env, &sl->state, cur)) { +hit: sl->hit_cnt++; /* reached equivalent register/stack state, * prune the search. @@ -14978,11 +15638,11 @@ static int do_check(struct bpf_verifier_env *env) print_insn_state(env, state->frame[state->curframe]); verbose_linfo(env, env->insn_idx, "; "); - env->prev_log_len = env->log.len_used; + env->prev_log_pos = env->log.end_pos; verbose(env, "%d: ", env->insn_idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); - env->prev_insn_print_len = env->log.len_used - env->prev_log_len; - env->prev_log_len = env->log.len_used; + env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos; + env->prev_log_pos = env->log.end_pos; } if (bpf_prog_is_offloaded(env->prog->aux)) { @@ -15301,8 +15961,8 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, goto err_put; } - if (!btf_type_is_var(t)) { - verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id); + if (!btf_type_is_var(t) && !btf_type_is_func(t)) { + verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id); err = -EINVAL; goto err_put; } @@ -15315,6 +15975,14 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, err = -ENOENT; goto err_put; } + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; + + if (btf_type_is_func(t)) { + aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY; + aux->btf_var.mem_size = 0; + goto check_btf; + } datasec_id = find_btf_percpu_datasec(btf); if (datasec_id > 0) { @@ -15327,9 +15995,6 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, } } - insn[0].imm = (u32)addr; - insn[1].imm = addr >> 32; - type = t->type; t = btf_type_skip_modifiers(btf, type, NULL); if (percpu) { @@ -15357,7 +16022,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } - +check_btf: /* check whether we recorded this BTF (and maybe module) already */ for (i = 0; i < env->used_btf_cnt; i++) { if (env->used_btfs[i].btf == btf) { @@ -17032,21 +17697,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env) BUILD_BUG_ON(!__same_type(ops->map_lookup_elem, (void *(*)(struct bpf_map *map, void *key))NULL)); BUILD_BUG_ON(!__same_type(ops->map_delete_elem, - (int (*)(struct bpf_map *map, void *key))NULL)); + (long (*)(struct bpf_map *map, void *key))NULL)); BUILD_BUG_ON(!__same_type(ops->map_update_elem, - (int (*)(struct bpf_map *map, void *key, void *value, + (long (*)(struct bpf_map *map, void *key, void *value, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_push_elem, - (int (*)(struct bpf_map *map, void *value, + (long (*)(struct bpf_map *map, void *value, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_pop_elem, - (int (*)(struct bpf_map *map, void *value))NULL)); + (long (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, - (int (*)(struct bpf_map *map, void *value))NULL)); + (long (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_redirect, - (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL)); + (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_for_each_callback, - (int (*)(struct bpf_map *map, + (long (*)(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags))NULL)); @@ -17654,6 +18319,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, const char *tname; struct btf *btf; long addr = 0; + struct module *mod = NULL; if (!btf_id) { bpf_log(log, "Tracing programs must provide btf_id\n"); @@ -17827,8 +18493,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, else addr = (long) tgt_prog->aux->func[subprog]->bpf_func; } else { - addr = kallsyms_lookup_name(tname); + if (btf_is_module(btf)) { + mod = btf_try_get_module(btf); + if (mod) + addr = find_kallsyms_symbol_value(mod, tname); + else + addr = 0; + } else { + addr = kallsyms_lookup_name(tname); + } if (!addr) { + module_put(mod); bpf_log(log, "The address of function %s cannot be found\n", tname); @@ -17868,11 +18543,13 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, break; } if (ret) { + module_put(mod); bpf_log(log, "%s is not sleepable\n", tname); return ret; } } else if (prog->expected_attach_type == BPF_MODIFY_RETURN) { if (tgt_prog) { + module_put(mod); bpf_log(log, "can't modify return codes of BPF programs\n"); return -EINVAL; } @@ -17881,6 +18558,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, !check_attach_modify_return(addr, tname)) ret = 0; if (ret) { + module_put(mod); bpf_log(log, "%s() is not modifiable\n", tname); return ret; } @@ -17891,6 +18569,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, tgt_info->tgt_addr = addr; tgt_info->tgt_name = tname; tgt_info->tgt_type = t; + tgt_info->tgt_mod = mod; return 0; } @@ -17970,6 +18649,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) /* store info about the attachment target that will be used later */ prog->aux->attach_func_proto = tgt_info.tgt_type; prog->aux->attach_func_name = tgt_info.tgt_name; + prog->aux->mod = tgt_info.tgt_mod; if (tgt_prog) { prog->aux->saved_dst_prog_type = tgt_prog->type; @@ -18014,12 +18694,12 @@ struct btf *bpf_get_btf_vmlinux(void) return btf_vmlinux; } -int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { u64 start_time = ktime_get_ns(); struct bpf_verifier_env *env; - struct bpf_verifier_log *log; - int i, len, ret = -EINVAL; + int i, len, ret = -EINVAL, err; + u32 log_true_size; bool is_priv; /* no program is valid */ @@ -18032,7 +18712,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; - log = &env->log; len = (*prog)->len; env->insn_aux_data = @@ -18053,20 +18732,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) if (!is_priv) mutex_lock(&bpf_verifier_lock); - if (attr->log_level || attr->log_buf || attr->log_size) { - /* user requested verbose verifier output - * and supplied buffer to store the verification trace - */ - log->level = attr->log_level; - log->ubuf = (char __user *) (unsigned long) attr->log_buf; - log->len_total = attr->log_size; - - /* log attributes have to be sane */ - if (!bpf_verifier_log_attr_valid(log)) { - ret = -EINVAL; - goto err_unlock; - } - } + /* user could have requested verbose verifier output + * and supplied buffer to store the verification trace + */ + ret = bpf_vlog_init(&env->log, attr->log_level, + (char __user *) (unsigned long) attr->log_buf, + attr->log_size); + if (ret) + goto err_unlock; mark_verifier_state_clean(env); @@ -18180,9 +18853,14 @@ skip_full_check: print_verification_stats(env); env->prog->aux->verified_insns = env->insn_processed; - if (log->level && bpf_verifier_log_full(log)) - ret = -ENOSPC; - if (log->level && !log->ubuf) { + /* preserve original error even if log finalization is successful */ + err = bpf_vlog_finalize(&env->log, &log_true_size); + if (err) + ret = err; + + if (uattr_size >= offsetofend(union bpf_attr, log_true_size) && + copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size), + &log_true_size, sizeof(log_true_size))) { ret = -EFAULT; goto err_release_maps; } |