aboutsummaryrefslogtreecommitdiff
path: root/kernel/bpf
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/bpf')
-rw-r--r--kernel/bpf/Makefile3
-rw-r--r--kernel/bpf/arraymap.c12
-rw-r--r--kernel/bpf/bloom_filter.c29
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c23
-rw-r--r--kernel/bpf/bpf_inode_storage.c22
-rw-r--r--kernel/bpf/bpf_iter.c70
-rw-r--r--kernel/bpf/bpf_local_storage.c382
-rw-r--r--kernel/bpf/bpf_struct_ops.c260
-rw-r--r--kernel/bpf/bpf_task_storage.c27
-rw-r--r--kernel/bpf/btf.c279
-rw-r--r--kernel/bpf/cpumap.c8
-rw-r--r--kernel/bpf/cpumask.c43
-rw-r--r--kernel/bpf/devmap.c24
-rw-r--r--kernel/bpf/hashtab.c38
-rw-r--r--kernel/bpf/helpers.c139
-rw-r--r--kernel/bpf/local_storage.c6
-rw-r--r--kernel/bpf/log.c330
-rw-r--r--kernel/bpf/lpm_trie.c6
-rw-r--r--kernel/bpf/memalloc.c59
-rw-r--r--kernel/bpf/queue_stack_maps.c22
-rw-r--r--kernel/bpf/reuseport_array.c2
-rw-r--r--kernel/bpf/ringbuf.c6
-rw-r--r--kernel/bpf/stackmap.c6
-rw-r--r--kernel/bpf/syscall.c112
-rw-r--r--kernel/bpf/trampoline.c28
-rw-r--r--kernel/bpf/verifier.c1096
26 files changed, 2246 insertions, 786 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 02242614dcc7..1d3892168d32 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,8 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
endif
CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1588c793a715..2058e89b5ddd 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -307,8 +307,8 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
}
/* Called from syscall or from eBPF program */
-static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long array_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
@@ -386,7 +386,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
}
/* Called from syscall or from eBPF program */
-static int array_map_delete_elem(struct bpf_map *map, void *key)
+static long array_map_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
@@ -686,8 +686,8 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
};
-static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
- void *callback_ctx, u64 flags)
+static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+ void *callback_ctx, u64 flags)
{
u32 i, key, num_elems = 0;
struct bpf_array *array;
@@ -871,7 +871,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
return 0;
}
-static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
+static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
void *old_ptr;
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index 6350c5d35a9b..540331b610a9 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -16,13 +16,6 @@ struct bpf_bloom_filter {
struct bpf_map map;
u32 bitset_mask;
u32 hash_seed;
- /* If the size of the values in the bloom filter is u32 aligned,
- * then it is more performant to use jhash2 as the underlying hash
- * function, else we use jhash. This tracks the number of u32s
- * in an u32-aligned value size. If the value size is not u32 aligned,
- * this will be 0.
- */
- u32 aligned_u32_count;
u32 nr_hash_funcs;
unsigned long bitset[];
};
@@ -32,16 +25,15 @@ static u32 hash(struct bpf_bloom_filter *bloom, void *value,
{
u32 h;
- if (bloom->aligned_u32_count)
- h = jhash2(value, bloom->aligned_u32_count,
- bloom->hash_seed + index);
+ if (likely(value_size % 4 == 0))
+ h = jhash2(value, value_size / 4, bloom->hash_seed + index);
else
h = jhash(value, value_size, bloom->hash_seed + index);
return h & bloom->bitset_mask;
}
-static int bloom_map_peek_elem(struct bpf_map *map, void *value)
+static long bloom_map_peek_elem(struct bpf_map *map, void *value)
{
struct bpf_bloom_filter *bloom =
container_of(map, struct bpf_bloom_filter, map);
@@ -56,7 +48,7 @@ static int bloom_map_peek_elem(struct bpf_map *map, void *value)
return 0;
}
-static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
+static long bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
{
struct bpf_bloom_filter *bloom =
container_of(map, struct bpf_bloom_filter, map);
@@ -73,12 +65,12 @@ static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
return 0;
}
-static int bloom_map_pop_elem(struct bpf_map *map, void *value)
+static long bloom_map_pop_elem(struct bpf_map *map, void *value)
{
return -EOPNOTSUPP;
}
-static int bloom_map_delete_elem(struct bpf_map *map, void *value)
+static long bloom_map_delete_elem(struct bpf_map *map, void *value)
{
return -EOPNOTSUPP;
}
@@ -152,11 +144,6 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
bloom->nr_hash_funcs = nr_hash_funcs;
bloom->bitset_mask = bitset_mask;
- /* Check whether the value size is u32-aligned */
- if ((attr->value_size & (sizeof(u32) - 1)) == 0)
- bloom->aligned_u32_count =
- attr->value_size / sizeof(u32);
-
if (!(attr->map_flags & BPF_F_ZERO_SEED))
bloom->hash_seed = get_random_u32();
@@ -177,8 +164,8 @@ static void *bloom_map_lookup_elem(struct bpf_map *map, void *key)
return ERR_PTR(-EINVAL);
}
-static int bloom_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long bloom_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
/* The eBPF program should use map_push_elem instead */
return -EINVAL;
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 9ae07aedaf23..d44fe8dd9732 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -46,8 +46,6 @@ static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner)
void bpf_cgrp_storage_free(struct cgroup *cgroup)
{
struct bpf_local_storage *local_storage;
- bool free_cgroup_storage = false;
- unsigned long flags;
rcu_read_lock();
local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
@@ -57,14 +55,9 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
}
bpf_cgrp_storage_lock();
- raw_spin_lock_irqsave(&local_storage->lock, flags);
- free_cgroup_storage = bpf_local_storage_unlink_nolock(local_storage);
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_local_storage_destroy(local_storage);
bpf_cgrp_storage_unlock();
rcu_read_unlock();
-
- if (free_cgroup_storage)
- kfree_rcu(local_storage, rcu);
}
static struct bpf_local_storage_data *
@@ -100,8 +93,8 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
return sdata ? sdata->data : NULL;
}
-static int bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
struct cgroup *cgroup;
@@ -128,11 +121,11 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- bpf_selem_unlink(SELEM(sdata), true);
+ bpf_selem_unlink(SELEM(sdata), false);
return 0;
}
-static int bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
{
struct cgroup *cgroup;
int err, fd;
@@ -156,7 +149,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &cgroup_cache);
+ return bpf_local_storage_map_alloc(attr, &cgroup_cache, true);
}
static void cgroup_storage_map_free(struct bpf_map *map)
@@ -231,7 +224,7 @@ const struct bpf_func_proto bpf_cgrp_storage_get_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &bpf_cgroup_btf_id[0],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
@@ -242,6 +235,6 @@ const struct bpf_func_proto bpf_cgrp_storage_delete_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &bpf_cgroup_btf_id[0],
};
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 43e2619c8167..a4d93df78c75 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -57,7 +57,6 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
void bpf_inode_storage_free(struct inode *inode)
{
struct bpf_local_storage *local_storage;
- bool free_inode_storage = false;
struct bpf_storage_blob *bsb;
bsb = bpf_inode(inode);
@@ -72,13 +71,8 @@ void bpf_inode_storage_free(struct inode *inode)
return;
}
- raw_spin_lock_bh(&local_storage->lock);
- free_inode_storage = bpf_local_storage_unlink_nolock(local_storage);
- raw_spin_unlock_bh(&local_storage->lock);
+ bpf_local_storage_destroy(local_storage);
rcu_read_unlock();
-
- if (free_inode_storage)
- kfree_rcu(local_storage, rcu);
}
static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
@@ -97,8 +91,8 @@ static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
return sdata ? sdata->data : NULL;
}
-static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
struct file *f;
@@ -128,12 +122,12 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- bpf_selem_unlink(SELEM(sdata), true);
+ bpf_selem_unlink(SELEM(sdata), false);
return 0;
}
-static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
{
struct file *f;
int fd, err;
@@ -205,7 +199,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &inode_cache);
+ return bpf_local_storage_map_alloc(attr, &inode_cache, false);
}
static void inode_storage_map_free(struct bpf_map *map)
@@ -235,7 +229,7 @@ const struct bpf_func_proto bpf_inode_storage_get_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &bpf_inode_storage_btf_ids[0],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
@@ -246,6 +240,6 @@ const struct bpf_func_proto bpf_inode_storage_delete_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &bpf_inode_storage_btf_ids[0],
};
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 5dc307bdeaeb..96856f130cbf 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -776,3 +776,73 @@ const struct bpf_func_proto bpf_loop_proto = {
.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
+
+struct bpf_iter_num_kern {
+ int cur; /* current value, inclusive */
+ int end; /* final value, exclusive */
+} __aligned(8);
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));
+
+ BTF_TYPE_EMIT(struct btf_iter_num);
+
+ /* start == end is legit, it's an empty range and we'll just get NULL
+ * on first (and any subsequent) bpf_iter_num_next() call
+ */
+ if (start > end) {
+ s->cur = s->end = 0;
+ return -EINVAL;
+ }
+
+ /* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */
+ if ((s64)end - (s64)start > BPF_MAX_LOOPS) {
+ s->cur = s->end = 0;
+ return -E2BIG;
+ }
+
+ /* user will call bpf_iter_num_next() first,
+ * which will set s->cur to exactly start value;
+ * underflow shouldn't matter
+ */
+ s->cur = start - 1;
+ s->end = end;
+
+ return 0;
+}
+
+__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num* it)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ /* check failed initialization or if we are done (same behavior);
+ * need to be careful about overflow, so convert to s64 for checks,
+ * e.g., if s->cur == s->end == INT_MAX, we can't just do
+ * s->cur + 1 >= s->end
+ */
+ if ((s64)(s->cur + 1) >= s->end) {
+ s->cur = s->end = 0;
+ return NULL;
+ }
+
+ s->cur++;
+
+ return &s->cur;
+}
+
+__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ s->cur = s->end = 0;
+}
+
+__diag_pop();
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index d3ba3f2db640..47d9948d768f 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -80,8 +80,24 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
if (charge_mem && mem_charge(smap, owner, smap->elem_size))
return NULL;
- selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
- gfp_flags | __GFP_NOWARN);
+ if (smap->bpf_ma) {
+ migrate_disable();
+ selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
+ migrate_enable();
+ if (selem)
+ /* Keep the original bpf_map_kzalloc behavior
+ * before started using the bpf_mem_cache_alloc.
+ *
+ * No need to use zero_map_value. The bpf_selem_free()
+ * only does bpf_mem_cache_free when there is
+ * no other bpf prog is using the selem.
+ */
+ memset(SDATA(selem)->data, 0, smap->map.value_size);
+ } else {
+ selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
+ gfp_flags | __GFP_NOWARN);
+ }
+
if (selem) {
if (value)
copy_map_value(&smap->map, SDATA(selem)->data, value);
@@ -95,7 +111,8 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
}
-void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+/* rcu tasks trace callback for bpf_ma == false */
+static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
@@ -109,28 +126,66 @@ void bpf_local_storage_free_rcu(struct rcu_head *rcu)
kfree_rcu(local_storage, rcu);
}
-static void bpf_selem_free_fields_rcu(struct rcu_head *rcu)
+static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
{
- struct bpf_local_storage_elem *selem;
- struct bpf_local_storage_map *smap;
+ struct bpf_local_storage *local_storage;
- selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
- /* protected by the rcu_barrier*() */
- smap = rcu_dereference_protected(SDATA(selem)->smap, true);
- bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
- kfree(selem);
+ local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+ bpf_mem_cache_raw_free(local_storage);
}
-static void bpf_selem_free_fields_trace_rcu(struct rcu_head *rcu)
+static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
- /* Free directly if Tasks Trace RCU GP also implies RCU GP */
if (rcu_trace_implies_rcu_gp())
- bpf_selem_free_fields_rcu(rcu);
+ bpf_local_storage_free_rcu(rcu);
else
- call_rcu(rcu, bpf_selem_free_fields_rcu);
+ call_rcu(rcu, bpf_local_storage_free_rcu);
}
-static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+/* Handle bpf_ma == false */
+static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
+ bool vanilla_rcu)
+{
+ if (vanilla_rcu)
+ kfree_rcu(local_storage, rcu);
+ else
+ call_rcu_tasks_trace(&local_storage->rcu,
+ __bpf_local_storage_free_trace_rcu);
+}
+
+static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_map *smap,
+ bool bpf_ma, bool reuse_now)
+{
+ if (!local_storage)
+ return;
+
+ if (!bpf_ma) {
+ __bpf_local_storage_free(local_storage, reuse_now);
+ return;
+ }
+
+ if (!reuse_now) {
+ call_rcu_tasks_trace(&local_storage->rcu,
+ bpf_local_storage_free_trace_rcu);
+ return;
+ }
+
+ if (smap) {
+ migrate_disable();
+ bpf_mem_cache_free(&smap->storage_ma, local_storage);
+ migrate_enable();
+ } else {
+ /* smap could be NULL if the selem that triggered
+ * this 'local_storage' creation had been long gone.
+ * In this case, directly do call_rcu().
+ */
+ call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+ }
+}
+
+/* rcu tasks trace callback for bpf_ma == false */
+static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
@@ -141,17 +196,66 @@ static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
kfree_rcu(selem, rcu);
}
+/* Handle bpf_ma == false */
+static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
+ bool vanilla_rcu)
+{
+ if (vanilla_rcu)
+ kfree_rcu(selem, rcu);
+ else
+ call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
+}
+
+static void bpf_selem_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage_elem *selem;
+
+ selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+ bpf_mem_cache_raw_free(selem);
+}
+
+static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_selem_free_rcu(rcu);
+ else
+ call_rcu(rcu, bpf_selem_free_rcu);
+}
+
+void bpf_selem_free(struct bpf_local_storage_elem *selem,
+ struct bpf_local_storage_map *smap,
+ bool reuse_now)
+{
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+
+ if (!smap->bpf_ma) {
+ __bpf_selem_free(selem, reuse_now);
+ return;
+ }
+
+ if (!reuse_now) {
+ call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
+ } else {
+ /* Instead of using the vanilla call_rcu(),
+ * bpf_mem_cache_free will be able to reuse selem
+ * immediately.
+ */
+ migrate_disable();
+ bpf_mem_cache_free(&smap->selem_ma, selem);
+ migrate_enable();
+ }
+}
+
/* local_storage->lock must be held and selem->local_storage == local_storage.
* The caller must ensure selem->smap is still valid to be
* dereferenced for its smap->elem_size and smap->cache_idx.
*/
static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem,
- bool uncharge_mem, bool use_trace_rcu)
+ bool uncharge_mem, bool reuse_now)
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
- struct btf_record *rec;
void *owner;
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
@@ -192,35 +296,55 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
SDATA(selem))
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
- /* A different RCU callback is chosen whenever we need to free
- * additional fields in selem data before freeing selem.
- * bpf_local_storage_map_free only executes rcu_barrier to wait for RCU
- * callbacks when it has special fields, hence we can only conditionally
- * dereference smap, as by this time the map might have already been
- * freed without waiting for our call_rcu callback if it did not have
- * any special fields.
+ bpf_selem_free(selem, smap, reuse_now);
+
+ if (rcu_access_pointer(local_storage->smap) == smap)
+ RCU_INIT_POINTER(local_storage->smap, NULL);
+
+ return free_local_storage;
+}
+
+static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_map *storage_smap,
+ struct bpf_local_storage_elem *selem)
+{
+
+ struct bpf_local_storage_map *selem_smap;
+
+ /* local_storage->smap may be NULL. If it is, get the bpf_ma
+ * from any selem in the local_storage->list. The bpf_ma of all
+ * local_storage and selem should have the same value
+ * for the same map type.
+ *
+ * If the local_storage->list is already empty, the caller will not
+ * care about the bpf_ma value also because the caller is not
+ * responsibile to free the local_storage.
*/
- rec = smap->map.record;
- if (use_trace_rcu) {
- if (!IS_ERR_OR_NULL(rec))
- call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_fields_trace_rcu);
- else
- call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
- } else {
- if (!IS_ERR_OR_NULL(rec))
- call_rcu(&selem->rcu, bpf_selem_free_fields_rcu);
- else
- kfree_rcu(selem, rcu);
+
+ if (storage_smap)
+ return storage_smap->bpf_ma;
+
+ if (!selem) {
+ struct hlist_node *n;
+
+ n = rcu_dereference_check(hlist_first_rcu(&local_storage->list),
+ bpf_rcu_lock_held());
+ if (!n)
+ return false;
+
+ selem = hlist_entry(n, struct bpf_local_storage_elem, snode);
}
+ selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
- return free_local_storage;
+ return selem_smap->bpf_ma;
}
-static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
- bool use_trace_rcu)
+static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
+ bool reuse_now)
{
+ struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage *local_storage;
- bool free_local_storage = false;
+ bool bpf_ma, free_local_storage = false;
unsigned long flags;
if (unlikely(!selem_linked_to_storage_lockless(selem)))
@@ -229,19 +353,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
local_storage = rcu_dereference_check(selem->local_storage,
bpf_rcu_lock_held());
+ storage_smap = rcu_dereference_check(local_storage->smap,
+ bpf_rcu_lock_held());
+ bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem);
+
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, true, use_trace_rcu);
+ local_storage, selem, true, reuse_now);
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
- if (free_local_storage) {
- if (use_trace_rcu)
- call_rcu_tasks_trace(&local_storage->rcu,
- bpf_local_storage_free_rcu);
- else
- kfree_rcu(local_storage, rcu);
- }
+ if (free_local_storage)
+ bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);
}
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -251,7 +374,7 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
hlist_add_head_rcu(&selem->snode, &local_storage->list);
}
-void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage_map *smap;
struct bpf_local_storage_map_bucket *b;
@@ -281,14 +404,14 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
raw_spin_unlock_irqrestore(&b->lock, flags);
}
-void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu)
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
{
/* Always unlink from map before unlinking from local_storage
* because selem will be freed after successfully unlinked from
* the local_storage.
*/
bpf_selem_unlink_map(selem);
- __bpf_selem_unlink_storage(selem, use_trace_rcu);
+ bpf_selem_unlink_storage(selem, reuse_now);
}
/* If cacheit_lockit is false, this lookup function is lockless */
@@ -361,13 +484,21 @@ int bpf_local_storage_alloc(void *owner,
if (err)
return err;
- storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
- gfp_flags | __GFP_NOWARN);
+ if (smap->bpf_ma) {
+ migrate_disable();
+ storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
+ migrate_enable();
+ } else {
+ storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
+ gfp_flags | __GFP_NOWARN);
+ }
+
if (!storage) {
err = -ENOMEM;
goto uncharge;
}
+ RCU_INIT_POINTER(storage->smap, smap);
INIT_HLIST_HEAD(&storage->list);
raw_spin_lock_init(&storage->lock);
storage->owner = owner;
@@ -407,7 +538,7 @@ int bpf_local_storage_alloc(void *owner,
return 0;
uncharge:
- kfree(storage);
+ bpf_local_storage_free(storage, smap, smap->bpf_ma, true);
mem_uncharge(smap, owner, sizeof(*storage));
return err;
}
@@ -451,7 +582,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
if (err) {
- kfree(selem);
+ bpf_selem_free(selem, smap, true);
mem_uncharge(smap, owner, smap->elem_size);
return ERR_PTR(err);
}
@@ -534,7 +665,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (old_sdata) {
bpf_selem_unlink_map(SELEM(old_sdata));
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
- false, true);
+ false, false);
}
unlock:
@@ -545,7 +676,7 @@ unlock_err:
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
if (selem) {
mem_uncharge(smap, owner, smap->elem_size);
- kfree(selem);
+ bpf_selem_free(selem, smap, true);
}
return ERR_PTR(err);
}
@@ -601,40 +732,6 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
return 0;
}
-static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_attr *attr)
-{
- struct bpf_local_storage_map *smap;
- unsigned int i;
- u32 nbuckets;
-
- smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
- if (!smap)
- return ERR_PTR(-ENOMEM);
- bpf_map_init_from_attr(&smap->map, attr);
-
- nbuckets = roundup_pow_of_two(num_possible_cpus());
- /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
- nbuckets = max_t(u32, 2, nbuckets);
- smap->bucket_log = ilog2(nbuckets);
-
- smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
- nbuckets, GFP_USER | __GFP_NOWARN);
- if (!smap->buckets) {
- bpf_map_area_free(smap);
- return ERR_PTR(-ENOMEM);
- }
-
- for (i = 0; i < nbuckets; i++) {
- INIT_HLIST_HEAD(&smap->buckets[i].list);
- raw_spin_lock_init(&smap->buckets[i].lock);
- }
-
- smap->elem_size = offsetof(struct bpf_local_storage_elem,
- sdata.data[attr->value_size]);
-
- return smap;
-}
-
int bpf_local_storage_map_check_btf(const struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
@@ -652,11 +749,16 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
return 0;
}
-bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
+void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
{
+ struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage_elem *selem;
- bool free_storage = false;
+ bool bpf_ma, free_storage = false;
struct hlist_node *n;
+ unsigned long flags;
+
+ storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held());
+ bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL);
/* Neither the bpf_prog nor the bpf_map's syscall
* could be modifying the local_storage->list now.
@@ -667,6 +769,7 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
* when unlinking elem from the local_storage->list and
* the map's bucket->list.
*/
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
/* Always unlink from map before unlinking from
* local_storage.
@@ -679,10 +782,12 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
* of the loop will set the free_cgroup_storage to true.
*/
free_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, false, false);
+ local_storage, selem, false, true);
}
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
- return free_storage;
+ if (free_storage)
+ bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);
}
u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
@@ -695,18 +800,71 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
return usage;
}
+/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory.
+ * A deadlock free allocator is useful for storage that the bpf prog can easily
+ * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf.
+ * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses
+ * memory immediately. To be reuse-immediate safe, the owner destruction
+ * code path needs to go through a rcu grace period before calling
+ * bpf_local_storage_destroy().
+ *
+ * When bpf_ma == false, the kmalloc and kfree are used.
+ */
struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
- struct bpf_local_storage_cache *cache)
+ struct bpf_local_storage_cache *cache,
+ bool bpf_ma)
{
struct bpf_local_storage_map *smap;
+ unsigned int i;
+ u32 nbuckets;
+ int err;
+
+ smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
+ if (!smap)
+ return ERR_PTR(-ENOMEM);
+ bpf_map_init_from_attr(&smap->map, attr);
+
+ nbuckets = roundup_pow_of_two(num_possible_cpus());
+ /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
+ nbuckets = max_t(u32, 2, nbuckets);
+ smap->bucket_log = ilog2(nbuckets);
- smap = __bpf_local_storage_map_alloc(attr);
- if (IS_ERR(smap))
- return ERR_CAST(smap);
+ smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
+ nbuckets, GFP_USER | __GFP_NOWARN);
+ if (!smap->buckets) {
+ err = -ENOMEM;
+ goto free_smap;
+ }
+
+ for (i = 0; i < nbuckets; i++) {
+ INIT_HLIST_HEAD(&smap->buckets[i].list);
+ raw_spin_lock_init(&smap->buckets[i].lock);
+ }
+
+ smap->elem_size = offsetof(struct bpf_local_storage_elem,
+ sdata.data[attr->value_size]);
+
+ smap->bpf_ma = bpf_ma;
+ if (bpf_ma) {
+ err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
+ if (err)
+ goto free_smap;
+
+ err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false);
+ if (err) {
+ bpf_mem_alloc_destroy(&smap->selem_ma);
+ goto free_smap;
+ }
+ }
smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
return &smap->map;
+
+free_smap:
+ kvfree(smap->buckets);
+ bpf_map_area_free(smap);
+ return ERR_PTR(err);
}
void bpf_local_storage_map_free(struct bpf_map *map,
@@ -748,7 +906,7 @@ void bpf_local_storage_map_free(struct bpf_map *map,
migrate_disable();
this_cpu_inc(*busy_counter);
}
- bpf_selem_unlink(selem, false);
+ bpf_selem_unlink(selem, true);
if (busy_counter) {
this_cpu_dec(*busy_counter);
migrate_enable();
@@ -772,26 +930,10 @@ void bpf_local_storage_map_free(struct bpf_map *map,
*/
synchronize_rcu();
- /* Only delay freeing of smap, buckets are not needed anymore */
- kvfree(smap->buckets);
-
- /* When local storage has special fields, callbacks for
- * bpf_selem_free_fields_rcu and bpf_selem_free_fields_trace_rcu will
- * keep using the map BTF record, we need to execute an RCU barrier to
- * wait for them as the record will be freed right after our map_free
- * callback.
- */
- if (!IS_ERR_OR_NULL(smap->map.record)) {
- rcu_barrier_tasks_trace();
- /* We cannot skip rcu_barrier() when rcu_trace_implies_rcu_gp()
- * is true, because while call_rcu invocation is skipped in that
- * case in bpf_selem_free_fields_trace_rcu (and all local
- * storage maps pass use_trace_rcu = true), there can be
- * call_rcu callbacks based on use_trace_rcu = false in the
- * while ((selem = ...)) loop above or when owner's free path
- * calls bpf_local_storage_unlink_nolock.
- */
- rcu_barrier();
+ if (smap->bpf_ma) {
+ bpf_mem_alloc_destroy(&smap->selem_ma);
+ bpf_mem_alloc_destroy(&smap->storage_ma);
}
+ kvfree(smap->buckets);
bpf_map_area_free(smap);
}
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 38903fb52f98..d3f0a4825fa6 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -11,11 +11,13 @@
#include <linux/refcount.h>
#include <linux/mutex.h>
#include <linux/btf_ids.h>
+#include <linux/rcupdate_wait.h>
enum bpf_struct_ops_state {
BPF_STRUCT_OPS_STATE_INIT,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE,
+ BPF_STRUCT_OPS_STATE_READY,
};
#define BPF_STRUCT_OPS_COMMON_VALUE \
@@ -58,6 +60,13 @@ struct bpf_struct_ops_map {
struct bpf_struct_ops_value kvalue;
};
+struct bpf_struct_ops_link {
+ struct bpf_link link;
+ struct bpf_map __rcu *map;
+};
+
+static DEFINE_MUTEX(update_mutex);
+
#define VALUE_PREFIX "bpf_struct_ops_"
#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
@@ -249,6 +258,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
struct bpf_struct_ops_value *uvalue, *kvalue;
enum bpf_struct_ops_state state;
+ s64 refcnt;
if (unlikely(*(u32 *)key != 0))
return -ENOENT;
@@ -267,7 +277,14 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
uvalue = value;
memcpy(uvalue, st_map->uvalue, map->value_size);
uvalue->state = state;
- refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
+
+ /* This value offers the user space a general estimate of how
+ * many sockets are still utilizing this struct_ops for TCP
+ * congestion control. The number might not be exact, but it
+ * should sufficiently meet our present goals.
+ */
+ refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
+ refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));
return 0;
}
@@ -349,8 +366,8 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
model, flags, tlinks, NULL);
}
-static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
const struct bpf_struct_ops *st_ops = st_map->st_ops;
@@ -491,12 +508,29 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*(unsigned long *)(udata + moff) = prog->aux->id;
}
- refcount_set(&kvalue->refcnt, 1);
- bpf_map_inc(map);
+ if (st_map->map.map_flags & BPF_F_LINK) {
+ err = st_ops->validate(kdata);
+ if (err)
+ goto reset_unlock;
+ set_memory_rox((long)st_map->image, 1);
+ /* Let bpf_link handle registration & unregistration.
+ *
+ * Pair with smp_load_acquire() during lookup_elem().
+ */
+ smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
+ goto unlock;
+ }
set_memory_rox((long)st_map->image, 1);
err = st_ops->reg(kdata);
if (likely(!err)) {
+ /* This refcnt increment on the map here after
+ * 'st_ops->reg()' is secure since the state of the
+ * map must be set to INIT at this moment, and thus
+ * bpf_struct_ops_map_delete_elem() can't unregister
+ * or transition it to TOBEFREE concurrently.
+ */
+ bpf_map_inc(map);
/* Pair with smp_load_acquire() during lookup_elem().
* It ensures the above udata updates (e.g. prog->aux->id)
* can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
@@ -512,7 +546,6 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*/
set_memory_nx((long)st_map->image, 1);
set_memory_rw((long)st_map->image, 1);
- bpf_map_put(map);
reset_unlock:
bpf_struct_ops_map_put_progs(st_map);
@@ -524,20 +557,22 @@ unlock:
return err;
}
-static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
+static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
{
enum bpf_struct_ops_state prev_state;
struct bpf_struct_ops_map *st_map;
st_map = (struct bpf_struct_ops_map *)map;
+ if (st_map->map.map_flags & BPF_F_LINK)
+ return -EOPNOTSUPP;
+
prev_state = cmpxchg(&st_map->kvalue.state,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE);
switch (prev_state) {
case BPF_STRUCT_OPS_STATE_INUSE:
st_map->st_ops->unreg(&st_map->kvalue.data);
- if (refcount_dec_and_test(&st_map->kvalue.refcnt))
- bpf_map_put(map);
+ bpf_map_put(map);
return 0;
case BPF_STRUCT_OPS_STATE_TOBEFREE:
return -EINPROGRESS;
@@ -570,7 +605,7 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
kfree(value);
}
-static void bpf_struct_ops_map_free(struct bpf_map *map)
+static void __bpf_struct_ops_map_free(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
@@ -582,10 +617,32 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
bpf_map_area_free(st_map);
}
+static void bpf_struct_ops_map_free(struct bpf_map *map)
+{
+ /* The struct_ops's function may switch to another struct_ops.
+ *
+ * For example, bpf_tcp_cc_x->init() may switch to
+ * another tcp_cc_y by calling
+ * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+ * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+ * and its refcount may reach 0 which then free its
+ * trampoline image while tcp_cc_x is still running.
+ *
+ * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
+ * to finish. bpf-tcp-cc prog is non sleepable.
+ * A rcu_tasks gp is to wait for the last few insn
+ * in the tramopline image to finish before releasing
+ * the trampoline image.
+ */
+ synchronize_rcu_mult(call_rcu, call_rcu_tasks);
+
+ __bpf_struct_ops_map_free(map);
+}
+
static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
- attr->map_flags || !attr->btf_vmlinux_value_type_id)
+ (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
return -EINVAL;
return 0;
}
@@ -609,6 +666,9 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
if (attr->value_size != vt->size)
return ERR_PTR(-EINVAL);
+ if (attr->map_flags & BPF_F_LINK && (!st_ops->validate || !st_ops->update))
+ return ERR_PTR(-EOPNOTSUPP);
+
t = st_ops->type;
st_map_size = sizeof(*st_map) +
@@ -630,7 +690,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
NUMA_NO_NODE);
st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
if (!st_map->uvalue || !st_map->links || !st_map->image) {
- bpf_struct_ops_map_free(map);
+ __bpf_struct_ops_map_free(map);
return ERR_PTR(-ENOMEM);
}
@@ -676,41 +736,175 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
bool bpf_struct_ops_get(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
- return refcount_inc_not_zero(&kvalue->refcnt);
+ map = __bpf_map_inc_not_zero(&st_map->map, false);
+ return !IS_ERR(map);
}
-static void bpf_struct_ops_put_rcu(struct rcu_head *head)
+void bpf_struct_ops_put(const void *kdata)
{
+ struct bpf_struct_ops_value *kvalue;
struct bpf_struct_ops_map *st_map;
- st_map = container_of(head, struct bpf_struct_ops_map, rcu);
+ kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
bpf_map_put(&st_map->map);
}
-void bpf_struct_ops_put(const void *kdata)
+static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
- struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
- kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
- if (refcount_dec_and_test(&kvalue->refcnt)) {
- struct bpf_struct_ops_map *st_map;
+ return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
+ map->map_flags & BPF_F_LINK &&
+ /* Pair with smp_store_release() during map_update */
+ smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY;
+}
- st_map = container_of(kvalue, struct bpf_struct_ops_map,
- kvalue);
- /* The struct_ops's function may switch to another struct_ops.
- *
- * For example, bpf_tcp_cc_x->init() may switch to
- * another tcp_cc_y by calling
- * setsockopt(TCP_CONGESTION, "tcp_cc_y").
- * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
- * and its map->refcnt may reach 0 which then free its
- * trampoline image while tcp_cc_x is still running.
- *
- * Thus, a rcu grace period is needed here.
+static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_struct_ops_map *st_map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ st_map = (struct bpf_struct_ops_map *)
+ rcu_dereference_protected(st_link->map, true);
+ if (st_map) {
+ /* st_link->map can be NULL if
+ * bpf_struct_ops_link_create() fails to register.
*/
- call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
+ st_map->st_ops->unreg(&st_map->kvalue.data);
+ bpf_map_put(&st_map->map);
+ }
+ kfree(st_link);
+}
+
+static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock();
+ map = rcu_dereference(st_link->map);
+ seq_printf(seq, "map_id:\t%d\n", map->id);
+ rcu_read_unlock();
+}
+
+static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock();
+ map = rcu_dereference(st_link->map);
+ info->struct_ops.map_id = map->id;
+ rcu_read_unlock();
+ return 0;
+}
+
+static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
+ struct bpf_map *expected_old_map)
+{
+ struct bpf_struct_ops_map *st_map, *old_st_map;
+ struct bpf_map *old_map;
+ struct bpf_struct_ops_link *st_link;
+ int err = 0;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ st_map = container_of(new_map, struct bpf_struct_ops_map, map);
+
+ if (!bpf_struct_ops_valid_to_reg(new_map))
+ return -EINVAL;
+
+ mutex_lock(&update_mutex);
+
+ old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ if (expected_old_map && old_map != expected_old_map) {
+ err = -EPERM;
+ goto err_out;
+ }
+
+ old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
+ /* The new and old struct_ops must be the same type. */
+ if (st_map->st_ops != old_st_map->st_ops) {
+ err = -EINVAL;
+ goto err_out;
}
+
+ err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
+ if (err)
+ goto err_out;
+
+ bpf_map_inc(new_map);
+ rcu_assign_pointer(st_link->map, new_map);
+ bpf_map_put(old_map);
+
+err_out:
+ mutex_unlock(&update_mutex);
+
+ return err;
}
+
+static const struct bpf_link_ops bpf_struct_ops_map_lops = {
+ .dealloc = bpf_struct_ops_map_link_dealloc,
+ .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
+ .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+ .update_map = bpf_struct_ops_map_link_update,
+};
+
+int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+ struct bpf_struct_ops_link *link = NULL;
+ struct bpf_link_primer link_primer;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
+ int err;
+
+ map = bpf_map_get(attr->link_create.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ st_map = (struct bpf_struct_ops_map *)map;
+
+ if (!bpf_struct_ops_valid_to_reg(map)) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto err_out;
+
+ err = st_map->st_ops->reg(st_map->kvalue.data);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ link = NULL;
+ goto err_out;
+ }
+ RCU_INIT_POINTER(link->map, map);
+
+ return bpf_link_settle(&link_primer);
+
+err_out:
+ bpf_map_put(map);
+ kfree(link);
+ return err;
+}
+
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 20f942229f3c..adf6dfe0ba68 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -72,8 +72,6 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
void bpf_task_storage_free(struct task_struct *task)
{
struct bpf_local_storage *local_storage;
- bool free_task_storage = false;
- unsigned long flags;
rcu_read_lock();
@@ -84,14 +82,9 @@ void bpf_task_storage_free(struct task_struct *task)
}
bpf_task_storage_lock();
- raw_spin_lock_irqsave(&local_storage->lock, flags);
- free_task_storage = bpf_local_storage_unlink_nolock(local_storage);
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_local_storage_destroy(local_storage);
bpf_task_storage_unlock();
rcu_read_unlock();
-
- if (free_task_storage)
- kfree_rcu(local_storage, rcu);
}
static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
@@ -127,8 +120,8 @@ out:
return ERR_PTR(err);
}
-static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
struct task_struct *task;
@@ -175,12 +168,12 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
if (!nobusy)
return -EBUSY;
- bpf_selem_unlink(SELEM(sdata), true);
+ bpf_selem_unlink(SELEM(sdata), false);
return 0;
}
-static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
{
struct task_struct *task;
unsigned int f_flags;
@@ -316,7 +309,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &task_cache);
+ return bpf_local_storage_map_alloc(attr, &task_cache, true);
}
static void task_storage_map_free(struct bpf_map *map)
@@ -345,7 +338,7 @@ const struct bpf_func_proto bpf_task_storage_get_recur_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
@@ -356,7 +349,7 @@ const struct bpf_func_proto bpf_task_storage_get_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
@@ -367,7 +360,7 @@ const struct bpf_func_proto bpf_task_storage_delete_recur_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};
@@ -376,6 +369,6 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 1853beaed4be..2c2d1fb9f410 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3231,12 +3231,6 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-enum btf_field_info_type {
- BTF_FIELD_SPIN_LOCK,
- BTF_FIELD_TIMER,
- BTF_FIELD_KPTR,
-};
-
enum {
BTF_FIELD_IGNORE = 0,
BTF_FIELD_FOUND = 1,
@@ -3562,7 +3556,10 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
{
struct module *mod = NULL;
const struct btf_type *t;
- struct btf *kernel_btf;
+ /* If a matching btf type is found in kernel or module BTFs, kptr_ref
+ * is that BTF, otherwise it's program BTF
+ */
+ struct btf *kptr_btf;
int ret;
s32 id;
@@ -3571,7 +3568,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
*/
t = btf_type_by_id(btf, info->kptr.type_id);
id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
- &kernel_btf);
+ &kptr_btf);
+ if (id == -ENOENT) {
+ /* btf_parse_kptr should only be called w/ btf = program BTF */
+ WARN_ON_ONCE(btf_is_kernel(btf));
+
+ /* Type exists only in program BTF. Assume that it's a MEM_ALLOC
+ * kptr allocated via bpf_obj_new
+ */
+ field->kptr.dtor = NULL;
+ id = info->kptr.type_id;
+ kptr_btf = (struct btf *)btf;
+ btf_get(kptr_btf);
+ goto found_dtor;
+ }
if (id < 0)
return id;
@@ -3588,20 +3598,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
* can be used as a referenced pointer and be stored in a map at
* the same time.
*/
- dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id);
+ dtor_btf_id = btf_find_dtor_kfunc(kptr_btf, id);
if (dtor_btf_id < 0) {
ret = dtor_btf_id;
goto end_btf;
}
- dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id);
+ dtor_func = btf_type_by_id(kptr_btf, dtor_btf_id);
if (!dtor_func) {
ret = -ENOENT;
goto end_btf;
}
- if (btf_is_module(kernel_btf)) {
- mod = btf_try_get_module(kernel_btf);
+ if (btf_is_module(kptr_btf)) {
+ mod = btf_try_get_module(kptr_btf);
if (!mod) {
ret = -ENXIO;
goto end_btf;
@@ -3611,7 +3621,7 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
/* We already verified dtor_func to be btf_type_is_func
* in register_btf_id_dtor_kfuncs.
*/
- dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off);
+ dtor_func_name = __btf_name_by_offset(kptr_btf, dtor_func->name_off);
addr = kallsyms_lookup_name(dtor_func_name);
if (!addr) {
ret = -EINVAL;
@@ -3620,14 +3630,15 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
field->kptr.dtor = (void *)addr;
}
+found_dtor:
field->kptr.btf_id = id;
- field->kptr.btf = kernel_btf;
+ field->kptr.btf = kptr_btf;
field->kptr.module = mod;
return 0;
end_mod:
module_put(mod);
end_btf:
- btf_put(kernel_btf);
+ btf_put(kptr_btf);
return ret;
}
@@ -5494,38 +5505,45 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
return 0;
}
-static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
- u32 log_level, char __user *log_ubuf, u32 log_size)
+static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size)
{
+ u32 log_true_size;
+ int err;
+
+ err = bpf_vlog_finalize(log, &log_true_size);
+
+ if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
+ copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
+ &log_true_size, sizeof(log_true_size)))
+ err = -EFAULT;
+
+ return err;
+}
+
+static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
+{
+ bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel);
+ char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf);
struct btf_struct_metas *struct_meta_tab;
struct btf_verifier_env *env = NULL;
- struct bpf_verifier_log *log;
struct btf *btf = NULL;
u8 *data;
- int err;
+ int err, ret;
- if (btf_data_size > BTF_MAX_SIZE)
+ if (attr->btf_size > BTF_MAX_SIZE)
return ERR_PTR(-E2BIG);
env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
if (!env)
return ERR_PTR(-ENOMEM);
- log = &env->log;
- if (log_level || log_ubuf || log_size) {
- /* user requested verbose verifier output
- * and supplied buffer to store the verification trace
- */
- log->level = log_level;
- log->ubuf = log_ubuf;
- log->len_total = log_size;
-
- /* log attributes have to be sane */
- if (!bpf_verifier_log_attr_valid(log)) {
- err = -EINVAL;
- goto errout;
- }
- }
+ /* user could have requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ err = bpf_vlog_init(&env->log, attr->btf_log_level,
+ log_ubuf, attr->btf_log_size);
+ if (err)
+ goto errout_free;
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
@@ -5534,16 +5552,16 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
}
env->btf = btf;
- data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
+ data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN);
if (!data) {
err = -ENOMEM;
goto errout;
}
btf->data = data;
- btf->data_size = btf_data_size;
+ btf->data_size = attr->btf_size;
- if (copy_from_bpfptr(data, btf_data, btf_data_size)) {
+ if (copy_from_bpfptr(data, btf_data, attr->btf_size)) {
err = -EFAULT;
goto errout;
}
@@ -5566,7 +5584,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
if (err)
goto errout;
- struct_meta_tab = btf_parse_struct_metas(log, btf);
+ struct_meta_tab = btf_parse_struct_metas(&env->log, btf);
if (IS_ERR(struct_meta_tab)) {
err = PTR_ERR(struct_meta_tab);
goto errout;
@@ -5583,10 +5601,9 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
}
}
- if (log->level && bpf_verifier_log_full(log)) {
- err = -ENOSPC;
- goto errout_meta;
- }
+ err = finalize_log(&env->log, uattr, uattr_size);
+ if (err)
+ goto errout_free;
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
@@ -5595,6 +5612,11 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
errout_meta:
btf_free_struct_meta_tab(btf);
errout:
+ /* overwrite err with -ENOSPC or -EFAULT */
+ ret = finalize_log(&env->log, uattr, uattr_size);
+ if (ret)
+ err = ret;
+errout_free:
btf_verifier_env_free(env);
if (btf)
btf_free(btf);
@@ -5900,12 +5922,8 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
{
- /* t comes in already as a pointer */
- t = btf_type_by_id(btf, t->type);
-
- /* allow const */
- if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
- t = btf_type_by_id(btf, t->type);
+ /* skip modifiers */
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
return btf_type_is_int(t);
}
@@ -6156,7 +6174,8 @@ enum bpf_struct_walk_result {
static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, int off, int size,
- u32 *next_btf_id, enum bpf_type_flag *flag)
+ u32 *next_btf_id, enum bpf_type_flag *flag,
+ const char **field_name)
{
u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
const struct btf_type *mtype, *elem_type = NULL;
@@ -6385,6 +6404,8 @@ error:
if (btf_type_is_struct(stype)) {
*next_btf_id = id;
*flag |= tmp_flag;
+ if (field_name)
+ *field_name = mname;
return WALK_PTR;
}
}
@@ -6411,7 +6432,8 @@ error:
int btf_struct_access(struct bpf_verifier_log *log,
const struct bpf_reg_state *reg,
int off, int size, enum bpf_access_type atype __maybe_unused,
- u32 *next_btf_id, enum bpf_type_flag *flag)
+ u32 *next_btf_id, enum bpf_type_flag *flag,
+ const char **field_name)
{
const struct btf *btf = reg->btf;
enum bpf_type_flag tmp_flag = 0;
@@ -6443,7 +6465,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
t = btf_type_by_id(btf, id);
do {
- err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag);
+ err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag, field_name);
switch (err) {
case WALK_PTR:
@@ -6518,7 +6540,7 @@ again:
type = btf_type_by_id(btf, id);
if (!type)
return false;
- err = btf_struct_walk(log, btf, type, off, 1, &id, &flag);
+ err = btf_struct_walk(log, btf, type, off, 1, &id, &flag, NULL);
if (err != WALK_STRUCT)
return false;
@@ -7199,15 +7221,12 @@ static int __btf_new_fd(struct btf *btf)
return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
}
-int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr)
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
struct btf *btf;
int ret;
- btf = btf_parse(make_bpfptr(attr->btf, uattr.is_kernel),
- attr->btf_size, attr->btf_log_level,
- u64_to_user_ptr(attr->btf_log_buf),
- attr->btf_log_size);
+ btf = btf_parse(attr, uattr, uattr_size);
if (IS_ERR(btf))
return PTR_ERR(btf);
@@ -7597,6 +7616,108 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
BTF_TRACING_TYPE_xxx
#undef BTF_TRACING_TYPE
+static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
+ const struct btf_type *func, u32 func_flags)
+{
+ u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+ const char *name, *sfx, *iter_name;
+ const struct btf_param *arg;
+ const struct btf_type *t;
+ char exp_name[128];
+ u32 nr_args;
+
+ /* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */
+ if (!flags || (flags & (flags - 1)))
+ return -EINVAL;
+
+ /* any BPF iter kfunc should have `struct bpf_iter_<type> *` first arg */
+ nr_args = btf_type_vlen(func);
+ if (nr_args < 1)
+ return -EINVAL;
+
+ arg = &btf_params(func)[0];
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
+ if (!t || !__btf_type_is_struct(t))
+ return -EINVAL;
+
+ name = btf_name_by_offset(btf, t->name_off);
+ if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
+ return -EINVAL;
+
+ /* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to
+ * fit nicely in stack slots
+ */
+ if (t->size == 0 || (t->size % 8))
+ return -EINVAL;
+
+ /* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *)
+ * naming pattern
+ */
+ iter_name = name + sizeof(ITER_PREFIX) - 1;
+ if (flags & KF_ITER_NEW)
+ sfx = "new";
+ else if (flags & KF_ITER_NEXT)
+ sfx = "next";
+ else /* (flags & KF_ITER_DESTROY) */
+ sfx = "destroy";
+
+ snprintf(exp_name, sizeof(exp_name), "bpf_iter_%s_%s", iter_name, sfx);
+ if (strcmp(func_name, exp_name))
+ return -EINVAL;
+
+ /* only iter constructor should have extra arguments */
+ if (!(flags & KF_ITER_NEW) && nr_args != 1)
+ return -EINVAL;
+
+ if (flags & KF_ITER_NEXT) {
+ /* bpf_iter_<type>_next() should return pointer */
+ t = btf_type_skip_modifiers(btf, func->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ }
+
+ if (flags & KF_ITER_DESTROY) {
+ /* bpf_iter_<type>_destroy() should return void */
+ t = btf_type_by_id(btf, func->type);
+ if (!t || !btf_type_is_void(t))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
+{
+ const struct btf_type *func;
+ const char *func_name;
+ int err;
+
+ /* any kfunc should be FUNC -> FUNC_PROTO */
+ func = btf_type_by_id(btf, func_id);
+ if (!func || !btf_type_is_func(func))
+ return -EINVAL;
+
+ /* sanity check kfunc name */
+ func_name = btf_name_by_offset(btf, func->name_off);
+ if (!func_name || !func_name[0])
+ return -EINVAL;
+
+ func = btf_type_by_id(btf, func->type);
+ if (!func || !btf_type_is_func_proto(func))
+ return -EINVAL;
+
+ if (func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY)) {
+ err = btf_check_iter_kfuncs(btf, func_name, func, func_flags);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
/* Kernel Function (kfunc) BTF ID set registration API */
static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
@@ -7773,7 +7894,7 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
const struct btf_kfunc_id_set *kset)
{
struct btf *btf;
- int ret;
+ int ret, i;
btf = btf_get_module_btf(kset->owner);
if (!btf) {
@@ -7790,7 +7911,15 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
if (IS_ERR(btf))
return PTR_ERR(btf);
+ for (i = 0; i < kset->set->cnt; i++) {
+ ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id,
+ kset->set->pairs[i].flags);
+ if (ret)
+ goto err_out;
+ }
+
ret = btf_populate_kfunc_set(btf, hook, kset->set);
+err_out:
btf_put(btf);
return ret;
}
@@ -8368,16 +8497,15 @@ out:
bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
const struct bpf_reg_state *reg,
- int off, const char *suffix)
+ const char *field_name, u32 btf_id, const char *suffix)
{
struct btf *btf = reg->btf;
const struct btf_type *walk_type, *safe_type;
const char *tname;
char safe_tname[64];
long ret, safe_id;
- const struct btf_member *member, *m_walk = NULL;
+ const struct btf_member *member;
u32 i;
- const char *walk_name;
walk_type = btf_type_by_id(btf, reg->btf_id);
if (!walk_type)
@@ -8397,30 +8525,17 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
if (!safe_type)
return false;
- for_each_member(i, walk_type, member) {
- u32 moff;
-
- /* We're looking for the PTR_TO_BTF_ID member in the struct
- * type we're walking which matches the specified offset.
- * Below, we'll iterate over the fields in the safe variant of
- * the struct and see if any of them has a matching type /
- * name.
- */
- moff = __btf_member_bit_offset(walk_type, member) / 8;
- if (off == moff) {
- m_walk = member;
- break;
- }
- }
- if (m_walk == NULL)
- return false;
-
- walk_name = __btf_name_by_offset(btf, m_walk->name_off);
for_each_member(i, safe_type, member) {
const char *m_name = __btf_name_by_offset(btf, member->name_off);
+ const struct btf_type *mtype = btf_type_by_id(btf, member->type);
+ u32 id;
+
+ if (!btf_type_is_ptr(mtype))
+ continue;
+ btf_type_skip_modifiers(btf, mtype->type, &id);
/* If we match on both type and name, the field is considered trusted. */
- if (m_walk->type == member->type && !strcmp(walk_name, m_name))
+ if (btf_id == id && !strcmp(field_name, m_name))
return true;
}
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 871809e71b4e..8ec18faa74ac 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -540,7 +540,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
}
}
-static int cpu_map_delete_elem(struct bpf_map *map, void *key)
+static long cpu_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
u32 key_cpu = *(u32 *)key;
@@ -553,8 +553,8 @@ static int cpu_map_delete_elem(struct bpf_map *map, void *key)
return 0;
}
-static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
struct bpf_cpumap_val cpumap_value = {};
@@ -667,7 +667,7 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
-static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
+static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
{
return __bpf_xdp_redirect_map(map, index, flags, 0,
__cpu_map_lookup_elem);
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index b6587ec40f1b..7efdf5d770ca 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -9,6 +9,7 @@
/**
* struct bpf_cpumask - refcounted BPF cpumask wrapper structure
* @cpumask: The actual cpumask embedded in the struct.
+ * @rcu: The RCU head used to free the cpumask with RCU safety.
* @usage: Object reference counter. When the refcount goes to 0, the
* memory is released back to the BPF allocator, which provides
* RCU safety.
@@ -24,6 +25,7 @@
*/
struct bpf_cpumask {
cpumask_t cpumask;
+ struct rcu_head rcu;
refcount_t usage;
};
@@ -80,32 +82,14 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
return cpumask;
}
-/**
- * bpf_cpumask_kptr_get() - Attempt to acquire a reference to a BPF cpumask
- * stored in a map.
- * @cpumaskp: A pointer to a BPF cpumask map value.
- *
- * Attempts to acquire a reference to a BPF cpumask stored in a map value. The
- * cpumask returned by this function must either be embedded in a map as a
- * kptr, or freed with bpf_cpumask_release(). This function may return NULL if
- * no BPF cpumask was found in the specified map value.
- */
-__bpf_kfunc struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumaskp)
+static void cpumask_free_cb(struct rcu_head *head)
{
struct bpf_cpumask *cpumask;
- /* The BPF memory allocator frees memory backing its caches in an RCU
- * callback. Thus, we can safely use RCU to ensure that the cpumask is
- * safe to read.
- */
- rcu_read_lock();
-
- cpumask = READ_ONCE(*cpumaskp);
- if (cpumask && !refcount_inc_not_zero(&cpumask->usage))
- cpumask = NULL;
-
- rcu_read_unlock();
- return cpumask;
+ cpumask = container_of(head, struct bpf_cpumask, rcu);
+ migrate_disable();
+ bpf_mem_cache_free(&bpf_cpumask_ma, cpumask);
+ migrate_enable();
}
/**
@@ -118,14 +102,8 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumas
*/
__bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
{
- if (!cpumask)
- return;
-
- if (refcount_dec_and_test(&cpumask->usage)) {
- migrate_disable();
- bpf_mem_cache_free(&bpf_cpumask_ma, cpumask);
- migrate_enable();
- }
+ if (refcount_dec_and_test(&cpumask->usage))
+ call_rcu(&cpumask->rcu, cpumask_free_cb);
}
/**
@@ -424,9 +402,8 @@ __diag_pop();
BTF_SET8_START(cpumask_kfunc_btf_ids)
BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU)
BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU)
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 19b036a228f7..802692fa3905 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -809,7 +809,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
kfree(dev);
}
-static int dev_map_delete_elem(struct bpf_map *map, void *key)
+static long dev_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
@@ -826,7 +826,7 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
return 0;
}
-static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
+static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
@@ -897,8 +897,8 @@ err_out:
return ERR_PTR(-EINVAL);
}
-static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
- void *key, void *value, u64 map_flags)
+static long __dev_map_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev, *old_dev;
@@ -939,15 +939,15 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
return 0;
}
-static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
return __dev_map_update_elem(current->nsproxy->net_ns,
map, key, value, map_flags);
}
-static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
- void *key, void *value, u64 map_flags)
+static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev, *old_dev;
@@ -999,21 +999,21 @@ out_err:
return err;
}
-static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
return __dev_map_hash_update_elem(current->nsproxy->net_ns,
map, key, value, map_flags);
}
-static int dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
+static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
return __bpf_xdp_redirect_map(map, ifindex, flags,
BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
__dev_map_lookup_elem);
}
-static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
+static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
return __bpf_xdp_redirect_map(map, ifindex, flags,
BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 0df4b0c10f59..00c253b84bf5 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -607,6 +607,8 @@ free_htab:
static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
{
+ if (likely(key_len % 4 == 0))
+ return jhash2(key, key_len / 4, hashrnd);
return jhash(key, key_len, hashrnd);
}
@@ -1073,8 +1075,8 @@ static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
}
/* Called from syscall or from eBPF program */
-static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
@@ -1175,8 +1177,8 @@ static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
bpf_lru_push_free(&htab->lru, &elem->lru_node);
}
-static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new, *l_old = NULL;
@@ -1242,9 +1244,9 @@ err:
return ret;
}
-static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags,
- bool onallcpus)
+static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool onallcpus)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
@@ -1297,9 +1299,9 @@ err:
return ret;
}
-static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags,
- bool onallcpus)
+static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool onallcpus)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
@@ -1364,21 +1366,21 @@ err:
return ret;
}
-static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}
-static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
false);
}
/* Called from syscall or from eBPF program */
-static int htab_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_nulls_head *head;
@@ -1414,7 +1416,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
return ret;
}
-static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_nulls_head *head;
@@ -2134,8 +2136,8 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
};
-static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
- void *callback_ctx, u64 flags)
+static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+ void *callback_ctx, u64 flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_nulls_head *head;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 637ac4e92e75..f04e60a4847f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -18,6 +18,7 @@
#include <linux/pid_namespace.h>
#include <linux/poison.h>
#include <linux/proc_ns.h>
+#include <linux/sched/task.h>
#include <linux/security.h>
#include <linux/btf_ids.h>
#include <linux/bpf_mem_alloc.h>
@@ -257,7 +258,7 @@ BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
goto err_clear;
/* Verifier guarantees that size > 0 */
- strscpy(buf, task->comm, size);
+ strscpy_pad(buf, task->comm, size);
return 0;
err_clear:
memset(buf, 0, size);
@@ -571,7 +572,7 @@ static const struct bpf_func_proto bpf_strncmp_proto = {
.func = bpf_strncmp,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_PTR_TO_CONST_STR,
};
@@ -1896,14 +1897,19 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
return p;
}
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
+{
+ if (rec)
+ bpf_obj_free_fields(rec, p);
+ bpf_mem_free(&bpf_global_ma, p);
+}
+
__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
{
struct btf_struct_meta *meta = meta__ign;
void *p = p__alloc;
- if (meta)
- bpf_obj_free_fields(meta->record, p);
- bpf_mem_free(&bpf_global_ma, p);
+ __bpf_obj_drop_impl(p, meta ? meta->record : NULL);
}
static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail)
@@ -2008,73 +2014,8 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
*/
__bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
{
- return get_task_struct(p);
-}
-
-/**
- * bpf_task_acquire_not_zero - Acquire a reference to a rcu task object. A task
- * acquired by this kfunc which is not stored in a map as a kptr, must be
- * released by calling bpf_task_release().
- * @p: The task on which a reference is being acquired.
- */
-__bpf_kfunc struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p)
-{
- /* For the time being this function returns NULL, as it's not currently
- * possible to safely acquire a reference to a task with RCU protection
- * using get_task_struct() and put_task_struct(). This is due to the
- * slightly odd mechanics of p->rcu_users, and how task RCU protection
- * works.
- *
- * A struct task_struct is refcounted by two different refcount_t
- * fields:
- *
- * 1. p->usage: The "true" refcount field which tracks a task's
- * lifetime. The task is freed as soon as this
- * refcount drops to 0.
- *
- * 2. p->rcu_users: An "RCU users" refcount field which is statically
- * initialized to 2, and is co-located in a union with
- * a struct rcu_head field (p->rcu). p->rcu_users
- * essentially encapsulates a single p->usage
- * refcount, and when p->rcu_users goes to 0, an RCU
- * callback is scheduled on the struct rcu_head which
- * decrements the p->usage refcount.
- *
- * There are two important implications to this task refcounting logic
- * described above. The first is that
- * refcount_inc_not_zero(&p->rcu_users) cannot be used anywhere, as
- * after the refcount goes to 0, the RCU callback being scheduled will
- * cause the memory backing the refcount to again be nonzero due to the
- * fields sharing a union. The other is that we can't rely on RCU to
- * guarantee that a task is valid in a BPF program. This is because a
- * task could have already transitioned to being in the TASK_DEAD
- * state, had its rcu_users refcount go to 0, and its rcu callback
- * invoked in which it drops its single p->usage reference. At this
- * point the task will be freed as soon as the last p->usage reference
- * goes to 0, without waiting for another RCU gp to elapse. The only
- * way that a BPF program can guarantee that a task is valid is in this
- * scenario is to hold a p->usage refcount itself.
- *
- * Until we're able to resolve this issue, either by pulling
- * p->rcu_users and p->rcu out of the union, or by getting rid of
- * p->usage and just using p->rcu_users for refcounting, we'll just
- * return NULL here.
- */
- return NULL;
-}
-
-/**
- * bpf_task_kptr_get - Acquire a reference on a struct task_struct kptr. A task
- * kptr acquired by this kfunc which is not subsequently stored in a map, must
- * be released by calling bpf_task_release().
- * @pp: A pointer to a task kptr on which a reference is being acquired.
- */
-__bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
-{
- /* We must return NULL here until we have clarity on how to properly
- * leverage RCU for ensuring a task's lifetime. See the comment above
- * in bpf_task_acquire_not_zero() for more details.
- */
+ if (refcount_inc_not_zero(&p->rcu_users))
+ return p;
return NULL;
}
@@ -2084,10 +2025,7 @@ __bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
*/
__bpf_kfunc void bpf_task_release(struct task_struct *p)
{
- if (!p)
- return;
-
- put_task_struct(p);
+ put_task_struct_rcu_user(p);
}
#ifdef CONFIG_CGROUPS
@@ -2099,39 +2037,7 @@ __bpf_kfunc void bpf_task_release(struct task_struct *p)
*/
__bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
{
- cgroup_get(cgrp);
- return cgrp;
-}
-
-/**
- * bpf_cgroup_kptr_get - Acquire a reference on a struct cgroup kptr. A cgroup
- * kptr acquired by this kfunc which is not subsequently stored in a map, must
- * be released by calling bpf_cgroup_release().
- * @cgrpp: A pointer to a cgroup kptr on which a reference is being acquired.
- */
-__bpf_kfunc struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
-{
- struct cgroup *cgrp;
-
- rcu_read_lock();
- /* Another context could remove the cgroup from the map and release it
- * at any time, including after we've done the lookup above. This is
- * safe because we're in an RCU read region, so the cgroup is
- * guaranteed to remain valid until at least the rcu_read_unlock()
- * below.
- */
- cgrp = READ_ONCE(*cgrpp);
-
- if (cgrp && !cgroup_tryget(cgrp))
- /* If the cgroup had been removed from the map and freed as
- * described above, cgroup_tryget() will return false. The
- * cgroup will be freed at some point after the current RCU gp
- * has ended, so just return NULL to the user.
- */
- cgrp = NULL;
- rcu_read_unlock();
-
- return cgrp;
+ return cgroup_tryget(cgrp) ? cgrp : NULL;
}
/**
@@ -2143,9 +2049,6 @@ __bpf_kfunc struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
*/
__bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
{
- if (!cgrp)
- return;
-
cgroup_put(cgrp);
}
@@ -2200,7 +2103,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
rcu_read_lock();
p = find_task_by_pid_ns(pid, &init_pid_ns);
if (p)
- bpf_task_acquire(p);
+ p = bpf_task_acquire(p);
rcu_read_unlock();
return p;
@@ -2372,17 +2275,14 @@ BTF_ID_FLAGS(func, bpf_list_push_front)
BTF_ID_FLAGS(func, bpf_list_push_back)
BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE)
BTF_ID_FLAGS(func, bpf_rbtree_add)
BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
#ifdef CONFIG_CGROUPS
-BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
@@ -2411,6 +2311,9 @@ BTF_ID_FLAGS(func, bpf_rcu_read_lock)
BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
BTF_SET8_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index a993560f200a..4c7bbec4a9e4 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -141,8 +141,8 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key)
return &READ_ONCE(storage->buf)->data[0];
}
-static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long cgroup_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
struct bpf_cgroup_storage *storage;
struct bpf_storage_buffer *new;
@@ -348,7 +348,7 @@ static void cgroup_storage_map_free(struct bpf_map *_map)
bpf_map_area_free(map);
}
-static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
+static long cgroup_storage_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
new file mode 100644
index 000000000000..046ddff37a76
--- /dev/null
+++ b/kernel/bpf/log.c
@@ -0,0 +1,330 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+ */
+#include <uapi/linux/btf.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/math64.h>
+
+static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
+{
+ /* ubuf and len_total should both be specified (or not) together */
+ if (!!log->ubuf != !!log->len_total)
+ return false;
+ /* log buf without log_level is meaningless */
+ if (log->ubuf && log->level == 0)
+ return false;
+ if (log->level & ~BPF_LOG_MASK)
+ return false;
+ if (log->len_total > UINT_MAX >> 2)
+ return false;
+ return true;
+}
+
+int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
+ char __user *log_buf, u32 log_size)
+{
+ log->level = log_level;
+ log->ubuf = log_buf;
+ log->len_total = log_size;
+
+ /* log attributes have to be sane */
+ if (!bpf_verifier_log_attr_valid(log))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len)
+{
+ /* add_len includes terminal \0, so no need for +1. */
+ u64 len = log->end_pos + add_len;
+
+ /* log->len_max could be larger than our current len due to
+ * bpf_vlog_reset() calls, so we maintain the max of any length at any
+ * previous point
+ */
+ if (len > UINT_MAX)
+ log->len_max = UINT_MAX;
+ else if (len > log->len_max)
+ log->len_max = len;
+}
+
+void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
+ va_list args)
+{
+ u64 cur_pos;
+ u32 new_n, n;
+
+ n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
+
+ WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
+ "verifier log line truncated - local buffer too short\n");
+
+ if (log->level == BPF_LOG_KERNEL) {
+ bool newline = n > 0 && log->kbuf[n - 1] == '\n';
+
+ pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
+ return;
+ }
+
+ n += 1; /* include terminating zero */
+ bpf_vlog_update_len_max(log, n);
+
+ if (log->level & BPF_LOG_FIXED) {
+ /* check if we have at least something to put into user buf */
+ new_n = 0;
+ if (log->end_pos < log->len_total) {
+ new_n = min_t(u32, log->len_total - log->end_pos, n);
+ log->kbuf[new_n - 1] = '\0';
+ }
+
+ cur_pos = log->end_pos;
+ log->end_pos += n - 1; /* don't count terminating '\0' */
+
+ if (log->ubuf && new_n &&
+ copy_to_user(log->ubuf + cur_pos, log->kbuf, new_n))
+ goto fail;
+ } else {
+ u64 new_end, new_start;
+ u32 buf_start, buf_end, new_n;
+
+ new_end = log->end_pos + n;
+ if (new_end - log->start_pos >= log->len_total)
+ new_start = new_end - log->len_total;
+ else
+ new_start = log->start_pos;
+
+ log->start_pos = new_start;
+ log->end_pos = new_end - 1; /* don't count terminating '\0' */
+
+ if (!log->ubuf)
+ return;
+
+ new_n = min(n, log->len_total);
+ cur_pos = new_end - new_n;
+ div_u64_rem(cur_pos, log->len_total, &buf_start);
+ div_u64_rem(new_end, log->len_total, &buf_end);
+ /* new_end and buf_end are exclusive indices, so if buf_end is
+ * exactly zero, then it actually points right to the end of
+ * ubuf and there is no wrap around
+ */
+ if (buf_end == 0)
+ buf_end = log->len_total;
+
+ /* if buf_start > buf_end, we wrapped around;
+ * if buf_start == buf_end, then we fill ubuf completely; we
+ * can't have buf_start == buf_end to mean that there is
+ * nothing to write, because we always write at least
+ * something, even if terminal '\0'
+ */
+ if (buf_start < buf_end) {
+ /* message fits within contiguous chunk of ubuf */
+ if (copy_to_user(log->ubuf + buf_start,
+ log->kbuf + n - new_n,
+ buf_end - buf_start))
+ goto fail;
+ } else {
+ /* message wraps around the end of ubuf, copy in two chunks */
+ if (copy_to_user(log->ubuf + buf_start,
+ log->kbuf + n - new_n,
+ log->len_total - buf_start))
+ goto fail;
+ if (copy_to_user(log->ubuf,
+ log->kbuf + n - buf_end,
+ buf_end))
+ goto fail;
+ }
+ }
+
+ return;
+fail:
+ log->ubuf = NULL;
+}
+
+void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos)
+{
+ char zero = 0;
+ u32 pos;
+
+ if (WARN_ON_ONCE(new_pos > log->end_pos))
+ return;
+
+ if (!bpf_verifier_log_needed(log) || log->level == BPF_LOG_KERNEL)
+ return;
+
+ /* if position to which we reset is beyond current log window,
+ * then we didn't preserve any useful content and should adjust
+ * start_pos to end up with an empty log (start_pos == end_pos)
+ */
+ log->end_pos = new_pos;
+ if (log->end_pos < log->start_pos)
+ log->start_pos = log->end_pos;
+
+ if (!log->ubuf)
+ return;
+
+ if (log->level & BPF_LOG_FIXED)
+ pos = log->end_pos + 1;
+ else
+ div_u64_rem(new_pos, log->len_total, &pos);
+
+ if (pos < log->len_total && put_user(zero, log->ubuf + pos))
+ log->ubuf = NULL;
+}
+
+static void bpf_vlog_reverse_kbuf(char *buf, int len)
+{
+ int i, j;
+
+ for (i = 0, j = len - 1; i < j; i++, j--)
+ swap(buf[i], buf[j]);
+}
+
+static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int end)
+{
+ /* we split log->kbuf into two equal parts for both ends of array */
+ int n = sizeof(log->kbuf) / 2, nn;
+ char *lbuf = log->kbuf, *rbuf = log->kbuf + n;
+
+ /* Read ubuf's section [start, end) two chunks at a time, from left
+ * and right side; within each chunk, swap all the bytes; after that
+ * reverse the order of lbuf and rbuf and write result back to ubuf.
+ * This way we'll end up with swapped contents of specified
+ * [start, end) ubuf segment.
+ */
+ while (end - start > 1) {
+ nn = min(n, (end - start ) / 2);
+
+ if (copy_from_user(lbuf, log->ubuf + start, nn))
+ return -EFAULT;
+ if (copy_from_user(rbuf, log->ubuf + end - nn, nn))
+ return -EFAULT;
+
+ bpf_vlog_reverse_kbuf(lbuf, nn);
+ bpf_vlog_reverse_kbuf(rbuf, nn);
+
+ /* we write lbuf to the right end of ubuf, while rbuf to the
+ * left one to end up with properly reversed overall ubuf
+ */
+ if (copy_to_user(log->ubuf + start, rbuf, nn))
+ return -EFAULT;
+ if (copy_to_user(log->ubuf + end - nn, lbuf, nn))
+ return -EFAULT;
+
+ start += nn;
+ end -= nn;
+ }
+
+ return 0;
+}
+
+int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual)
+{
+ u32 sublen;
+ int err;
+
+ *log_size_actual = 0;
+ if (!log || log->level == 0 || log->level == BPF_LOG_KERNEL)
+ return 0;
+
+ if (!log->ubuf)
+ goto skip_log_rotate;
+ /* If we never truncated log, there is nothing to move around. */
+ if (log->start_pos == 0)
+ goto skip_log_rotate;
+
+ /* Otherwise we need to rotate log contents to make it start from the
+ * buffer beginning and be a continuous zero-terminated string. Note
+ * that if log->start_pos != 0 then we definitely filled up entire log
+ * buffer with no gaps, and we just need to shift buffer contents to
+ * the left by (log->start_pos % log->len_total) bytes.
+ *
+ * Unfortunately, user buffer could be huge and we don't want to
+ * allocate temporary kernel memory of the same size just to shift
+ * contents in a straightforward fashion. Instead, we'll be clever and
+ * do in-place array rotation. This is a leetcode-style problem, which
+ * could be solved by three rotations.
+ *
+ * Let's say we have log buffer that has to be shifted left by 7 bytes
+ * (spaces and vertical bar is just for demonstrative purposes):
+ * E F G H I J K | A B C D
+ *
+ * First, we reverse entire array:
+ * D C B A | K J I H G F E
+ *
+ * Then we rotate first 4 bytes (DCBA) and separately last 7 bytes
+ * (KJIHGFE), resulting in a properly rotated array:
+ * A B C D | E F G H I J K
+ *
+ * We'll utilize log->kbuf to read user memory chunk by chunk, swap
+ * bytes, and write them back. Doing it byte-by-byte would be
+ * unnecessarily inefficient. Altogether we are going to read and
+ * write each byte twice, for total 4 memory copies between kernel and
+ * user space.
+ */
+
+ /* length of the chopped off part that will be the beginning;
+ * len(ABCD) in the example above
+ */
+ div_u64_rem(log->start_pos, log->len_total, &sublen);
+ sublen = log->len_total - sublen;
+
+ err = bpf_vlog_reverse_ubuf(log, 0, log->len_total);
+ err = err ?: bpf_vlog_reverse_ubuf(log, 0, sublen);
+ err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total);
+ if (err)
+ log->ubuf = NULL;
+
+skip_log_rotate:
+ *log_size_actual = log->len_max;
+
+ /* properly initialized log has either both ubuf!=NULL and len_total>0
+ * or ubuf==NULL and len_total==0, so if this condition doesn't hold,
+ * we got a fault somewhere along the way, so report it back
+ */
+ if (!!log->ubuf != !!log->len_total)
+ return -EFAULT;
+
+ /* did truncation actually happen? */
+ if (log->ubuf && log->len_max > log->len_total)
+ return -ENOSPC;
+
+ return 0;
+}
+
+/* log_level controls verbosity level of eBPF verifier.
+ * bpf_verifier_log_write() is used to dump the verification trace to the log,
+ * so the user can figure out what's wrong with the program
+ */
+__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(&env->log, fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
+
+__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_log);
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index dc23f2ac9cde..e0d3ddf2037a 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -300,8 +300,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
}
/* Called from syscall or from eBPF program */
-static int trie_update_elem(struct bpf_map *map,
- void *_key, void *value, u64 flags)
+static long trie_update_elem(struct bpf_map *map,
+ void *_key, void *value, u64 flags)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
@@ -431,7 +431,7 @@ out:
}
/* Called from syscall or from eBPF program */
-static int trie_delete_elem(struct bpf_map *map, void *_key)
+static long trie_delete_elem(struct bpf_map *map, void *_key)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
struct bpf_lpm_trie_key *key = _key;
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 5fcdacbb8439..410637c225fb 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -121,15 +121,8 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head)
return entry;
}
-static void *__alloc(struct bpf_mem_cache *c, int node)
+static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
- /* Allocate, but don't deplete atomic reserves that typical
- * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
- * will allocate from the current numa node which is what we
- * want here.
- */
- gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;
-
if (c->percpu_size) {
void **obj = kmalloc_node(c->percpu_size, flags, node);
void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
@@ -185,7 +178,12 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
*/
obj = __llist_del_first(&c->free_by_rcu);
if (!obj) {
- obj = __alloc(c, node);
+ /* Allocate, but don't deplete atomic reserves that typical
+ * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
+ * will allocate from the current numa node which is what we
+ * want here.
+ */
+ obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT);
if (!obj)
break;
}
@@ -676,3 +674,46 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
unit_free(this_cpu_ptr(ma->cache), ptr);
}
+
+/* Directly does a kfree() without putting 'ptr' back to the free_llist
+ * for reuse and without waiting for a rcu_tasks_trace gp.
+ * The caller must first go through the rcu_tasks_trace gp for 'ptr'
+ * before calling bpf_mem_cache_raw_free().
+ * It could be used when the rcu_tasks_trace callback does not have
+ * a hold on the original bpf_mem_alloc object that allocated the
+ * 'ptr'. This should only be used in the uncommon code path.
+ * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
+ * and may affect performance.
+ */
+void bpf_mem_cache_raw_free(void *ptr)
+{
+ if (!ptr)
+ return;
+
+ kfree(ptr - LLIST_NODE_SZ);
+}
+
+/* When flags == GFP_KERNEL, it signals that the caller will not cause
+ * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
+ * kmalloc if the free_llist is empty.
+ */
+void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
+{
+ struct bpf_mem_cache *c;
+ void *ret;
+
+ c = this_cpu_ptr(ma->cache);
+
+ ret = unit_alloc(c);
+ if (!ret && flags == GFP_KERNEL) {
+ struct mem_cgroup *memcg, *old_memcg;
+
+ memcg = get_memcg(c);
+ old_memcg = set_active_memcg(memcg);
+ ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+ }
+
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 63ecbbcb349d..601609164ef3 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -95,7 +95,7 @@ static void queue_stack_map_free(struct bpf_map *map)
bpf_map_area_free(qs);
}
-static int __queue_map_get(struct bpf_map *map, void *value, bool delete)
+static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
unsigned long flags;
@@ -124,7 +124,7 @@ out:
}
-static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
+static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
unsigned long flags;
@@ -156,32 +156,32 @@ out:
}
/* Called from syscall or from eBPF program */
-static int queue_map_peek_elem(struct bpf_map *map, void *value)
+static long queue_map_peek_elem(struct bpf_map *map, void *value)
{
return __queue_map_get(map, value, false);
}
/* Called from syscall or from eBPF program */
-static int stack_map_peek_elem(struct bpf_map *map, void *value)
+static long stack_map_peek_elem(struct bpf_map *map, void *value)
{
return __stack_map_get(map, value, false);
}
/* Called from syscall or from eBPF program */
-static int queue_map_pop_elem(struct bpf_map *map, void *value)
+static long queue_map_pop_elem(struct bpf_map *map, void *value)
{
return __queue_map_get(map, value, true);
}
/* Called from syscall or from eBPF program */
-static int stack_map_pop_elem(struct bpf_map *map, void *value)
+static long stack_map_pop_elem(struct bpf_map *map, void *value)
{
return __stack_map_get(map, value, true);
}
/* Called from syscall or from eBPF program */
-static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
- u64 flags)
+static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
+ u64 flags)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
unsigned long irq_flags;
@@ -227,14 +227,14 @@ static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key)
}
/* Called from syscall or from eBPF program */
-static int queue_stack_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long queue_stack_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
return -EINVAL;
}
/* Called from syscall or from eBPF program */
-static int queue_stack_map_delete_elem(struct bpf_map *map, void *key)
+static long queue_stack_map_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 71cb72f5b733..cbf2d8d784b8 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -59,7 +59,7 @@ static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)
}
/* Called from syscall only */
-static int reuseport_array_delete_elem(struct bpf_map *map, void *key)
+static long reuseport_array_delete_elem(struct bpf_map *map, void *key)
{
struct reuseport_array *array = reuseport_array(map);
u32 index = *(u32 *)key;
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 0d2a45ff83f1..875ac9b698d9 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -242,13 +242,13 @@ static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
return ERR_PTR(-ENOTSUPP);
}
-static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 flags)
+static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 flags)
{
return -ENOTSUPP;
}
-static int ringbuf_map_delete_elem(struct bpf_map *map, void *key)
+static long ringbuf_map_delete_elem(struct bpf_map *map, void *key)
{
return -ENOTSUPP;
}
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 0f1d8dced933..b25fce425b2c 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -618,14 +618,14 @@ static int stack_map_get_next_key(struct bpf_map *map, void *key,
return 0;
}
-static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long stack_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
return -EINVAL;
}
/* Called from syscall or from eBPF program */
-static int stack_map_delete_elem(struct bpf_map *map, void *key)
+static long stack_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *old_bucket;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f406dfa13792..6d575505f89c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -520,14 +520,14 @@ static int btf_field_cmp(const void *a, const void *b)
}
struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
- enum btf_field_type type)
+ u32 field_mask)
{
struct btf_field *field;
- if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type))
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask))
return NULL;
field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
- if (!field || !(field->type & type))
+ if (!field || !(field->type & field_mask))
return NULL;
return field;
}
@@ -650,6 +650,8 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
bpf_timer_cancel_and_free(obj + rec->timer_off);
}
+extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+
void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
{
const struct btf_field *fields;
@@ -659,8 +661,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
return;
fields = rec->fields;
for (i = 0; i < rec->cnt; i++) {
+ struct btf_struct_meta *pointee_struct_meta;
const struct btf_field *field = &fields[i];
void *field_ptr = obj + field->offset;
+ void *xchgd_field;
switch (fields[i].type) {
case BPF_SPIN_LOCK:
@@ -672,7 +676,22 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
WRITE_ONCE(*(u64 *)field_ptr, 0);
break;
case BPF_KPTR_REF:
- field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0));
+ xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
+ if (!xchgd_field)
+ break;
+
+ if (!btf_is_kernel(field->kptr.btf)) {
+ pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
+ field->kptr.btf_id);
+ WARN_ON_ONCE(!pointee_struct_meta);
+ migrate_disable();
+ __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
+ pointee_struct_meta->record :
+ NULL);
+ migrate_enable();
+ } else {
+ field->kptr.dtor(xchgd_field);
+ }
break;
case BPF_LIST_HEAD:
if (WARN_ON_ONCE(rec->spin_lock_off < 0))
@@ -1287,8 +1306,10 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
return map;
}
-/* map_idr_lock should have been held */
-static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+/* map_idr_lock should have been held or the map should have been
+ * protected by rcu read lock.
+ */
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
{
int refold;
@@ -2051,6 +2072,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
{
bpf_prog_kallsyms_del_all(prog);
btf_put(prog->aux->btf);
+ module_put(prog->aux->mod);
kvfree(prog->aux->jited_linfo);
kvfree(prog->aux->linfo);
kfree(prog->aux->kfunc_tab);
@@ -2479,9 +2501,9 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size
+#define BPF_PROG_LOAD_LAST_FIELD log_true_size
-static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
+static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog, *dst_prog = NULL;
@@ -2631,7 +2653,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
goto free_prog_sec;
/* run eBPF verifier */
- err = bpf_check(&prog, attr, uattr);
+ err = bpf_check(&prog, attr, uattr, uattr_size);
if (err < 0)
goto free_used_maps;
@@ -2806,16 +2828,19 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
const struct bpf_prog *prog = link->prog;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"link_type:\t%s\n"
- "link_id:\t%u\n"
- "prog_tag:\t%s\n"
- "prog_id:\t%u\n",
+ "link_id:\t%u\n",
bpf_link_type_strs[link->type],
- link->id,
- prog_tag,
- prog->aux->id);
+ link->id);
+ if (prog) {
+ bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
+ seq_printf(m,
+ "prog_tag:\t%s\n"
+ "prog_id:\t%u\n",
+ prog_tag,
+ prog->aux->id);
+ }
if (link->ops->show_fdinfo)
link->ops->show_fdinfo(link, m);
}
@@ -3097,6 +3122,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
if (err)
goto out_unlock;
+ if (tgt_info.tgt_mod) {
+ module_put(prog->aux->mod);
+ prog->aux->mod = tgt_info.tgt_mod;
+ }
+
tr = bpf_trampoline_get(key, &tgt_info);
if (!tr) {
err = -ENOMEM;
@@ -4290,7 +4320,8 @@ static int bpf_link_get_info_by_fd(struct file *file,
info.type = link->type;
info.id = link->id;
- info.prog_id = link->prog->aux->id;
+ if (link->prog)
+ info.prog_id = link->prog->aux->id;
if (link->ops->fill_link_info) {
err = link->ops->fill_link_info(link, &info);
@@ -4340,9 +4371,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
return err;
}
-#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size
-static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
+static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
if (CHECK_ATTR(BPF_BTF_LOAD))
return -EINVAL;
@@ -4350,7 +4381,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
if (!bpf_capable())
return -EPERM;
- return btf_new_fd(attr, uattr);
+ return btf_new_fd(attr, uattr, uattr_size);
}
#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
@@ -4553,6 +4584,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
if (CHECK_ATTR(BPF_LINK_CREATE))
return -EINVAL;
+ if (attr->link_create.attach_type == BPF_STRUCT_OPS)
+ return bpf_struct_ops_link_create(attr);
+
prog = bpf_prog_get(attr->link_create.prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
@@ -4651,6 +4685,35 @@ out:
return ret;
}
+static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
+{
+ struct bpf_map *new_map, *old_map = NULL;
+ int ret;
+
+ new_map = bpf_map_get(attr->link_update.new_map_fd);
+ if (IS_ERR(new_map))
+ return PTR_ERR(new_map);
+
+ if (attr->link_update.flags & BPF_F_REPLACE) {
+ old_map = bpf_map_get(attr->link_update.old_map_fd);
+ if (IS_ERR(old_map)) {
+ ret = PTR_ERR(old_map);
+ goto out_put;
+ }
+ } else if (attr->link_update.old_map_fd) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ ret = link->ops->update_map(link, new_map, old_map);
+
+ if (old_map)
+ bpf_map_put(old_map);
+out_put:
+ bpf_map_put(new_map);
+ return ret;
+}
+
#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
static int link_update(union bpf_attr *attr)
@@ -4671,6 +4734,11 @@ static int link_update(union bpf_attr *attr)
if (IS_ERR(link))
return PTR_ERR(link);
+ if (link->ops->update_map) {
+ ret = link_update_map(link, attr);
+ goto out_put_link;
+ }
+
new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
if (IS_ERR(new_prog)) {
ret = PTR_ERR(new_prog);
@@ -4991,7 +5059,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
err = map_freeze(&attr);
break;
case BPF_PROG_LOAD:
- err = bpf_prog_load(&attr, uattr);
+ err = bpf_prog_load(&attr, uattr, size);
break;
case BPF_OBJ_PIN:
err = bpf_obj_pin(&attr);
@@ -5036,7 +5104,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
err = bpf_raw_tracepoint_open(&attr);
break;
case BPF_BTF_LOAD:
- err = bpf_btf_load(&attr, uattr);
+ err = bpf_btf_load(&attr, uattr, size);
break;
case BPF_BTF_GET_FD_BY_ID:
err = bpf_btf_get_fd_by_id(&attr);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index d0ed7d6f5eec..f61d5138b12b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -9,7 +9,6 @@
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>
-#include <linux/module.h>
#include <linux/static_call.h>
#include <linux/bpf_verifier.h>
#include <linux/bpf_lsm.h>
@@ -172,26 +171,6 @@ out:
return tr;
}
-static int bpf_trampoline_module_get(struct bpf_trampoline *tr)
-{
- struct module *mod;
- int err = 0;
-
- preempt_disable();
- mod = __module_text_address((unsigned long) tr->func.addr);
- if (mod && !try_module_get(mod))
- err = -ENOENT;
- preempt_enable();
- tr->mod = mod;
- return err;
-}
-
-static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
-{
- module_put(tr->mod);
- tr->mod = NULL;
-}
-
static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
{
void *ip = tr->func.addr;
@@ -202,8 +181,6 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
else
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
- if (!ret)
- bpf_trampoline_module_put(tr);
return ret;
}
@@ -238,9 +215,6 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
tr->func.ftrace_managed = true;
}
- if (bpf_trampoline_module_get(tr))
- return -ENOENT;
-
if (tr->func.ftrace_managed) {
ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
ret = register_ftrace_direct_multi(tr->fops, (long)new_addr);
@@ -248,8 +222,6 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
}
- if (ret)
- bpf_trampoline_module_put(tr);
return ret;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b2116ca78d9a..d6db6de3e9ea 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -24,6 +24,7 @@
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
#include <linux/poison.h>
+#include <linux/module.h>
#include "disasm.h"
@@ -302,6 +303,10 @@ struct bpf_kfunc_call_arg_meta {
enum bpf_dynptr_type type;
u32 id;
} initialized_dynptr;
+ struct {
+ u8 spi;
+ u8 frameno;
+ } iter;
u64 mem_size;
};
@@ -330,61 +335,6 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
return &linfo[i - 1];
}
-void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
- va_list args)
-{
- unsigned int n;
-
- n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
-
- WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
- "verifier log line truncated - local buffer too short\n");
-
- if (log->level == BPF_LOG_KERNEL) {
- bool newline = n > 0 && log->kbuf[n - 1] == '\n';
-
- pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
- return;
- }
-
- n = min(log->len_total - log->len_used - 1, n);
- log->kbuf[n] = '\0';
- if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
- log->len_used += n;
- else
- log->ubuf = NULL;
-}
-
-static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
-{
- char zero = 0;
-
- if (!bpf_verifier_log_needed(log))
- return;
-
- log->len_used = new_pos;
- if (put_user(zero, log->ubuf + new_pos))
- log->ubuf = NULL;
-}
-
-/* log_level controls verbosity level of eBPF verifier.
- * bpf_verifier_log_write() is used to dump the verification trace to the log,
- * so the user can figure out what's wrong with the program
- */
-__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
- const char *fmt, ...)
-{
- va_list args;
-
- if (!bpf_verifier_log_needed(&env->log))
- return;
-
- va_start(args, fmt);
- bpf_verifier_vlog(&env->log, fmt, args);
- va_end(args);
-}
-EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
-
__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
{
struct bpf_verifier_env *env = private_data;
@@ -398,20 +348,6 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
va_end(args);
}
-__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
- const char *fmt, ...)
-{
- va_list args;
-
- if (!bpf_verifier_log_needed(log))
- return;
-
- va_start(args, fmt);
- bpf_verifier_vlog(log, fmt, args);
- va_end(args);
-}
-EXPORT_SYMBOL_GPL(bpf_log);
-
static const char *ltrim(const char *s)
{
while (isspace(*s))
@@ -481,8 +417,17 @@ static bool type_is_sk_pointer(enum bpf_reg_type type)
type == PTR_TO_XDP_SOCK;
}
+static bool type_may_be_null(u32 type)
+{
+ return type & PTR_MAYBE_NULL;
+}
+
static bool reg_type_not_null(enum bpf_reg_type type)
{
+ if (type_may_be_null(type))
+ return false;
+
+ type = base_type(type);
return type == PTR_TO_SOCKET ||
type == PTR_TO_TCP_SOCK ||
type == PTR_TO_MAP_VALUE ||
@@ -526,11 +471,6 @@ static bool type_is_rdonly_mem(u32 type)
return type & MEM_RDONLY;
}
-static bool type_may_be_null(u32 type)
-{
- return type & PTR_MAYBE_NULL;
-}
-
static bool is_acquire_function(enum bpf_func_id func_id,
const struct bpf_map *map)
{
@@ -668,6 +608,7 @@ static char slot_type_char[] = {
[STACK_MISC] = 'm',
[STACK_ZERO] = '0',
[STACK_DYNPTR] = 'd',
+ [STACK_ITER] = 'i',
};
static void print_liveness(struct bpf_verifier_env *env,
@@ -742,7 +683,12 @@ static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *re
return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS);
}
-static const char *kernel_type_name(const struct btf* btf, u32 id)
+static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots)
+{
+ return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
+}
+
+static const char *btf_type_name(const struct btf *btf, u32 id)
{
return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
}
@@ -766,6 +712,30 @@ static const char *dynptr_type_str(enum bpf_dynptr_type type)
}
}
+static const char *iter_type_str(const struct btf *btf, u32 btf_id)
+{
+ if (!btf || btf_id == 0)
+ return "<invalid>";
+
+ /* we already validated that type is valid and has conforming name */
+ return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
+}
+
+static const char *iter_state_str(enum bpf_iter_state state)
+{
+ switch (state) {
+ case BPF_ITER_STATE_ACTIVE:
+ return "active";
+ case BPF_ITER_STATE_DRAINED:
+ return "drained";
+ case BPF_ITER_STATE_INVALID:
+ return "<invalid>";
+ default:
+ WARN_ONCE(1, "unknown iter state %d\n", state);
+ return "<unknown>";
+ }
+}
+
static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
{
env->scratched_regs |= 1U << regno;
@@ -1118,6 +1088,157 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
}
}
+static void __mark_reg_known_zero(struct bpf_reg_state *reg);
+
+static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int insn_idx,
+ struct btf *btf, u32 btf_id, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j, id;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ id = acquire_reference_state(env, insn_idx);
+ if (id < 0)
+ return id;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ __mark_reg_known_zero(st);
+ st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+ st->live |= REG_LIVE_WRITTEN;
+ st->ref_obj_id = i == 0 ? id : 0;
+ st->iter.btf = btf;
+ st->iter.btf_id = btf_id;
+ st->iter.state = BPF_ITER_STATE_ACTIVE;
+ st->iter.depth = 0;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ slot->slot_type[j] = STACK_ITER;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
+static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ if (i == 0)
+ WARN_ON_ONCE(release_reference(env, st->ref_obj_id));
+
+ __mark_reg_not_init(env, st);
+
+ /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
+ st->live |= REG_LIVE_WRITTEN;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ slot->slot_type[j] = STACK_INVALID;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
+static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+ * will do check_mem_access to check and update stack bounds later, so
+ * return true for that case.
+ */
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi == -ERANGE)
+ return true;
+ if (spi < 0)
+ return false;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (slot->slot_type[j] == STACK_ITER)
+ return false;
+ }
+
+ return true;
+}
+
+static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ struct btf *btf, u32 btf_id, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return false;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ /* only main (first) slot has ref_obj_id set */
+ if (i == 0 && !st->ref_obj_id)
+ return false;
+ if (i != 0 && st->ref_obj_id)
+ return false;
+ if (st->iter.btf != btf || st->iter.btf_id != btf_id)
+ return false;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (slot->slot_type[j] != STACK_ITER)
+ return false;
+ }
+
+ return true;
+}
+
+/* Check if given stack slot is "special":
+ * - spilled register state (STACK_SPILL);
+ * - dynptr state (STACK_DYNPTR);
+ * - iter state (STACK_ITER).
+ */
+static bool is_stack_slot_special(const struct bpf_stack_state *stack)
+{
+ enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1];
+
+ switch (type) {
+ case STACK_SPILL:
+ case STACK_DYNPTR:
+ case STACK_ITER:
+ return true;
+ case STACK_INVALID:
+ case STACK_MISC:
+ case STACK_ZERO:
+ return false;
+ default:
+ WARN_ONCE(1, "unknown stack slot type %d\n", type);
+ return true;
+ }
+}
+
/* The reg state of a pointer or a bounded scalar was saved when
* it was spilled to the stack.
*/
@@ -1164,7 +1285,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
verbose(env, "%s", reg_type_str(env, t));
if (base_type(t) == PTR_TO_BTF_ID)
- verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
+ verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
verbose(env, "(");
/*
* _a stands for append, was shortened to avoid multiline statements below.
@@ -1267,6 +1388,19 @@ static void print_verifier_state(struct bpf_verifier_env *env,
if (reg->ref_obj_id)
verbose(env, "(ref_id=%d)", reg->ref_obj_id);
break;
+ case STACK_ITER:
+ /* only main slot has ref_obj_id set; skip others */
+ reg = &state->stack[i].spilled_ptr;
+ if (!reg->ref_obj_id)
+ continue;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
+ verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
+ iter_type_str(reg->iter.btf, reg->iter.btf_id),
+ reg->ref_obj_id, iter_state_str(reg->iter.state),
+ reg->iter.depth);
+ break;
case STACK_MISC:
case STACK_ZERO:
default:
@@ -1305,10 +1439,10 @@ static inline u32 vlog_alignment(u32 pos)
static void print_insn_state(struct bpf_verifier_env *env,
const struct bpf_func_state *state)
{
- if (env->prev_log_len && env->prev_log_len == env->log.len_used) {
+ if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
/* remove new line character */
- bpf_vlog_reset(&env->log, env->prev_log_len - 1);
- verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_len), ' ');
+ bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
+ verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
} else {
verbose(env, "%d:", env->insn_idx);
}
@@ -1616,7 +1750,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
- elem->log_pos = env->log.len_used;
+ elem->log_pos = env->log.end_pos;
env->head = elem;
env->stack_size++;
err = copy_verifier_state(&elem->st, cur);
@@ -1946,9 +2080,9 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)
struct tnum var64_off = tnum_intersect(reg->var_off,
tnum_range(reg->umin_value,
reg->umax_value));
- struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off),
- tnum_range(reg->u32_min_value,
- reg->u32_max_value));
+ struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off),
+ tnum_range(reg->u32_min_value,
+ reg->u32_max_value));
reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}
@@ -2152,7 +2286,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
- elem->log_pos = env->log.len_used;
+ elem->log_pos = env->log.end_pos;
env->head = elem;
env->stack_size++;
if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
@@ -2710,6 +2844,25 @@ static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *
state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64);
}
+static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int err, i;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
+
+ err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
+ if (err)
+ return err;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
/* This function is supposed to be used by the following 32-bit optimization
* code only. It returns TRUE if the source or destination register operates
* on 64-bit, otherwise return FALSE.
@@ -3691,8 +3844,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
/* regular write of data into stack destroys any spilled ptr */
state->stack[spi].spilled_ptr.type = NOT_INIT;
- /* Mark slots as STACK_MISC if they belonged to spilled ptr. */
- if (is_spilled_reg(&state->stack[spi]))
+ /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */
+ if (is_stack_slot_special(&state->stack[spi]))
for (i = 0; i < BPF_REG_SIZE; i++)
scrub_spilled_slot(&state->stack[spi].slot_type[i]);
@@ -4085,17 +4238,13 @@ static int check_stack_read(struct bpf_verifier_env *env,
}
/* Variable offset is prohibited for unprivileged mode for simplicity
* since it requires corresponding support in Spectre masking for stack
- * ALU. See also retrieve_ptr_limit().
+ * ALU. See also retrieve_ptr_limit(). The check in
+ * check_stack_access_for_ptr_arithmetic() called by
+ * adjust_ptr_min_max_vals() prevents users from creating stack pointers
+ * with variable offsets, therefore no check is required here. Further,
+ * just checking it here would be insufficient as speculative stack
+ * writes could still lead to unsafe speculative behaviour.
*/
- if (!env->bypass_spec_v1 && var_off) {
- char tn_buf[48];
-
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
- ptr_regno, tn_buf);
- return -EACCES;
- }
-
if (!var_off) {
off += reg->var_off.value;
err = check_stack_read_fixed_off(env, state, off, size,
@@ -4301,7 +4450,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
struct btf_field *kptr_field,
struct bpf_reg_state *reg, u32 regno)
{
- const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
+ const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
const char *reg_name = "";
@@ -4317,7 +4466,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
return -EINVAL;
}
/* We need to verify reg->type and reg->btf, before accessing reg->btf */
- reg_name = kernel_type_name(reg->btf, reg->btf_id);
+ reg_name = btf_type_name(reg->btf, reg->btf_id);
/* For ref_ptr case, release function check should ensure we get one
* referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the
@@ -4381,6 +4530,8 @@ static bool in_rcu_cs(struct bpf_verifier_env *env)
BTF_SET_START(rcu_protected_types)
BTF_ID(struct, prog_test_ref_kfunc)
BTF_ID(struct, cgroup)
+BTF_ID(struct, bpf_cpumask)
+BTF_ID(struct, task_struct)
BTF_SET_END(rcu_protected_types)
static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
@@ -4754,6 +4905,11 @@ static bool is_rcu_reg(const struct bpf_reg_state *reg)
return reg->type & MEM_RCU;
}
+static void clear_trusted_flags(enum bpf_type_flag *flag)
+{
+ *flag &= ~(BPF_REG_TRUSTED_MODIFIERS | MEM_RCU);
+}
+
static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int off, int size, bool strict)
@@ -5158,6 +5314,7 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
}
#define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu)
+#define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null)
#define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted)
/*
@@ -5174,18 +5331,39 @@ BTF_TYPE_SAFE_RCU(struct task_struct) {
struct task_struct *group_leader;
};
+BTF_TYPE_SAFE_RCU(struct cgroup) {
+ /* cgrp->kn is always accessible as documented in kernel/cgroup/cgroup.c */
+ struct kernfs_node *kn;
+};
+
BTF_TYPE_SAFE_RCU(struct css_set) {
struct cgroup *dfl_cgrp;
};
+/* RCU trusted: these fields are trusted in RCU CS and can be NULL */
+BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) {
+ struct file __rcu *exe_file;
+};
+
+/* skb->sk, req->sk are not RCU protected, but we mark them as such
+ * because bpf prog accessible sockets are SOCK_RCU_FREE.
+ */
+BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff) {
+ struct sock *sk;
+};
+
+BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock) {
+ struct sock *sk;
+};
+
/* full trusted: these fields are trusted even outside of RCU CS and never NULL */
BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) {
- __bpf_md_ptr(struct seq_file *, seq);
+ struct seq_file *seq;
};
BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) {
- __bpf_md_ptr(struct bpf_iter_meta *, meta);
- __bpf_md_ptr(struct task_struct *, task);
+ struct bpf_iter_meta *meta;
+ struct task_struct *task;
};
BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) {
@@ -5207,17 +5385,29 @@ BTF_TYPE_SAFE_TRUSTED(struct socket) {
static bool type_is_rcu(struct bpf_verifier_env *env,
struct bpf_reg_state *reg,
- int off)
+ const char *field_name, u32 btf_id)
{
BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set));
- return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_rcu");
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu");
+}
+
+static bool type_is_rcu_or_null(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu_or_null");
}
static bool type_is_trusted(struct bpf_verifier_env *env,
struct bpf_reg_state *reg,
- int off)
+ const char *field_name, u32 btf_id)
{
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task));
@@ -5226,7 +5416,7 @@ static bool type_is_trusted(struct bpf_verifier_env *env,
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry));
BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket));
- return btf_nested_type_is_trusted(&env->log, reg, off, "__safe_trusted");
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted");
}
static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
@@ -5238,8 +5428,9 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
struct bpf_reg_state *reg = regs + regno;
const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
const char *tname = btf_name_by_offset(reg->btf, t->name_off);
+ const char *field_name = NULL;
enum bpf_type_flag flag = 0;
- u32 btf_id;
+ u32 btf_id = 0;
int ret;
if (!env->allow_ptr_leaks) {
@@ -5284,12 +5475,12 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
- if (env->ops->btf_struct_access && !type_is_alloc(reg->type)) {
+ if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) {
if (!btf_is_kernel(reg->btf)) {
verbose(env, "verifier internal error: reg->btf must be kernel btf\n");
return -EFAULT;
}
- ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);
+ ret = env->ops->btf_struct_access(&env->log, reg, off, size);
} else {
/* Writes are permitted with default btf_struct_access for
* program allocated objects (which always have ref_obj_id > 0),
@@ -5306,7 +5497,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EFAULT;
}
- ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);
+ ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag, &field_name);
}
if (ret < 0)
@@ -5334,20 +5525,21 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
* A regular RCU-protected pointer with __rcu tag can also be deemed
* trusted if we are in an RCU CS. Such pointer can be NULL.
*/
- if (type_is_trusted(env, reg, off)) {
+ if (type_is_trusted(env, reg, field_name, btf_id)) {
flag |= PTR_TRUSTED;
} else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) {
- if (type_is_rcu(env, reg, off)) {
+ if (type_is_rcu(env, reg, field_name, btf_id)) {
/* ignore __rcu tag and mark it MEM_RCU */
flag |= MEM_RCU;
- } else if (flag & MEM_RCU) {
+ } else if (flag & MEM_RCU ||
+ type_is_rcu_or_null(env, reg, field_name, btf_id)) {
/* __rcu tagged pointers can be NULL */
- flag |= PTR_MAYBE_NULL;
+ flag |= MEM_RCU | PTR_MAYBE_NULL;
} else if (flag & (MEM_PERCPU | MEM_USER)) {
/* keep as-is */
} else {
- /* walking unknown pointers yields untrusted pointer */
- flag = PTR_UNTRUSTED;
+ /* walking unknown pointers yields old deprecated PTR_TO_BTF_ID */
+ clear_trusted_flags(&flag);
}
} else {
/*
@@ -5361,7 +5553,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
}
} else {
/* Old compat. Deprecated */
- flag &= ~PTR_TRUSTED;
+ clear_trusted_flags(&flag);
}
if (atype == BPF_READ && value_regno >= 0)
@@ -5420,7 +5612,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
/* Simulate access to a PTR_TO_BTF_ID */
memset(&map_reg, 0, sizeof(map_reg));
mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0);
- ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag);
+ ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL);
if (ret < 0)
return ret;
@@ -6086,6 +6278,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
env,
regno, reg->off, access_size,
zero_size_allowed, ACCESS_HELPER, meta);
+ case PTR_TO_BTF_ID:
+ return check_ptr_to_btf_access(env, regs, regno, reg->off,
+ access_size, BPF_READ, -1);
case PTR_TO_CTX:
/* in case the function doesn't know how to access the context,
* (because we are in a program of type SYSCALL for example), we
@@ -6506,6 +6701,203 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
return err;
}
+static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi)
+{
+ struct bpf_func_state *state = func(env, reg);
+
+ return state->stack[spi].spilled_ptr.ref_obj_id;
+}
+
+static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+}
+
+static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_NEW;
+}
+
+static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_NEXT;
+}
+
+static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_DESTROY;
+}
+
+static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg)
+{
+ /* btf_check_iter_kfuncs() guarantees that first argument of any iter
+ * kfunc is iter state pointer
+ */
+ return arg == 0 && is_iter_kfunc(meta);
+}
+
+static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ const struct btf_type *t;
+ const struct btf_param *arg;
+ int spi, err, i, nr_slots;
+ u32 btf_id;
+
+ /* btf_check_iter_kfuncs() ensures we don't need to validate anything here */
+ arg = &btf_params(meta->func_proto)[0];
+ t = btf_type_skip_modifiers(meta->btf, arg->type, NULL); /* PTR */
+ t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id); /* STRUCT */
+ nr_slots = t->size / BPF_REG_SIZE;
+
+ if (is_iter_new_kfunc(meta)) {
+ /* bpf_iter_<type>_new() expects pointer to uninit iter state */
+ if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
+ verbose(env, "expected uninitialized iter_%s as arg #%d\n",
+ iter_type_str(meta->btf, btf_id), regno);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
+ err = check_mem_access(env, insn_idx, regno,
+ i, BPF_DW, BPF_WRITE, -1, false);
+ if (err)
+ return err;
+ }
+
+ err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots);
+ if (err)
+ return err;
+ } else {
+ /* iter_next() or iter_destroy() expect initialized iter state*/
+ if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) {
+ verbose(env, "expected an initialized iter_%s as arg #%d\n",
+ iter_type_str(meta->btf, btf_id), regno);
+ return -EINVAL;
+ }
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ err = mark_iter_read(env, reg, spi, nr_slots);
+ if (err)
+ return err;
+
+ /* remember meta->iter info for process_iter_next_call() */
+ meta->iter.spi = spi;
+ meta->iter.frameno = reg->frameno;
+ meta->ref_obj_id = iter_ref_obj_id(env, reg, spi);
+
+ if (is_iter_destroy_kfunc(meta)) {
+ err = unmark_stack_slots_iter(env, reg, nr_slots);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+/* process_iter_next_call() is called when verifier gets to iterator's next
+ * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
+ * to it as just "iter_next()" in comments below.
+ *
+ * BPF verifier relies on a crucial contract for any iter_next()
+ * implementation: it should *eventually* return NULL, and once that happens
+ * it should keep returning NULL. That is, once iterator exhausts elements to
+ * iterate, it should never reset or spuriously return new elements.
+ *
+ * With the assumption of such contract, process_iter_next_call() simulates
+ * a fork in the verifier state to validate loop logic correctness and safety
+ * without having to simulate infinite amount of iterations.
+ *
+ * In current state, we first assume that iter_next() returned NULL and
+ * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such
+ * conditions we should not form an infinite loop and should eventually reach
+ * exit.
+ *
+ * Besides that, we also fork current state and enqueue it for later
+ * verification. In a forked state we keep iterator state as ACTIVE
+ * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We
+ * also bump iteration depth to prevent erroneous infinite loop detection
+ * later on (see iter_active_depths_differ() comment for details). In this
+ * state we assume that we'll eventually loop back to another iter_next()
+ * calls (it could be in exactly same location or in some other instruction,
+ * it doesn't matter, we don't make any unnecessary assumptions about this,
+ * everything revolves around iterator state in a stack slot, not which
+ * instruction is calling iter_next()). When that happens, we either will come
+ * to iter_next() with equivalent state and can conclude that next iteration
+ * will proceed in exactly the same way as we just verified, so it's safe to
+ * assume that loop converges. If not, we'll go on another iteration
+ * simulation with a different input state, until all possible starting states
+ * are validated or we reach maximum number of instructions limit.
+ *
+ * This way, we will either exhaustively discover all possible input states
+ * that iterator loop can start with and eventually will converge, or we'll
+ * effectively regress into bounded loop simulation logic and either reach
+ * maximum number of instructions if loop is not provably convergent, or there
+ * is some statically known limit on number of iterations (e.g., if there is
+ * an explicit `if n > 100 then break;` statement somewhere in the loop).
+ *
+ * One very subtle but very important aspect is that we *always* simulate NULL
+ * condition first (as the current state) before we simulate non-NULL case.
+ * This has to do with intricacies of scalar precision tracking. By simulating
+ * "exit condition" of iter_next() returning NULL first, we make sure all the
+ * relevant precision marks *that will be set **after** we exit iterator loop*
+ * are propagated backwards to common parent state of NULL and non-NULL
+ * branches. Thanks to that, state equivalence checks done later in forked
+ * state, when reaching iter_next() for ACTIVE iterator, can assume that
+ * precision marks are finalized and won't change. Because simulating another
+ * ACTIVE iterator iteration won't change them (because given same input
+ * states we'll end up with exactly same output states which we are currently
+ * comparing; and verification after the loop already propagated back what
+ * needs to be **additionally** tracked as precise). It's subtle, grok
+ * precision tracking for more intuitive understanding.
+ */
+static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_verifier_state *cur_st = env->cur_state, *queued_st;
+ struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
+ struct bpf_reg_state *cur_iter, *queued_iter;
+ int iter_frameno = meta->iter.frameno;
+ int iter_spi = meta->iter.spi;
+
+ BTF_TYPE_EMIT(struct bpf_iter);
+
+ cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+
+ if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
+ cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
+ verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n",
+ cur_iter->iter.state, iter_state_str(cur_iter->iter.state));
+ return -EFAULT;
+ }
+
+ if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
+ /* branch out active iter state */
+ queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
+ if (!queued_st)
+ return -ENOMEM;
+
+ queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+ queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
+ queued_iter->iter.depth++;
+
+ queued_fr = queued_st->frame[queued_st->curframe];
+ mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
+ }
+
+ /* switch to DRAINED state, but keep the depth unchanged */
+ /* mark current iter state as drained and assume returned NULL */
+ cur_iter->iter.state = BPF_ITER_STATE_DRAINED;
+ __mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]);
+
+ return 0;
+}
+
static bool arg_type_is_mem_size(enum bpf_arg_type type)
{
return type == ARG_CONST_SIZE ||
@@ -6600,6 +6992,7 @@ static const struct bpf_reg_types mem_types = {
PTR_TO_MEM,
PTR_TO_MEM | MEM_RINGBUF,
PTR_TO_BUF,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
},
};
@@ -6709,6 +7102,9 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
if (arg_type & PTR_MAYBE_NULL)
type &= ~PTR_MAYBE_NULL;
+ if (meta->func_id == BPF_FUNC_kptr_xchg && type & MEM_ALLOC)
+ type &= ~MEM_ALLOC;
+
for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
expected = compatible->types[i];
if (expected == NOT_INIT)
@@ -6728,10 +7124,23 @@ found:
if (base_type(reg->type) != PTR_TO_BTF_ID)
return 0;
+ if (compatible == &mem_types) {
+ if (!(arg_type & MEM_RDONLY)) {
+ verbose(env,
+ "%s() may write into memory pointed by R%d type=%s\n",
+ func_id_name(meta->func_id),
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ return 0;
+ }
+
switch ((int)reg->type) {
case PTR_TO_BTF_ID:
case PTR_TO_BTF_ID | PTR_TRUSTED:
case PTR_TO_BTF_ID | MEM_RCU:
+ case PTR_TO_BTF_ID | PTR_MAYBE_NULL:
+ case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU:
{
/* For bpf_sk_release, it needs to match against first member
* 'struct sock_common', hence make an exception for it. This
@@ -6740,6 +7149,12 @@ found:
bool strict_type_match = arg_type_is_release(arg_type) &&
meta->func_id != BPF_FUNC_sk_release;
+ if (type_may_be_null(reg->type) &&
+ (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) {
+ verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno);
+ return -EACCES;
+ }
+
if (!arg_btf_id) {
if (!compatible->btf_id) {
verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
@@ -6763,15 +7178,16 @@ found:
btf_vmlinux, *arg_btf_id,
strict_type_match)) {
verbose(env, "R%d is of type %s but %s is expected\n",
- regno, kernel_type_name(reg->btf, reg->btf_id),
- kernel_type_name(btf_vmlinux, *arg_btf_id));
+ regno, btf_type_name(reg->btf, reg->btf_id),
+ btf_type_name(btf_vmlinux, *arg_btf_id));
return -EACCES;
}
}
break;
}
case PTR_TO_BTF_ID | MEM_ALLOC:
- if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock) {
+ if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
+ meta->func_id != BPF_FUNC_kptr_xchg) {
verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
return -EFAULT;
}
@@ -6834,7 +7250,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
verbose(env, "R%d must have zero offset when passed to release func\n",
regno);
verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno,
- kernel_type_name(reg->btf, reg->btf_id), reg->off);
+ btf_type_name(reg->btf, reg->btf_id), reg->off);
return -EINVAL;
}
@@ -8737,6 +9153,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (func_id == BPF_FUNC_kptr_xchg) {
ret_btf = meta.kptr_field->kptr.btf;
ret_btf_id = meta.kptr_field->kptr.btf_id;
+ if (!btf_is_kernel(ret_btf))
+ regs[BPF_REG_0].type |= MEM_ALLOC;
} else {
if (fn->ret_btf_id == BPF_PTR_POISON) {
verbose(env, "verifier internal error:");
@@ -8870,7 +9288,7 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta)
{
- return meta->kfunc_flags & KF_TRUSTED_ARGS;
+ return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta);
}
static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta)
@@ -9099,6 +9517,7 @@ enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */
KF_ARG_PTR_TO_KPTR, /* PTR_TO_KPTR but type specific */
KF_ARG_PTR_TO_DYNPTR,
+ KF_ARG_PTR_TO_ITER,
KF_ARG_PTR_TO_LIST_HEAD,
KF_ARG_PTR_TO_LIST_NODE,
KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */
@@ -9220,6 +9639,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_DYNPTR;
+ if (is_kfunc_arg_iter(meta, argno))
+ return KF_ARG_PTR_TO_ITER;
+
if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_LIST_HEAD;
@@ -9848,6 +10270,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
break;
case KF_ARG_PTR_TO_KPTR:
case KF_ARG_PTR_TO_DYNPTR:
+ case KF_ARG_PTR_TO_ITER:
case KF_ARG_PTR_TO_LIST_HEAD:
case KF_ARG_PTR_TO_LIST_NODE:
case KF_ARG_PTR_TO_RB_ROOT:
@@ -9944,6 +10367,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
break;
}
+ case KF_ARG_PTR_TO_ITER:
+ ret = process_iter_arg(env, regno, insn_idx, meta);
+ if (ret < 0)
+ return ret;
+ break;
case KF_ARG_PTR_TO_LIST_HEAD:
if (reg->type != PTR_TO_MAP_VALUE &&
reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
@@ -10079,24 +10507,21 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return 0;
}
-static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
- int *insn_idx_p)
+static int fetch_kfunc_meta(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_kfunc_call_arg_meta *meta,
+ const char **kfunc_name)
{
- const struct btf_type *t, *func, *func_proto, *ptr_type;
- u32 i, nargs, func_id, ptr_type_id, release_ref_obj_id;
- struct bpf_reg_state *regs = cur_regs(env);
- const char *func_name, *ptr_type_name;
- bool sleepable, rcu_lock, rcu_unlock;
- struct bpf_kfunc_call_arg_meta meta;
- int err, insn_idx = *insn_idx_p;
- const struct btf_param *args;
- const struct btf_type *ret_t;
+ const struct btf_type *func, *func_proto;
+ u32 func_id, *kfunc_flags;
+ const char *func_name;
struct btf *desc_btf;
- u32 *kfunc_flags;
- /* skip for now, but return error when we find this in fixup_kfunc_call */
+ if (kfunc_name)
+ *kfunc_name = NULL;
+
if (!insn->imm)
- return 0;
+ return -EINVAL;
desc_btf = find_kfunc_desc_btf(env, insn->off);
if (IS_ERR(desc_btf))
@@ -10105,22 +10530,53 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
func_id = insn->imm;
func = btf_type_by_id(desc_btf, func_id);
func_name = btf_name_by_offset(desc_btf, func->name_off);
+ if (kfunc_name)
+ *kfunc_name = func_name;
func_proto = btf_type_by_id(desc_btf, func->type);
kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id);
if (!kfunc_flags) {
- verbose(env, "calling kernel function %s is not allowed\n",
- func_name);
return -EACCES;
}
- /* Prepare kfunc call metadata */
- memset(&meta, 0, sizeof(meta));
- meta.btf = desc_btf;
- meta.func_id = func_id;
- meta.kfunc_flags = *kfunc_flags;
- meta.func_proto = func_proto;
- meta.func_name = func_name;
+ memset(meta, 0, sizeof(*meta));
+ meta->btf = desc_btf;
+ meta->func_id = func_id;
+ meta->kfunc_flags = *kfunc_flags;
+ meta->func_proto = func_proto;
+ meta->func_name = func_name;
+
+ return 0;
+}
+
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx_p)
+{
+ const struct btf_type *t, *ptr_type;
+ u32 i, nargs, ptr_type_id, release_ref_obj_id;
+ struct bpf_reg_state *regs = cur_regs(env);
+ const char *func_name, *ptr_type_name;
+ bool sleepable, rcu_lock, rcu_unlock;
+ struct bpf_kfunc_call_arg_meta meta;
+ struct bpf_insn_aux_data *insn_aux;
+ int err, insn_idx = *insn_idx_p;
+ const struct btf_param *args;
+ const struct btf_type *ret_t;
+ struct btf *desc_btf;
+
+ /* skip for now, but return error when we find this in fixup_kfunc_call */
+ if (!insn->imm)
+ return 0;
+
+ err = fetch_kfunc_meta(env, insn, &meta, &func_name);
+ if (err == -EACCES && func_name)
+ verbose(env, "calling kernel function %s is not allowed\n", func_name);
+ if (err)
+ return err;
+ desc_btf = meta.btf;
+ insn_aux = &env->insn_aux_data[insn_idx];
+
+ insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
@@ -10173,7 +10629,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
err = release_reference(env, regs[meta.release_regno].ref_obj_id);
if (err) {
verbose(env, "kfunc %s#%d reference has not been acquired before\n",
- func_name, func_id);
+ func_name, meta.func_id);
return err;
}
}
@@ -10185,14 +10641,14 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
err = ref_convert_owning_non_owning(env, release_ref_obj_id);
if (err) {
verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
- func_name, func_id);
+ func_name, meta.func_id);
return err;
}
err = release_reference(env, release_ref_obj_id);
if (err) {
verbose(env, "kfunc %s#%d reference has not been acquired before\n",
- func_name, func_id);
+ func_name, meta.func_id);
return err;
}
}
@@ -10202,7 +10658,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
set_rbtree_add_callback_state);
if (err) {
verbose(env, "kfunc %s#%d failed callback verification\n",
- func_name, func_id);
+ func_name, meta.func_id);
return err;
}
}
@@ -10211,7 +10667,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
mark_reg_not_init(env, regs, caller_saved[i]);
/* Check return type */
- t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
+ t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);
if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) {
/* Only exception is bpf_obj_new_impl */
@@ -10260,13 +10716,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].btf = ret_btf;
regs[BPF_REG_0].btf_id = ret_btf_id;
- env->insn_aux_data[insn_idx].obj_new_size = ret_t->size;
- env->insn_aux_data[insn_idx].kptr_struct_meta =
+ insn_aux->obj_new_size = ret_t->size;
+ insn_aux->kptr_struct_meta =
btf_find_struct_meta(ret_btf, ret_btf_id);
- } else if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
- env->insn_aux_data[insn_idx].kptr_struct_meta =
- btf_find_struct_meta(meta.arg_obj_drop.btf,
- meta.arg_obj_drop.btf_id);
} else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
struct btf_field *field = meta.arg_list_head.field;
@@ -10395,10 +10847,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
regs[BPF_REG_0].id = ++env->id_gen;
- } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
+ } else if (btf_type_is_void(t)) {
+ if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ insn_aux->kptr_struct_meta =
+ btf_find_struct_meta(meta.arg_obj_drop.btf,
+ meta.arg_obj_drop.btf_id);
+ }
+ }
+ }
- nargs = btf_type_vlen(func_proto);
- args = (const struct btf_param *)(func_proto + 1);
+ nargs = btf_type_vlen(meta.func_proto);
+ args = (const struct btf_param *)(meta.func_proto + 1);
for (i = 0; i < nargs; i++) {
u32 regno = i + 1;
@@ -10410,6 +10870,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
mark_btf_func_reg_size(env, regno, t->size);
}
+ if (is_iter_next_kfunc(&meta)) {
+ err = process_iter_next_call(env, insn_idx, &meta);
+ if (err)
+ return err;
+ }
+
return 0;
}
@@ -12116,10 +12582,14 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
case BPF_JEQ:
if (tnum_is_const(subreg))
return !!tnum_equals_const(subreg, val);
+ else if (val < reg->u32_min_value || val > reg->u32_max_value)
+ return 0;
break;
case BPF_JNE:
if (tnum_is_const(subreg))
return !tnum_equals_const(subreg, val);
+ else if (val < reg->u32_min_value || val > reg->u32_max_value)
+ return 1;
break;
case BPF_JSET:
if ((~subreg.mask & subreg.value) & val)
@@ -12189,10 +12659,14 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
case BPF_JEQ:
if (tnum_is_const(reg->var_off))
return !!tnum_equals_const(reg->var_off, val);
+ else if (val < reg->umin_value || val > reg->umax_value)
+ return 0;
break;
case BPF_JNE:
if (tnum_is_const(reg->var_off))
return !tnum_equals_const(reg->var_off, val);
+ else if (val < reg->umin_value || val > reg->umax_value)
+ return 1;
break;
case BPF_JSET:
if ((~reg->var_off.mask & reg->var_off.value) & val)
@@ -12813,6 +13287,18 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
src_reg->var_off.value,
opcode,
is_jmp32);
+ } else if (dst_reg->type == SCALAR_VALUE &&
+ is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off))) {
+ pred = is_branch_taken(src_reg,
+ tnum_subreg(dst_reg->var_off).value,
+ flip_opcode(opcode),
+ is_jmp32);
+ } else if (dst_reg->type == SCALAR_VALUE &&
+ !is_jmp32 && tnum_is_const(dst_reg->var_off)) {
+ pred = is_branch_taken(src_reg,
+ dst_reg->var_off.value,
+ flip_opcode(opcode),
+ is_jmp32);
} else if (reg_is_pkt_pointer_any(dst_reg) &&
reg_is_pkt_pointer_any(src_reg) &&
!is_jmp32) {
@@ -13407,6 +13893,17 @@ static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
return env->insn_aux_data[insn_idx].prune_point;
}
+static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].force_checkpoint = true;
+}
+
+static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].force_checkpoint;
+}
+
+
enum {
DONE_EXPLORING = 0,
KEEP_EXPLORING = 1,
@@ -13522,6 +14019,26 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
* async state will be pushed for further exploration.
*/
mark_prune_point(env, t);
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ struct bpf_kfunc_call_arg_meta meta;
+
+ ret = fetch_kfunc_meta(env, insn, &meta, NULL);
+ if (ret == 0 && is_iter_next_kfunc(&meta)) {
+ mark_prune_point(env, t);
+ /* Checking and saving state checkpoints at iter_next() call
+ * is crucial for fast convergence of open-coded iterator loop
+ * logic, so we need to force it. If we don't do that,
+ * is_state_visited() might skip saving a checkpoint, causing
+ * unnecessarily long sequence of not checkpointed
+ * instructions and jumps, leading to exhaustion of jump
+ * history buffer, and potentially other undesired outcomes.
+ * It is expected that with correct open-coded iterators
+ * convergence will happen quickly, so we don't run a risk of
+ * exhausting memory.
+ */
+ mark_force_checkpoint(env, t);
+ }
+ }
return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
case BPF_JA:
@@ -14275,6 +14792,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
* didn't use them
*/
for (i = 0; i < old->allocated_stack; i++) {
+ struct bpf_reg_state *old_reg, *cur_reg;
+
spi = i / BPF_REG_SIZE;
if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
@@ -14331,9 +14850,6 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
return false;
break;
case STACK_DYNPTR:
- {
- const struct bpf_reg_state *old_reg, *cur_reg;
-
old_reg = &old->stack[spi].spilled_ptr;
cur_reg = &cur->stack[spi].spilled_ptr;
if (old_reg->dynptr.type != cur_reg->dynptr.type ||
@@ -14341,7 +14857,22 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
return false;
break;
- }
+ case STACK_ITER:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ /* iter.depth is not compared between states as it
+ * doesn't matter for correctness and would otherwise
+ * prevent convergence; we maintain it only to prevent
+ * infinite loop check triggering, see
+ * iter_active_depths_differ()
+ */
+ if (old_reg->iter.btf != cur_reg->iter.btf ||
+ old_reg->iter.btf_id != cur_reg->iter.btf_id ||
+ old_reg->iter.state != cur_reg->iter.state ||
+ /* ignore {old_reg,cur_reg}->iter.depth, see above */
+ !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
case STACK_MISC:
case STACK_ZERO:
case STACK_INVALID:
@@ -14555,10 +15086,11 @@ static int propagate_precision(struct bpf_verifier_env *env,
state_reg = state->regs;
for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
+ !state_reg->precise ||
+ !(state_reg->live & REG_LIVE_READ))
continue;
if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "frame %d: propagating r%d\n", i, fr);
+ verbose(env, "frame %d: propagating r%d\n", fr, i);
err = mark_chain_precision_frame(env, fr, i);
if (err < 0)
return err;
@@ -14569,11 +15101,12 @@ static int propagate_precision(struct bpf_verifier_env *env,
continue;
state_reg = &state->stack[i].spilled_ptr;
if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
+ !state_reg->precise ||
+ !(state_reg->live & REG_LIVE_READ))
continue;
if (env->log.level & BPF_LOG_LEVEL2)
verbose(env, "frame %d: propagating fp%d\n",
- (-i - 1) * BPF_REG_SIZE, fr);
+ fr, (-i - 1) * BPF_REG_SIZE);
err = mark_chain_precision_stack_frame(env, fr, i);
if (err < 0)
return err;
@@ -14600,6 +15133,92 @@ static bool states_maybe_looping(struct bpf_verifier_state *old,
return true;
}
+static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].is_iter_next;
+}
+
+/* is_state_visited() handles iter_next() (see process_iter_next_call() for
+ * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
+ * states to match, which otherwise would look like an infinite loop. So while
+ * iter_next() calls are taken care of, we still need to be careful and
+ * prevent erroneous and too eager declaration of "ininite loop", when
+ * iterators are involved.
+ *
+ * Here's a situation in pseudo-BPF assembly form:
+ *
+ * 0: again: ; set up iter_next() call args
+ * 1: r1 = &it ; <CHECKPOINT HERE>
+ * 2: call bpf_iter_num_next ; this is iter_next() call
+ * 3: if r0 == 0 goto done
+ * 4: ... something useful here ...
+ * 5: goto again ; another iteration
+ * 6: done:
+ * 7: r1 = &it
+ * 8: call bpf_iter_num_destroy ; clean up iter state
+ * 9: exit
+ *
+ * This is a typical loop. Let's assume that we have a prune point at 1:,
+ * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
+ * again`, assuming other heuristics don't get in a way).
+ *
+ * When we first time come to 1:, let's say we have some state X. We proceed
+ * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
+ * Now we come back to validate that forked ACTIVE state. We proceed through
+ * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
+ * are converging. But the problem is that we don't know that yet, as this
+ * convergence has to happen at iter_next() call site only. So if nothing is
+ * done, at 1: verifier will use bounded loop logic and declare infinite
+ * looping (and would be *technically* correct, if not for iterator's
+ * "eventual sticky NULL" contract, see process_iter_next_call()). But we
+ * don't want that. So what we do in process_iter_next_call() when we go on
+ * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
+ * a different iteration. So when we suspect an infinite loop, we additionally
+ * check if any of the *ACTIVE* iterator states depths differ. If yes, we
+ * pretend we are not looping and wait for next iter_next() call.
+ *
+ * This only applies to ACTIVE state. In DRAINED state we don't expect to
+ * loop, because that would actually mean infinite loop, as DRAINED state is
+ * "sticky", and so we'll keep returning into the same instruction with the
+ * same state (at least in one of possible code paths).
+ *
+ * This approach allows to keep infinite loop heuristic even in the face of
+ * active iterator. E.g., C snippet below is and will be detected as
+ * inifintely looping:
+ *
+ * struct bpf_iter_num it;
+ * int *p, x;
+ *
+ * bpf_iter_num_new(&it, 0, 10);
+ * while ((p = bpf_iter_num_next(&t))) {
+ * x = p;
+ * while (x--) {} // <<-- infinite loop here
+ * }
+ *
+ */
+static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
+{
+ struct bpf_reg_state *slot, *cur_slot;
+ struct bpf_func_state *state;
+ int i, fr;
+
+ for (fr = old->curframe; fr >= 0; fr--) {
+ state = old->frame[fr];
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_ITER)
+ continue;
+
+ slot = &state->stack[i].spilled_ptr;
+ if (slot->iter.state != BPF_ITER_STATE_ACTIVE)
+ continue;
+
+ cur_slot = &cur->frame[fr]->stack[i].spilled_ptr;
+ if (cur_slot->iter.depth != slot->iter.depth)
+ return true;
+ }
+ }
+ return false;
+}
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
@@ -14607,7 +15226,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
struct bpf_verifier_state_list *sl, **pprev;
struct bpf_verifier_state *cur = env->cur_state, *new;
int i, j, err, states_cnt = 0;
- bool add_new_state = env->test_state_freq ? true : false;
+ bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
+ bool add_new_state = force_new_state;
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
@@ -14647,8 +15267,46 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* Since the verifier still needs to catch infinite loops
* inside async callbacks.
*/
- } else if (states_maybe_looping(&sl->state, cur) &&
- states_equal(env, &sl->state, cur)) {
+ goto skip_inf_loop_check;
+ }
+ /* BPF open-coded iterators loop detection is special.
+ * states_maybe_looping() logic is too simplistic in detecting
+ * states that *might* be equivalent, because it doesn't know
+ * about ID remapping, so don't even perform it.
+ * See process_iter_next_call() and iter_active_depths_differ()
+ * for overview of the logic. When current and one of parent
+ * states are detected as equivalent, it's a good thing: we prove
+ * convergence and can stop simulating further iterations.
+ * It's safe to assume that iterator loop will finish, taking into
+ * account iter_next() contract of eventually returning
+ * sticky NULL result.
+ */
+ if (is_iter_next_insn(env, insn_idx)) {
+ if (states_equal(env, &sl->state, cur)) {
+ struct bpf_func_state *cur_frame;
+ struct bpf_reg_state *iter_state, *iter_reg;
+ int spi;
+
+ cur_frame = cur->frame[cur->curframe];
+ /* btf_check_iter_kfuncs() enforces that
+ * iter state pointer is always the first arg
+ */
+ iter_reg = &cur_frame->regs[BPF_REG_1];
+ /* current state is valid due to states_equal(),
+ * so we can assume valid iter and reg state,
+ * no need for extra (re-)validations
+ */
+ spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
+ iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
+ if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE)
+ goto hit;
+ }
+ goto skip_inf_loop_check;
+ }
+ /* attempt to detect infinite loop to avoid unnecessary doomed work */
+ if (states_maybe_looping(&sl->state, cur) &&
+ states_equal(env, &sl->state, cur) &&
+ !iter_active_depths_differ(&sl->state, cur)) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
return -EINVAL;
@@ -14665,13 +15323,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* This threshold shouldn't be too high either, since states
* at the end of the loop are likely to be useful in pruning.
*/
- if (!env->test_state_freq &&
+skip_inf_loop_check:
+ if (!force_new_state &&
env->jmps_processed - env->prev_jmps_processed < 20 &&
env->insn_processed - env->prev_insn_processed < 100)
add_new_state = false;
goto miss;
}
if (states_equal(env, &sl->state, cur)) {
+hit:
sl->hit_cnt++;
/* reached equivalent register/stack state,
* prune the search.
@@ -14978,11 +15638,11 @@ static int do_check(struct bpf_verifier_env *env)
print_insn_state(env, state->frame[state->curframe]);
verbose_linfo(env, env->insn_idx, "; ");
- env->prev_log_len = env->log.len_used;
+ env->prev_log_pos = env->log.end_pos;
verbose(env, "%d: ", env->insn_idx);
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
- env->prev_insn_print_len = env->log.len_used - env->prev_log_len;
- env->prev_log_len = env->log.len_used;
+ env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos;
+ env->prev_log_pos = env->log.end_pos;
}
if (bpf_prog_is_offloaded(env->prog->aux)) {
@@ -15301,8 +15961,8 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
goto err_put;
}
- if (!btf_type_is_var(t)) {
- verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id);
+ if (!btf_type_is_var(t) && !btf_type_is_func(t)) {
+ verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id);
err = -EINVAL;
goto err_put;
}
@@ -15315,6 +15975,14 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
err = -ENOENT;
goto err_put;
}
+ insn[0].imm = (u32)addr;
+ insn[1].imm = addr >> 32;
+
+ if (btf_type_is_func(t)) {
+ aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
+ aux->btf_var.mem_size = 0;
+ goto check_btf;
+ }
datasec_id = find_btf_percpu_datasec(btf);
if (datasec_id > 0) {
@@ -15327,9 +15995,6 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
}
}
- insn[0].imm = (u32)addr;
- insn[1].imm = addr >> 32;
-
type = t->type;
t = btf_type_skip_modifiers(btf, type, NULL);
if (percpu) {
@@ -15357,7 +16022,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
aux->btf_var.btf = btf;
aux->btf_var.btf_id = type;
}
-
+check_btf:
/* check whether we recorded this BTF (and maybe module) already */
for (i = 0; i < env->used_btf_cnt; i++) {
if (env->used_btfs[i].btf == btf) {
@@ -17032,21 +17697,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
- (int (*)(struct bpf_map *map, void *key))NULL));
+ (long (*)(struct bpf_map *map, void *key))NULL));
BUILD_BUG_ON(!__same_type(ops->map_update_elem,
- (int (*)(struct bpf_map *map, void *key, void *value,
+ (long (*)(struct bpf_map *map, void *key, void *value,
u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_push_elem,
- (int (*)(struct bpf_map *map, void *value,
+ (long (*)(struct bpf_map *map, void *value,
u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
- (int (*)(struct bpf_map *map, void *value))NULL));
+ (long (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
- (int (*)(struct bpf_map *map, void *value))NULL));
+ (long (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_redirect,
- (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
+ (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
- (int (*)(struct bpf_map *map,
+ (long (*)(struct bpf_map *map,
bpf_callback_t callback_fn,
void *callback_ctx,
u64 flags))NULL));
@@ -17654,6 +18319,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
const char *tname;
struct btf *btf;
long addr = 0;
+ struct module *mod = NULL;
if (!btf_id) {
bpf_log(log, "Tracing programs must provide btf_id\n");
@@ -17827,8 +18493,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
else
addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
} else {
- addr = kallsyms_lookup_name(tname);
+ if (btf_is_module(btf)) {
+ mod = btf_try_get_module(btf);
+ if (mod)
+ addr = find_kallsyms_symbol_value(mod, tname);
+ else
+ addr = 0;
+ } else {
+ addr = kallsyms_lookup_name(tname);
+ }
if (!addr) {
+ module_put(mod);
bpf_log(log,
"The address of function %s cannot be found\n",
tname);
@@ -17868,11 +18543,13 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
break;
}
if (ret) {
+ module_put(mod);
bpf_log(log, "%s is not sleepable\n", tname);
return ret;
}
} else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
if (tgt_prog) {
+ module_put(mod);
bpf_log(log, "can't modify return codes of BPF programs\n");
return -EINVAL;
}
@@ -17881,6 +18558,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
!check_attach_modify_return(addr, tname))
ret = 0;
if (ret) {
+ module_put(mod);
bpf_log(log, "%s() is not modifiable\n", tname);
return ret;
}
@@ -17891,6 +18569,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
tgt_info->tgt_addr = addr;
tgt_info->tgt_name = tname;
tgt_info->tgt_type = t;
+ tgt_info->tgt_mod = mod;
return 0;
}
@@ -17970,6 +18649,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
/* store info about the attachment target that will be used later */
prog->aux->attach_func_proto = tgt_info.tgt_type;
prog->aux->attach_func_name = tgt_info.tgt_name;
+ prog->aux->mod = tgt_info.tgt_mod;
if (tgt_prog) {
prog->aux->saved_dst_prog_type = tgt_prog->type;
@@ -18014,12 +18694,12 @@ struct btf *bpf_get_btf_vmlinux(void)
return btf_vmlinux;
}
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
u64 start_time = ktime_get_ns();
struct bpf_verifier_env *env;
- struct bpf_verifier_log *log;
- int i, len, ret = -EINVAL;
+ int i, len, ret = -EINVAL, err;
+ u32 log_true_size;
bool is_priv;
/* no program is valid */
@@ -18032,7 +18712,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
- log = &env->log;
len = (*prog)->len;
env->insn_aux_data =
@@ -18053,20 +18732,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
if (!is_priv)
mutex_lock(&bpf_verifier_lock);
- if (attr->log_level || attr->log_buf || attr->log_size) {
- /* user requested verbose verifier output
- * and supplied buffer to store the verification trace
- */
- log->level = attr->log_level;
- log->ubuf = (char __user *) (unsigned long) attr->log_buf;
- log->len_total = attr->log_size;
-
- /* log attributes have to be sane */
- if (!bpf_verifier_log_attr_valid(log)) {
- ret = -EINVAL;
- goto err_unlock;
- }
- }
+ /* user could have requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ ret = bpf_vlog_init(&env->log, attr->log_level,
+ (char __user *) (unsigned long) attr->log_buf,
+ attr->log_size);
+ if (ret)
+ goto err_unlock;
mark_verifier_state_clean(env);
@@ -18180,9 +18853,14 @@ skip_full_check:
print_verification_stats(env);
env->prog->aux->verified_insns = env->insn_processed;
- if (log->level && bpf_verifier_log_full(log))
- ret = -ENOSPC;
- if (log->level && !log->ubuf) {
+ /* preserve original error even if log finalization is successful */
+ err = bpf_vlog_finalize(&env->log, &log_true_size);
+ if (err)
+ ret = err;
+
+ if (uattr_size >= offsetofend(union bpf_attr, log_true_size) &&
+ copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size),
+ &log_true_size, sizeof(log_true_size))) {
ret = -EFAULT;
goto err_release_maps;
}