aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/bpf.h47
-rw-r--r--include/linux/bpf_verifier.h19
-rw-r--r--include/linux/btf.h1
-rw-r--r--include/uapi/linux/bpf.h73
-rw-r--r--kernel/bpf/arraymap.c21
-rw-r--r--kernel/bpf/btf.c77
-rw-r--r--kernel/bpf/hashtab.c105
-rw-r--r--kernel/bpf/helpers.c340
-rw-r--r--kernel/bpf/local_storage.c4
-rw-r--r--kernel/bpf/map_in_map.c8
-rw-r--r--kernel/bpf/syscall.c53
-rw-r--r--kernel/bpf/verifier.c307
-rw-r--r--kernel/trace/bpf_trace.c2
-rwxr-xr-xscripts/bpf_doc.py2
-rw-r--r--tools/include/uapi/linux/bpf.h73
-rw-r--r--tools/testing/selftests/bpf/prog_tests/timer.c55
-rw-r--r--tools/testing/selftests/bpf/prog_tests/timer_mim.c69
-rw-r--r--tools/testing/selftests/bpf/progs/timer.c297
-rw-r--r--tools/testing/selftests/bpf/progs/timer_mim.c88
-rw-r--r--tools/testing/selftests/bpf/progs/timer_mim_reject.c74
20 files changed, 1651 insertions, 64 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4afbff308ca3..a9a4a480a6d0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -168,6 +168,7 @@ struct bpf_map {
u32 max_entries;
u32 map_flags;
int spin_lock_off; /* >=0 valid offset, <0 error */
+ int timer_off; /* >=0 valid offset, <0 error */
u32 id;
int numa_node;
u32 btf_key_type_id;
@@ -197,30 +198,53 @@ static inline bool map_value_has_spin_lock(const struct bpf_map *map)
return map->spin_lock_off >= 0;
}
-static inline void check_and_init_map_lock(struct bpf_map *map, void *dst)
+static inline bool map_value_has_timer(const struct bpf_map *map)
{
- if (likely(!map_value_has_spin_lock(map)))
- return;
- *(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
- (struct bpf_spin_lock){};
+ return map->timer_off >= 0;
}
-/* copy everything but bpf_spin_lock */
+static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
+{
+ if (unlikely(map_value_has_spin_lock(map)))
+ *(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
+ (struct bpf_spin_lock){};
+ if (unlikely(map_value_has_timer(map)))
+ *(struct bpf_timer *)(dst + map->timer_off) =
+ (struct bpf_timer){};
+}
+
+/* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */
static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
{
+ u32 s_off = 0, s_sz = 0, t_off = 0, t_sz = 0;
+
if (unlikely(map_value_has_spin_lock(map))) {
- u32 off = map->spin_lock_off;
+ s_off = map->spin_lock_off;
+ s_sz = sizeof(struct bpf_spin_lock);
+ } else if (unlikely(map_value_has_timer(map))) {
+ t_off = map->timer_off;
+ t_sz = sizeof(struct bpf_timer);
+ }
- memcpy(dst, src, off);
- memcpy(dst + off + sizeof(struct bpf_spin_lock),
- src + off + sizeof(struct bpf_spin_lock),
- map->value_size - off - sizeof(struct bpf_spin_lock));
+ if (unlikely(s_sz || t_sz)) {
+ if (s_off < t_off || !s_sz) {
+ swap(s_off, t_off);
+ swap(s_sz, t_sz);
+ }
+ memcpy(dst, src, t_off);
+ memcpy(dst + t_off + t_sz,
+ src + t_off + t_sz,
+ s_off - t_off - t_sz);
+ memcpy(dst + s_off + s_sz,
+ src + s_off + s_sz,
+ map->value_size - s_off - s_sz);
} else {
memcpy(dst, src, map->value_size);
}
}
void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
bool lock_src);
+void bpf_timer_cancel_and_free(void *timer);
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);
struct bpf_offload_dev;
@@ -314,6 +338,7 @@ enum bpf_arg_type {
ARG_PTR_TO_FUNC, /* pointer to a bpf program function */
ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */
ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */
+ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */
__BPF_ARG_TYPE_MAX,
};
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e774ecc1cd1f..b847e1ccd10f 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -53,7 +53,14 @@ struct bpf_reg_state {
/* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
* PTR_TO_MAP_VALUE_OR_NULL
*/
- struct bpf_map *map_ptr;
+ struct {
+ struct bpf_map *map_ptr;
+ /* To distinguish map lookups from outer map
+ * the map_uid is non-zero for registers
+ * pointing to inner maps.
+ */
+ u32 map_uid;
+ };
/* for PTR_TO_BTF_ID */
struct {
@@ -201,12 +208,19 @@ struct bpf_func_state {
* zero == main subprog
*/
u32 subprogno;
+ /* Every bpf_timer_start will increment async_entry_cnt.
+ * It's used to distinguish:
+ * void foo(void) { for(;;); }
+ * void foo(void) { bpf_timer_set_callback(,foo); }
+ */
+ u32 async_entry_cnt;
+ bool in_callback_fn;
+ bool in_async_callback_fn;
/* The following fields should be last. See copy_func_state() */
int acquired_refs;
struct bpf_reference_state *refs;
int allocated_stack;
- bool in_callback_fn;
struct bpf_stack_state *stack;
};
@@ -392,6 +406,7 @@ struct bpf_subprog_info {
bool has_tail_call;
bool tail_call_reachable;
bool has_ld_abs;
+ bool is_async_cb;
};
/* single container for all structs
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 94a0c976c90f..214fde93214b 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -99,6 +99,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
const struct btf_member *m,
u32 expected_offset, u32 expected_size);
int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);
+int btf_find_timer(const struct btf *btf, const struct btf_type *t);
bool btf_type_is_void(const struct btf_type *t);
s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind);
const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bafb6282032b..3544ec5234f0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4777,6 +4777,70 @@ union bpf_attr {
* Execute close syscall for given FD.
* Return
* A syscall result.
+ *
+ * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags)
+ * Description
+ * Initialize the timer.
+ * First 4 bits of *flags* specify clockid.
+ * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed.
+ * All other bits of *flags* are reserved.
+ * The verifier will reject the program if *timer* is not from
+ * the same *map*.
+ * Return
+ * 0 on success.
+ * **-EBUSY** if *timer* is already initialized.
+ * **-EINVAL** if invalid *flags* are passed.
+ * **-EPERM** if *timer* is in a map that doesn't have any user references.
+ * The user space should either hold a file descriptor to a map with timers
+ * or pin such map in bpffs. When map is unpinned or file descriptor is
+ * closed all timers in the map will be cancelled and freed.
+ *
+ * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn)
+ * Description
+ * Configure the timer to call *callback_fn* static function.
+ * Return
+ * 0 on success.
+ * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
+ * **-EPERM** if *timer* is in a map that doesn't have any user references.
+ * The user space should either hold a file descriptor to a map with timers
+ * or pin such map in bpffs. When map is unpinned or file descriptor is
+ * closed all timers in the map will be cancelled and freed.
+ *
+ * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags)
+ * Description
+ * Set timer expiration N nanoseconds from the current time. The
+ * configured callback will be invoked in soft irq context on some cpu
+ * and will not repeat unless another bpf_timer_start() is made.
+ * In such case the next invocation can migrate to a different cpu.
+ * Since struct bpf_timer is a field inside map element the map
+ * owns the timer. The bpf_timer_set_callback() will increment refcnt
+ * of BPF program to make sure that callback_fn code stays valid.
+ * When user space reference to a map reaches zero all timers
+ * in a map are cancelled and corresponding program's refcnts are
+ * decremented. This is done to make sure that Ctrl-C of a user
+ * process doesn't leave any timers running. If map is pinned in
+ * bpffs the callback_fn can re-arm itself indefinitely.
+ * bpf_map_update/delete_elem() helpers and user space sys_bpf commands
+ * cancel and free the timer in the given map element.
+ * The map can contain timers that invoke callback_fn-s from different
+ * programs. The same callback_fn can serve different timers from
+ * different maps if key/value layout matches across maps.
+ * Every bpf_timer_set_callback() can have different callback_fn.
+ *
+ * Return
+ * 0 on success.
+ * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier
+ * or invalid *flags* are passed.
+ *
+ * long bpf_timer_cancel(struct bpf_timer *timer)
+ * Description
+ * Cancel the timer and wait for callback_fn to finish if it was running.
+ * Return
+ * 0 if the timer was not active.
+ * 1 if the timer was active.
+ * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
+ * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its
+ * own timer which would have led to a deadlock otherwise.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -4948,6 +5012,10 @@ union bpf_attr {
FN(sys_bpf), \
FN(btf_find_by_name_kind), \
FN(sys_close), \
+ FN(timer_init), \
+ FN(timer_set_callback), \
+ FN(timer_start), \
+ FN(timer_cancel), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -6074,6 +6142,11 @@ struct bpf_spin_lock {
__u32 val;
};
+struct bpf_timer {
+ __u64 :64;
+ __u64 :64;
+} __attribute__((aligned(8)));
+
struct bpf_sysctl {
__u32 write; /* Sysctl is being read (= 0) or written (= 1).
* Allows 1,2,4-byte read, but no write.
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 3c4105603f9d..cebd4fb06d19 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -287,6 +287,12 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
return 0;
}
+static void check_and_free_timer_in_array(struct bpf_array *arr, void *val)
+{
+ if (unlikely(map_value_has_timer(&arr->map)))
+ bpf_timer_cancel_and_free(val + arr->map.timer_off);
+}
+
/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
@@ -321,6 +327,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
copy_map_value_locked(map, val, value, false);
else
copy_map_value(map, val, value);
+ check_and_free_timer_in_array(array, val);
}
return 0;
}
@@ -374,6 +381,19 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
return (void *)round_down((unsigned long)array, PAGE_SIZE);
}
+static void array_map_free_timers(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ if (likely(!map_value_has_timer(map)))
+ return;
+
+ for (i = 0; i < array->map.max_entries; i++)
+ bpf_timer_cancel_and_free(array->value + array->elem_size * i +
+ map->timer_off);
+}
+
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
@@ -668,6 +688,7 @@ const struct bpf_map_ops array_map_ops = {
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
+ .map_release_uref = array_map_free_timers,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index cb4b72997d9b..7780131f710e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3046,43 +3046,92 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-/* find 'struct bpf_spin_lock' in map value.
- * return >= 0 offset if found
- * and < 0 in case of error
- */
-int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t,
+ const char *name, int sz, int align)
{
const struct btf_member *member;
u32 i, off = -ENOENT;
- if (!__btf_type_is_struct(t))
- return -EINVAL;
-
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
if (!__btf_type_is_struct(member_type))
continue;
- if (member_type->size != sizeof(struct bpf_spin_lock))
+ if (member_type->size != sz)
continue;
- if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
- "bpf_spin_lock"))
+ if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name))
continue;
if (off != -ENOENT)
- /* only one 'struct bpf_spin_lock' is allowed */
+ /* only one such field is allowed */
return -E2BIG;
off = btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
return -EINVAL;
off /= 8;
- if (off % __alignof__(struct bpf_spin_lock))
- /* valid struct bpf_spin_lock will be 4 byte aligned */
+ if (off % align)
+ return -EINVAL;
+ }
+ return off;
+}
+
+static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
+ const char *name, int sz, int align)
+{
+ const struct btf_var_secinfo *vsi;
+ u32 i, off = -ENOENT;
+
+ for_each_vsi(i, t, vsi) {
+ const struct btf_type *var = btf_type_by_id(btf, vsi->type);
+ const struct btf_type *var_type = btf_type_by_id(btf, var->type);
+
+ if (!__btf_type_is_struct(var_type))
+ continue;
+ if (var_type->size != sz)
+ continue;
+ if (vsi->size != sz)
+ continue;
+ if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
+ continue;
+ if (off != -ENOENT)
+ /* only one such field is allowed */
+ return -E2BIG;
+ off = vsi->offset;
+ if (off % align)
return -EINVAL;
}
return off;
}
+static int btf_find_field(const struct btf *btf, const struct btf_type *t,
+ const char *name, int sz, int align)
+{
+
+ if (__btf_type_is_struct(t))
+ return btf_find_struct_field(btf, t, name, sz, align);
+ else if (btf_type_is_datasec(t))
+ return btf_find_datasec_var(btf, t, name, sz, align);
+ return -EINVAL;
+}
+
+/* find 'struct bpf_spin_lock' in map value.
+ * return >= 0 offset if found
+ * and < 0 in case of error
+ */
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+{
+ return btf_find_field(btf, t, "bpf_spin_lock",
+ sizeof(struct bpf_spin_lock),
+ __alignof__(struct bpf_spin_lock));
+}
+
+int btf_find_timer(const struct btf *btf, const struct btf_type *t)
+{
+ return btf_find_field(btf, t, "bpf_timer",
+ sizeof(struct bpf_timer),
+ __alignof__(struct bpf_timer));
+}
+
static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct btf_show *show)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 72c58cc516a3..6dc3fae46a56 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -228,6 +228,32 @@ static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
}
+static bool htab_has_extra_elems(struct bpf_htab *htab)
+{
+ return !htab_is_percpu(htab) && !htab_is_lru(htab);
+}
+
+static void htab_free_prealloced_timers(struct bpf_htab *htab)
+{
+ u32 num_entries = htab->map.max_entries;
+ int i;
+
+ if (likely(!map_value_has_timer(&htab->map)))
+ return;
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+
+ for (i = 0; i < num_entries; i++) {
+ struct htab_elem *elem;
+
+ elem = get_htab_elem(htab, i);
+ bpf_timer_cancel_and_free(elem->key +
+ round_up(htab->map.key_size, 8) +
+ htab->map.timer_off);
+ cond_resched();
+ }
+}
+
static void htab_free_elems(struct bpf_htab *htab)
{
int i;
@@ -265,8 +291,12 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
struct htab_elem *l;
if (node) {
+ u32 key_size = htab->map.key_size;
+
l = container_of(node, struct htab_elem, lru_node);
- memcpy(l->key, key, htab->map.key_size);
+ memcpy(l->key, key, key_size);
+ check_and_init_map_value(&htab->map,
+ l->key + round_up(key_size, 8));
return l;
}
@@ -278,7 +308,7 @@ static int prealloc_init(struct bpf_htab *htab)
u32 num_entries = htab->map.max_entries;
int err = -ENOMEM, i;
- if (!htab_is_percpu(htab) && !htab_is_lru(htab))
+ if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
@@ -695,6 +725,14 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
return insn - insn_buf;
}
+static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem)
+{
+ if (unlikely(map_value_has_timer(&htab->map)))
+ bpf_timer_cancel_and_free(elem->key +
+ round_up(htab->map.key_size, 8) +
+ htab->map.timer_off);
+}
+
/* It is called from the bpf_lru_list when the LRU needs to delete
* older elements from the htab.
*/
@@ -719,6 +757,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l == tgt_l) {
hlist_nulls_del_rcu(&l->hash_node);
+ check_and_free_timer(htab, l);
break;
}
@@ -790,6 +829,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
+ check_and_free_timer(htab, l);
kfree(l);
}
@@ -817,6 +857,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
htab_put_fd_value(htab, l);
if (htab_is_prealloc(htab)) {
+ check_and_free_timer(htab, l);
__pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
atomic_dec(&htab->count);
@@ -920,8 +961,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
- check_and_init_map_lock(&htab->map,
- l_new->key + round_up(key_size, 8));
+ check_and_init_map_value(&htab->map,
+ l_new->key + round_up(key_size, 8));
}
memcpy(l_new->key, key, key_size);
@@ -1062,6 +1103,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_nulls_del_rcu(&l_old->hash_node);
if (!htab_is_prealloc(htab))
free_htab_elem(htab, l_old);
+ else
+ check_and_free_timer(htab, l_old);
}
ret = 0;
err:
@@ -1069,6 +1112,12 @@ err:
return ret;
}
+static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
+{
+ check_and_free_timer(htab, elem);
+ bpf_lru_push_free(&htab->lru, &elem->lru_node);
+}
+
static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
@@ -1102,7 +1151,8 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
l_new = prealloc_lru_pop(htab, key, hash);
if (!l_new)
return -ENOMEM;
- memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
+ copy_map_value(&htab->map,
+ l_new->key + round_up(map->key_size, 8), value);
ret = htab_lock_bucket(htab, b, hash, &flags);
if (ret)
@@ -1128,9 +1178,9 @@ err:
htab_unlock_bucket(htab, b, hash, flags);
if (ret)
- bpf_lru_push_free(&htab->lru, &l_new->lru_node);
+ htab_lru_push_free(htab, l_new);
else if (l_old)
- bpf_lru_push_free(&htab->lru, &l_old->lru_node);
+ htab_lru_push_free(htab, l_old);
return ret;
}
@@ -1339,7 +1389,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
htab_unlock_bucket(htab, b, hash, flags);
if (l)
- bpf_lru_push_free(&htab->lru, &l->lru_node);
+ htab_lru_push_free(htab, l);
return ret;
}
@@ -1359,6 +1409,35 @@ static void delete_all_elements(struct bpf_htab *htab)
}
}
+static void htab_free_malloced_timers(struct bpf_htab *htab)
+{
+ int i;
+
+ rcu_read_lock();
+ for (i = 0; i < htab->n_buckets; i++) {
+ struct hlist_nulls_head *head = select_bucket(htab, i);
+ struct hlist_nulls_node *n;
+ struct htab_elem *l;
+
+ hlist_nulls_for_each_entry(l, n, head, hash_node)
+ check_and_free_timer(htab, l);
+ cond_resched_rcu();
+ }
+ rcu_read_unlock();
+}
+
+static void htab_map_free_timers(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ if (likely(!map_value_has_timer(&htab->map)))
+ return;
+ if (!htab_is_prealloc(htab))
+ htab_free_malloced_timers(htab);
+ else
+ htab_free_prealloced_timers(htab);
+}
+
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
@@ -1456,7 +1535,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
else
copy_map_value(map, value, l->key +
roundup_key_size);
- check_and_init_map_lock(map, value);
+ check_and_init_map_value(map, value);
}
hlist_nulls_del_rcu(&l->hash_node);
@@ -1467,7 +1546,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
htab_unlock_bucket(htab, b, hash, bflags);
if (is_lru_map && l)
- bpf_lru_push_free(&htab->lru, &l->lru_node);
+ htab_lru_push_free(htab, l);
return ret;
}
@@ -1645,7 +1724,7 @@ again_nocopy:
true);
else
copy_map_value(map, dst_val, value);
- check_and_init_map_lock(map, dst_val);
+ check_and_init_map_value(map, dst_val);
}
if (do_delete) {
hlist_nulls_del_rcu(&l->hash_node);
@@ -1672,7 +1751,7 @@ again_nocopy:
while (node_to_free) {
l = node_to_free;
node_to_free = node_to_free->batch_flink;
- bpf_lru_push_free(&htab->lru, &l->lru_node);
+ htab_lru_push_free(htab, l);
}
next_batch:
@@ -2034,6 +2113,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
+ .map_release_uref = htab_map_free_timers,
.map_lookup_elem = htab_map_lookup_elem,
.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
@@ -2055,6 +2135,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
+ .map_release_uref = htab_map_free_timers,
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 62cf00383910..9fe846ec6bd1 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -289,13 +289,18 @@ static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
static DEFINE_PER_CPU(unsigned long, irqsave_flags);
-notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+static inline void __bpf_spin_lock_irqsave(struct bpf_spin_lock *lock)
{
unsigned long flags;
local_irq_save(flags);
__bpf_spin_lock(lock);
__this_cpu_write(irqsave_flags, flags);
+}
+
+notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+{
+ __bpf_spin_lock_irqsave(lock);
return 0;
}
@@ -306,13 +311,18 @@ const struct bpf_func_proto bpf_spin_lock_proto = {
.arg1_type = ARG_PTR_TO_SPIN_LOCK,
};
-notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
{
unsigned long flags;
flags = __this_cpu_read(irqsave_flags);
__bpf_spin_unlock(lock);
local_irq_restore(flags);
+}
+
+notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+{
+ __bpf_spin_unlock_irqrestore(lock);
return 0;
}
@@ -333,9 +343,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
else
lock = dst + map->spin_lock_off;
preempt_disable();
- ____bpf_spin_lock(lock);
+ __bpf_spin_lock_irqsave(lock);
copy_map_value(map, dst, src);
- ____bpf_spin_unlock(lock);
+ __bpf_spin_unlock_irqrestore(lock);
preempt_enable();
}
@@ -989,6 +999,320 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
+/* BPF map elements can contain 'struct bpf_timer'.
+ * Such map owns all of its BPF timers.
+ * 'struct bpf_timer' is allocated as part of map element allocation
+ * and it's zero initialized.
+ * That space is used to keep 'struct bpf_timer_kern'.
+ * bpf_timer_init() allocates 'struct bpf_hrtimer', inits hrtimer, and
+ * remembers 'struct bpf_map *' pointer it's part of.
+ * bpf_timer_set_callback() increments prog refcnt and assign bpf callback_fn.
+ * bpf_timer_start() arms the timer.
+ * If user space reference to a map goes to zero at this point
+ * ops->map_release_uref callback is responsible for cancelling the timers,
+ * freeing their memory, and decrementing prog's refcnts.
+ * bpf_timer_cancel() cancels the timer and decrements prog's refcnt.
+ * Inner maps can contain bpf timers as well. ops->map_release_uref is
+ * freeing the timers when inner map is replaced or deleted by user space.
+ */
+struct bpf_hrtimer {
+ struct hrtimer timer;
+ struct bpf_map *map;
+ struct bpf_prog *prog;
+ void __rcu *callback_fn;
+ void *value;
+};
+
+/* the actual struct hidden inside uapi struct bpf_timer */
+struct bpf_timer_kern {
+ struct bpf_hrtimer *timer;
+ /* bpf_spin_lock is used here instead of spinlock_t to make
+ * sure that it always fits into space resereved by struct bpf_timer
+ * regardless of LOCKDEP and spinlock debug flags.
+ */
+ struct bpf_spin_lock lock;
+} __attribute__((aligned(8)));
+
+static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);
+
+static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
+{
+ struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
+ struct bpf_map *map = t->map;
+ void *value = t->value;
+ void *callback_fn;
+ void *key;
+ u32 idx;
+
+ callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
+ if (!callback_fn)
+ goto out;
+
+ /* bpf_timer_cb() runs in hrtimer_run_softirq. It doesn't migrate and
+ * cannot be preempted by another bpf_timer_cb() on the same cpu.
+ * Remember the timer this callback is servicing to prevent
+ * deadlock if callback_fn() calls bpf_timer_cancel() or
+ * bpf_map_delete_elem() on the same timer.
+ */
+ this_cpu_write(hrtimer_running, t);
+ if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ /* compute the key */
+ idx = ((char *)value - array->value) / array->elem_size;
+ key = &idx;
+ } else { /* hash or lru */
+ key = value - round_up(map->key_size, 8);
+ }
+
+ BPF_CAST_CALL(callback_fn)((u64)(long)map, (u64)(long)key,
+ (u64)(long)value, 0, 0);
+ /* The verifier checked that return value is zero. */
+
+ this_cpu_write(hrtimer_running, NULL);
+out:
+ return HRTIMER_NORESTART;
+}
+
+BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map,
+ u64, flags)
+{
+ clockid_t clockid = flags & (MAX_CLOCKS - 1);
+ struct bpf_hrtimer *t;
+ int ret = 0;
+
+ BUILD_BUG_ON(MAX_CLOCKS != 16);
+ BUILD_BUG_ON(sizeof(struct bpf_timer_kern) > sizeof(struct bpf_timer));
+ BUILD_BUG_ON(__alignof__(struct bpf_timer_kern) != __alignof__(struct bpf_timer));
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+
+ if (flags >= MAX_CLOCKS ||
+ /* similar to timerfd except _ALARM variants are not supported */
+ (clockid != CLOCK_MONOTONIC &&
+ clockid != CLOCK_REALTIME &&
+ clockid != CLOCK_BOOTTIME))
+ return -EINVAL;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (t) {
+ ret = -EBUSY;
+ goto out;
+ }
+ if (!atomic64_read(&map->usercnt)) {
+ /* maps with timers must be either held by user space
+ * or pinned in bpffs.
+ */
+ ret = -EPERM;
+ goto out;
+ }
+ /* allocate hrtimer via map_kmalloc to use memcg accounting */
+ t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node);
+ if (!t) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ t->value = (void *)timer - map->timer_off;
+ t->map = map;
+ t->prog = NULL;
+ rcu_assign_pointer(t->callback_fn, NULL);
+ hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
+ t->timer.function = bpf_timer_cb;
+ timer->timer = t;
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_init_proto = {
+ .func = bpf_timer_init,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn,
+ struct bpf_prog_aux *, aux)
+{
+ struct bpf_prog *prev, *prog = aux->prog;
+ struct bpf_hrtimer *t;
+ int ret = 0;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (!t) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!atomic64_read(&t->map->usercnt)) {
+ /* maps with timers must be either held by user space
+ * or pinned in bpffs. Otherwise timer might still be
+ * running even when bpf prog is detached and user space
+ * is gone, since map_release_uref won't ever be called.
+ */
+ ret = -EPERM;
+ goto out;
+ }
+ prev = t->prog;
+ if (prev != prog) {
+ /* Bump prog refcnt once. Every bpf_timer_set_callback()
+ * can pick different callback_fn-s within the same prog.
+ */
+ prog = bpf_prog_inc_not_zero(prog);
+ if (IS_ERR(prog)) {
+ ret = PTR_ERR(prog);
+ goto out;
+ }
+ if (prev)
+ /* Drop prev prog refcnt when swapping with new prog */
+ bpf_prog_put(prev);
+ t->prog = prog;
+ }
+ rcu_assign_pointer(t->callback_fn, callback_fn);
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_set_callback_proto = {
+ .func = bpf_timer_set_callback,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_PTR_TO_FUNC,
+};
+
+BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags)
+{
+ struct bpf_hrtimer *t;
+ int ret = 0;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ if (flags)
+ return -EINVAL;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (!t || !t->prog) {
+ ret = -EINVAL;
+ goto out;
+ }
+ hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT);
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_start_proto = {
+ .func = bpf_timer_start,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static void drop_prog_refcnt(struct bpf_hrtimer *t)
+{
+ struct bpf_prog *prog = t->prog;
+
+ if (prog) {
+ bpf_prog_put(prog);
+ t->prog = NULL;
+ rcu_assign_pointer(t->callback_fn, NULL);
+ }
+}
+
+BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
+{
+ struct bpf_hrtimer *t;
+ int ret = 0;
+
+ if (in_nmi())
+ return -EOPNOTSUPP;
+ __bpf_spin_lock_irqsave(&timer->lock);
+ t = timer->timer;
+ if (!t) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (this_cpu_read(hrtimer_running) == t) {
+ /* If bpf callback_fn is trying to bpf_timer_cancel()
+ * its own timer the hrtimer_cancel() will deadlock
+ * since it waits for callback_fn to finish
+ */
+ ret = -EDEADLK;
+ goto out;
+ }
+ drop_prog_refcnt(t);
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ /* Cancel the timer and wait for associated callback to finish
+ * if it was running.
+ */
+ ret = ret ?: hrtimer_cancel(&t->timer);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_timer_cancel_proto = {
+ .func = bpf_timer_cancel,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+};
+
+/* This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_timer_cancel_and_free(void *val)
+{
+ struct bpf_timer_kern *timer = val;
+ struct bpf_hrtimer *t;
+
+ /* Performance optimization: read timer->timer without lock first. */
+ if (!READ_ONCE(timer->timer))
+ return;
+
+ __bpf_spin_lock_irqsave(&timer->lock);
+ /* re-read it under lock */
+ t = timer->timer;
+ if (!t)
+ goto out;
+ drop_prog_refcnt(t);
+ /* The subsequent bpf_timer_start/cancel() helpers won't be able to use
+ * this timer, since it won't be initialized.
+ */
+ timer->timer = NULL;
+out:
+ __bpf_spin_unlock_irqrestore(&timer->lock);
+ if (!t)
+ return;
+ /* Cancel the timer and wait for callback to complete if it was running.
+ * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
+ * right after for both preallocated and non-preallocated maps.
+ * The timer->timer = NULL was already done and no code path can
+ * see address 't' anymore.
+ *
+ * Check that bpf_map_delete/update_elem() wasn't called from timer
+ * callback_fn. In such case don't call hrtimer_cancel() (since it will
+ * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
+ * return -1). Though callback_fn is still running on this cpu it's
+ * safe to do kfree(t) because bpf_timer_cb() read everything it needed
+ * from 't'. The bpf subprog callback_fn won't be able to access 't',
+ * since timer->timer = NULL was already done. The timer will be
+ * effectively cancelled because bpf_timer_cb() will return
+ * HRTIMER_NORESTART.
+ */
+ if (this_cpu_read(hrtimer_running) != t)
+ hrtimer_cancel(&t->timer);
+ kfree(t);
+}
+
const struct bpf_func_proto bpf_get_current_task_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
@@ -1055,6 +1379,14 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_per_cpu_ptr_proto;
case BPF_FUNC_this_cpu_ptr:
return &bpf_this_cpu_ptr_proto;
+ case BPF_FUNC_timer_init:
+ return &bpf_timer_init_proto;
+ case BPF_FUNC_timer_set_callback:
+ return &bpf_timer_set_callback_proto;
+ case BPF_FUNC_timer_start:
+ return &bpf_timer_start_proto;
+ case BPF_FUNC_timer_cancel:
+ return &bpf_timer_cancel_proto;
default:
break;
}
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index bd11db9774c3..95d70a08325d 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -173,7 +173,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
return -ENOMEM;
memcpy(&new->data[0], value, map->value_size);
- check_and_init_map_lock(map, new->data);
+ check_and_init_map_value(map, new->data);
new = xchg(&storage->buf, new);
kfree_rcu(new, rcu);
@@ -509,7 +509,7 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
map->numa_node);
if (!storage->buf)
goto enomem;
- check_and_init_map_lock(map, storage->buf->data);
+ check_and_init_map_value(map, storage->buf->data);
} else {
storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp);
if (!storage->percpu_buf)
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 39ab0b68cade..5cd8f5277279 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -3,6 +3,7 @@
*/
#include <linux/slab.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
#include "map_in_map.h"
@@ -50,6 +51,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta->map_flags = inner_map->map_flags;
inner_map_meta->max_entries = inner_map->max_entries;
inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
+ inner_map_meta->timer_off = inner_map->timer_off;
+ if (inner_map->btf) {
+ btf_get(inner_map->btf);
+ inner_map_meta->btf = inner_map->btf;
+ }
/* Misc members not needed in bpf_map_meta_equal() check. */
inner_map_meta->ops = inner_map->ops;
@@ -65,6 +71,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
void bpf_map_meta_free(struct bpf_map *map_meta)
{
+ btf_put(map_meta->btf);
kfree(map_meta);
}
@@ -75,6 +82,7 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
return meta0->map_type == meta1->map_type &&
meta0->key_size == meta1->key_size &&
meta0->value_size == meta1->value_size &&
+ meta0->timer_off == meta1->timer_off &&
meta0->map_flags == meta1->map_flags;
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e343f158e556..9a2068e39d23 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -260,8 +260,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
copy_map_value_locked(map, value, ptr, true);
else
copy_map_value(map, value, ptr);
- /* mask lock, since value wasn't zero inited */
- check_and_init_map_lock(map, value);
+ /* mask lock and timer, since value wasn't zero inited */
+ check_and_init_map_value(map, value);
}
rcu_read_unlock();
}
@@ -623,7 +623,8 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
struct bpf_map *map = filp->private_data;
int err;
- if (!map->ops->map_mmap || map_value_has_spin_lock(map))
+ if (!map->ops->map_mmap || map_value_has_spin_lock(map) ||
+ map_value_has_timer(map))
return -ENOTSUPP;
if (!(vma->vm_flags & VM_SHARED))
@@ -793,6 +794,16 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
}
}
+ map->timer_off = btf_find_timer(btf, value_type);
+ if (map_value_has_timer(map)) {
+ if (map->map_flags & BPF_F_RDONLY_PROG)
+ return -EACCES;
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY)
+ return -EOPNOTSUPP;
+ }
+
if (map->ops->map_check_btf)
ret = map->ops->map_check_btf(map, btf, key_type, value_type);
@@ -844,6 +855,7 @@ static int map_create(union bpf_attr *attr)
mutex_init(&map->freeze_mutex);
map->spin_lock_off = -EINVAL;
+ map->timer_off = -EINVAL;
if (attr->btf_key_type_id || attr->btf_value_type_id ||
/* Even the map's value is a kernel's struct,
* the bpf_prog.o must have BTF to begin with
@@ -1591,7 +1603,8 @@ static int map_freeze(const union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
+ map_value_has_timer(map)) {
fdput(f);
return -ENOTSUPP;
}
@@ -1699,6 +1712,8 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
{
+ unsigned long flags;
+
/* cBPF to eBPF migrations are currently not in the idr store.
* Offloaded programs are removed from the store when their device
* disappears - even if someone grabs an fd to them they are unusable,
@@ -1708,7 +1723,7 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
return;
if (do_idr_lock)
- spin_lock_bh(&prog_idr_lock);
+ spin_lock_irqsave(&prog_idr_lock, flags);
else
__acquire(&prog_idr_lock);
@@ -1716,7 +1731,7 @@ void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
prog->aux->id = 0;
if (do_idr_lock)
- spin_unlock_bh(&prog_idr_lock);
+ spin_unlock_irqrestore(&prog_idr_lock, flags);
else
__release(&prog_idr_lock);
}
@@ -1752,14 +1767,32 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
}
}
+static void bpf_prog_put_deferred(struct work_struct *work)
+{
+ struct bpf_prog_aux *aux;
+ struct bpf_prog *prog;
+
+ aux = container_of(work, struct bpf_prog_aux, work);
+ prog = aux->prog;
+ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
+ bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
+ __bpf_prog_put_noref(prog, true);
+}
+
static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
- if (atomic64_dec_and_test(&prog->aux->refcnt)) {
- perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
- bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
+ struct bpf_prog_aux *aux = prog->aux;
+
+ if (atomic64_dec_and_test(&aux->refcnt)) {
/* bpf_prog_free_id() must be called first */
bpf_prog_free_id(prog, do_idr_lock);
- __bpf_prog_put_noref(prog, true);
+
+ if (in_irq() || irqs_disabled()) {
+ INIT_WORK(&aux->work, bpf_prog_put_deferred);
+ schedule_work(&aux->work);
+ } else {
+ bpf_prog_put_deferred(&aux->work);
+ }
}
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3dbb3b40b754..344ee67265cc 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -255,6 +255,7 @@ struct bpf_call_arg_meta {
int mem_size;
u64 msize_max_value;
int ref_obj_id;
+ int map_uid;
int func_id;
struct btf *btf;
u32 btf_id;
@@ -734,6 +735,10 @@ static void print_verifier_state(struct bpf_verifier_env *env,
if (state->refs[i].id)
verbose(env, ",%d", state->refs[i].id);
}
+ if (state->in_callback_fn)
+ verbose(env, " cb");
+ if (state->in_async_callback_fn)
+ verbose(env, " async_cb");
verbose(env, "\n");
}
@@ -1135,6 +1140,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
if (map->inner_map_meta) {
reg->type = CONST_PTR_TO_MAP;
reg->map_ptr = map->inner_map_meta;
+ /* transfer reg's id which is unique for every map_lookup_elem
+ * as UID of the inner map.
+ */
+ reg->map_uid = reg->id;
} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
@@ -1522,6 +1531,54 @@ static void init_func_state(struct bpf_verifier_env *env,
init_reg_state(env, state);
}
+/* Similar to push_stack(), but for async callbacks */
+static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
+ int insn_idx, int prev_insn_idx,
+ int subprog)
+{
+ struct bpf_verifier_stack_elem *elem;
+ struct bpf_func_state *frame;
+
+ elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
+ if (!elem)
+ goto err;
+
+ elem->insn_idx = insn_idx;
+ elem->prev_insn_idx = prev_insn_idx;
+ elem->next = env->head;
+ elem->log_pos = env->log.len_used;
+ env->head = elem;
+ env->stack_size++;
+ if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
+ verbose(env,
+ "The sequence of %d jumps is too complex for async cb.\n",
+ env->stack_size);
+ goto err;
+ }
+ /* Unlike push_stack() do not copy_verifier_state().
+ * The caller state doesn't matter.
+ * This is async callback. It starts in a fresh stack.
+ * Initialize it similar to do_check_common().
+ */
+ elem->st.branches = 1;
+ frame = kzalloc(sizeof(*frame), GFP_KERNEL);
+ if (!frame)
+ goto err;
+ init_func_state(env, frame,
+ BPF_MAIN_FUNC /* callsite */,
+ 0 /* frameno within this callchain */,
+ subprog /* subprog number within this prog */);
+ elem->st.frame[0] = frame;
+ return &elem->st;
+err:
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
+ /* pop all elements and return */
+ while (!pop_stack(env, NULL, NULL, false));
+ return NULL;
+}
+
+
enum reg_arg_type {
SRC_OP, /* register is used as source operand */
DST_OP, /* register is used as destination operand */
@@ -3241,6 +3298,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
}
+ if (map_value_has_timer(map)) {
+ u32 t = map->timer_off;
+
+ if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&
+ t < reg->umax_value + off + size) {
+ verbose(env, "bpf_timer cannot be accessed directly by load/store\n");
+ return -EACCES;
+ }
+ }
return err;
}
@@ -3643,6 +3709,8 @@ process_func:
continue_func:
subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
+ int next_insn;
+
if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
continue;
/* remember insn and function to return to */
@@ -3650,13 +3718,22 @@ continue_func:
ret_prog[frame] = idx;
/* find the callee */
- i = i + insn[i].imm + 1;
- idx = find_subprog(env, i);
+ next_insn = i + insn[i].imm + 1;
+ idx = find_subprog(env, next_insn);
if (idx < 0) {
WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
- i);
+ next_insn);
return -EFAULT;
}
+ if (subprog[idx].is_async_cb) {
+ if (subprog[idx].has_tail_call) {
+ verbose(env, "verifier bug. subprog has tail_call and async cb\n");
+ return -EFAULT;
+ }
+ /* async callbacks don't increase bpf prog stack size */
+ continue;
+ }
+ i = next_insn;
if (subprog[idx].has_tail_call)
tail_call_reachable = true;
@@ -4656,6 +4733,54 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
return 0;
}
+static int process_timer_func(struct bpf_verifier_env *env, int regno,
+ struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ bool is_const = tnum_is_const(reg->var_off);
+ struct bpf_map *map = reg->map_ptr;
+ u64 val = reg->var_off.value;
+
+ if (!is_const) {
+ verbose(env,
+ "R%d doesn't have constant offset. bpf_timer has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
+ if (!map->btf) {
+ verbose(env, "map '%s' has to have BTF in order to use bpf_timer\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (!map_value_has_timer(map)) {
+ if (map->timer_off == -E2BIG)
+ verbose(env,
+ "map '%s' has more than one 'struct bpf_timer'\n",
+ map->name);
+ else if (map->timer_off == -ENOENT)
+ verbose(env,
+ "map '%s' doesn't have 'struct bpf_timer'\n",
+ map->name);
+ else
+ verbose(env,
+ "map '%s' is not a struct type or bpf_timer is mangled\n",
+ map->name);
+ return -EINVAL;
+ }
+ if (map->timer_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
+ val + reg->off, map->timer_off);
+ return -EINVAL;
+ }
+ if (meta->map_ptr) {
+ verbose(env, "verifier bug. Two map pointers in a timer helper\n");
+ return -EFAULT;
+ }
+ meta->map_uid = reg->map_uid;
+ meta->map_ptr = map;
+ return 0;
+}
+
static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
{
return type == ARG_PTR_TO_MEM ||
@@ -4788,6 +4913,7 @@ static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PER
static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
@@ -4819,6 +4945,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_FUNC] = &func_ptr_types,
[ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types,
[ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
+ [ARG_PTR_TO_TIMER] = &timer_types,
};
static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -4948,7 +5075,29 @@ skip_type_check:
if (arg_type == ARG_CONST_MAP_PTR) {
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
+ if (meta->map_ptr) {
+ /* Use map_uid (which is unique id of inner map) to reject:
+ * inner_map1 = bpf_map_lookup_elem(outer_map, key1)
+ * inner_map2 = bpf_map_lookup_elem(outer_map, key2)
+ * if (inner_map1 && inner_map2) {
+ * timer = bpf_map_lookup_elem(inner_map1);
+ * if (timer)
+ * // mismatch would have been allowed
+ * bpf_timer_init(timer, inner_map2);
+ * }
+ *
+ * Comparing map_ptr is enough to distinguish normal and outer maps.
+ */
+ if (meta->map_ptr != reg->map_ptr ||
+ meta->map_uid != reg->map_uid) {
+ verbose(env,
+ "timer pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n",
+ meta->map_uid, reg->map_uid);
+ return -EINVAL;
+ }
+ }
meta->map_ptr = reg->map_ptr;
+ meta->map_uid = reg->map_uid;
} else if (arg_type == ARG_PTR_TO_MAP_KEY) {
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
@@ -5000,6 +5149,9 @@ skip_type_check:
verbose(env, "verifier internal error\n");
return -EFAULT;
}
+ } else if (arg_type == ARG_PTR_TO_TIMER) {
+ if (process_timer_func(env, regno, meta))
+ return -EACCES;
} else if (arg_type == ARG_PTR_TO_FUNC) {
meta->subprogno = reg->subprogno;
} else if (arg_type_is_mem_ptr(arg_type)) {
@@ -5615,6 +5767,31 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
}
+ if (insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->imm == BPF_FUNC_timer_set_callback) {
+ struct bpf_verifier_state *async_cb;
+
+ /* there is no real recursion here. timer callbacks are async */
+ env->subprog_info[subprog].is_async_cb = true;
+ async_cb = push_async_cb(env, env->subprog_info[subprog].start,
+ *insn_idx, subprog);
+ if (!async_cb)
+ return -EFAULT;
+ callee = async_cb->frame[0];
+ callee->async_entry_cnt = caller->async_entry_cnt + 1;
+
+ /* Convert bpf_timer_set_callback() args into timer callback args */
+ err = set_callee_state_cb(env, caller, callee, *insn_idx);
+ if (err)
+ return err;
+
+ clear_caller_saved_regs(env, caller->regs);
+ mark_reg_unknown(env, caller->regs, BPF_REG_0);
+ caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+ /* continue with next insn after call */
+ return 0;
+ }
+
callee = kzalloc(sizeof(*callee), GFP_KERNEL);
if (!callee)
return -ENOMEM;
@@ -5742,6 +5919,35 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
return 0;
}
+static int set_timer_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ struct bpf_map *map_ptr = caller->regs[BPF_REG_1].map_ptr;
+
+ /* bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn);
+ * callback_fn(struct bpf_map *map, void *key, void *value);
+ */
+ callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+ callee->regs[BPF_REG_1].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].map_ptr = map_ptr;
+
+ callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
+ callee->regs[BPF_REG_3].map_ptr = map_ptr;
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_async_callback_fn = true;
+ return 0;
+}
+
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
@@ -6069,6 +6275,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
+ if (func_id == BPF_FUNC_timer_set_callback) {
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_timer_callback_state);
+ if (err < 0)
+ return -EINVAL;
+ }
+
if (func_id == BPF_FUNC_snprintf) {
err = check_bpf_snprintf_call(env, regs);
if (err < 0)
@@ -6104,6 +6317,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
+ regs[BPF_REG_0].map_uid = meta.map_uid;
if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
if (map_value_has_spin_lock(meta.map_ptr))
@@ -9099,7 +9313,8 @@ static int check_return_code(struct bpf_verifier_env *env)
struct tnum range = tnum_range(0, 1);
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
int err;
- const bool is_subprog = env->cur_state->frame[0]->subprogno;
+ struct bpf_func_state *frame = env->cur_state->frame[0];
+ const bool is_subprog = frame->subprogno;
/* LSM and struct_ops func-ptr's return type could be "void" */
if (!is_subprog &&
@@ -9124,6 +9339,22 @@ static int check_return_code(struct bpf_verifier_env *env)
}
reg = cur_regs(env) + BPF_REG_0;
+
+ if (frame->in_async_callback_fn) {
+ /* enforce return zero from async callbacks like timer */
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "In async callback the register R0 is not a known value (%s)\n",
+ reg_type_str[reg->type]);
+ return -EINVAL;
+ }
+
+ if (!tnum_in(tnum_const(0), reg->var_off)) {
+ verbose_invalid_scalar(env, reg, &range, "async callback", "R0");
+ return -EINVAL;
+ }
+ return 0;
+ }
+
if (is_subprog) {
if (reg->type != SCALAR_VALUE) {
verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
@@ -9338,8 +9569,12 @@ static int visit_func_call_insn(int t, int insn_cnt,
init_explored_state(env, t + 1);
if (visit_callee) {
init_explored_state(env, t);
- ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
- env, false);
+ ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env,
+ /* It's ok to allow recursion from CFG point of
+ * view. __check_func_call() will do the actual
+ * check.
+ */
+ bpf_pseudo_func(insns + t));
}
return ret;
}
@@ -9367,6 +9602,13 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
return DONE_EXPLORING;
case BPF_CALL:
+ if (insns[t].imm == BPF_FUNC_timer_set_callback)
+ /* Mark this call insn to trigger is_state_visited() check
+ * before call itself is processed by __check_func_call().
+ * Otherwise new async state will be pushed for further
+ * exploration.
+ */
+ init_explored_state(env, t);
return visit_func_call_insn(t, insn_cnt, insns, env,
insns[t].src_reg == BPF_PSEUDO_CALL);
@@ -10374,9 +10616,25 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
states_cnt++;
if (sl->state.insn_idx != insn_idx)
goto next;
+
if (sl->state.branches) {
- if (states_maybe_looping(&sl->state, cur) &&
- states_equal(env, &sl->state, cur)) {
+ struct bpf_func_state *frame = sl->state.frame[sl->state.curframe];
+
+ if (frame->in_async_callback_fn &&
+ frame->async_entry_cnt != cur->frame[cur->curframe]->async_entry_cnt) {
+ /* Different async_entry_cnt means that the verifier is
+ * processing another entry into async callback.
+ * Seeing the same state is not an indication of infinite
+ * loop or infinite recursion.
+ * But finding the same state doesn't mean that it's safe
+ * to stop processing the current state. The previous state
+ * hasn't yet reached bpf_exit, since state.branches > 0.
+ * Checking in_async_callback_fn alone is not enough either.
+ * Since the verifier still needs to catch infinite loops
+ * inside async callbacks.
+ */
+ } else if (states_maybe_looping(&sl->state, cur) &&
+ states_equal(env, &sl->state, cur)) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
return -EINVAL;
@@ -12591,6 +12849,39 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
continue;
}
+ if (insn->imm == BPF_FUNC_timer_set_callback) {
+ /* The verifier will process callback_fn as many times as necessary
+ * with different maps and the register states prepared by
+ * set_timer_callback_state will be accurate.
+ *
+ * The following use case is valid:
+ * map1 is shared by prog1, prog2, prog3.
+ * prog1 calls bpf_timer_init for some map1 elements
+ * prog2 calls bpf_timer_set_callback for some map1 elements.
+ * Those that were not bpf_timer_init-ed will return -EINVAL.
+ * prog3 calls bpf_timer_start for some map1 elements.
+ * Those that were not both bpf_timer_init-ed and
+ * bpf_timer_set_callback-ed will return -EINVAL.
+ */
+ struct bpf_insn ld_addrs[2] = {
+ BPF_LD_IMM64(BPF_REG_3, (long)prog->aux),
+ };
+
+ insn_buf[0] = ld_addrs[0];
+ insn_buf[1] = ld_addrs[1];
+ insn_buf[2] = *insn;
+ cnt = 3;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
* and other inlining handlers are currently limited to 64 bit
* only.
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 64bd2d84367f..6c77d25137e0 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1059,7 +1059,7 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_snprintf:
return &bpf_snprintf_proto;
default:
- return NULL;
+ return bpf_base_func_proto(func_id);
}
}
diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py
index 2d94025b38e9..00ac7b79cddb 100755
--- a/scripts/bpf_doc.py
+++ b/scripts/bpf_doc.py
@@ -547,6 +547,7 @@ class PrinterHelpers(Printer):
'struct inode',
'struct socket',
'struct file',
+ 'struct bpf_timer',
]
known_types = {
'...',
@@ -594,6 +595,7 @@ class PrinterHelpers(Printer):
'struct inode',
'struct socket',
'struct file',
+ 'struct bpf_timer',
}
mapped_types = {
'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index bafb6282032b..3544ec5234f0 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4777,6 +4777,70 @@ union bpf_attr {
* Execute close syscall for given FD.
* Return
* A syscall result.
+ *
+ * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags)
+ * Description
+ * Initialize the timer.
+ * First 4 bits of *flags* specify clockid.
+ * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed.
+ * All other bits of *flags* are reserved.
+ * The verifier will reject the program if *timer* is not from
+ * the same *map*.
+ * Return
+ * 0 on success.
+ * **-EBUSY** if *timer* is already initialized.
+ * **-EINVAL** if invalid *flags* are passed.
+ * **-EPERM** if *timer* is in a map that doesn't have any user references.
+ * The user space should either hold a file descriptor to a map with timers
+ * or pin such map in bpffs. When map is unpinned or file descriptor is
+ * closed all timers in the map will be cancelled and freed.
+ *
+ * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn)
+ * Description
+ * Configure the timer to call *callback_fn* static function.
+ * Return
+ * 0 on success.
+ * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
+ * **-EPERM** if *timer* is in a map that doesn't have any user references.
+ * The user space should either hold a file descriptor to a map with timers
+ * or pin such map in bpffs. When map is unpinned or file descriptor is
+ * closed all timers in the map will be cancelled and freed.
+ *
+ * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags)
+ * Description
+ * Set timer expiration N nanoseconds from the current time. The
+ * configured callback will be invoked in soft irq context on some cpu
+ * and will not repeat unless another bpf_timer_start() is made.
+ * In such case the next invocation can migrate to a different cpu.
+ * Since struct bpf_timer is a field inside map element the map
+ * owns the timer. The bpf_timer_set_callback() will increment refcnt
+ * of BPF program to make sure that callback_fn code stays valid.
+ * When user space reference to a map reaches zero all timers
+ * in a map are cancelled and corresponding program's refcnts are
+ * decremented. This is done to make sure that Ctrl-C of a user
+ * process doesn't leave any timers running. If map is pinned in
+ * bpffs the callback_fn can re-arm itself indefinitely.
+ * bpf_map_update/delete_elem() helpers and user space sys_bpf commands
+ * cancel and free the timer in the given map element.
+ * The map can contain timers that invoke callback_fn-s from different
+ * programs. The same callback_fn can serve different timers from
+ * different maps if key/value layout matches across maps.
+ * Every bpf_timer_set_callback() can have different callback_fn.
+ *
+ * Return
+ * 0 on success.
+ * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier
+ * or invalid *flags* are passed.
+ *
+ * long bpf_timer_cancel(struct bpf_timer *timer)
+ * Description
+ * Cancel the timer and wait for callback_fn to finish if it was running.
+ * Return
+ * 0 if the timer was not active.
+ * 1 if the timer was active.
+ * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
+ * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its
+ * own timer which would have led to a deadlock otherwise.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -4948,6 +5012,10 @@ union bpf_attr {
FN(sys_bpf), \
FN(btf_find_by_name_kind), \
FN(sys_close), \
+ FN(timer_init), \
+ FN(timer_set_callback), \
+ FN(timer_start), \
+ FN(timer_cancel), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -6074,6 +6142,11 @@ struct bpf_spin_lock {
__u32 val;
};
+struct bpf_timer {
+ __u64 :64;
+ __u64 :64;
+} __attribute__((aligned(8)));
+
struct bpf_sysctl {
__u32 write; /* Sysctl is being read (= 0) or written (= 1).
* Allows 1,2,4-byte read, but no write.
diff --git a/tools/testing/selftests/bpf/prog_tests/timer.c b/tools/testing/selftests/bpf/prog_tests/timer.c
new file mode 100644
index 000000000000..25f40e1b9967
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/timer.c
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <test_progs.h>
+#include "timer.skel.h"
+
+static int timer(struct timer *timer_skel)
+{
+ int err, prog_fd;
+ __u32 duration = 0, retval;
+
+ err = timer__attach(timer_skel);
+ if (!ASSERT_OK(err, "timer_attach"))
+ return err;
+
+ ASSERT_EQ(timer_skel->data->callback_check, 52, "callback_check1");
+ ASSERT_EQ(timer_skel->data->callback2_check, 52, "callback2_check1");
+
+ prog_fd = bpf_program__fd(timer_skel->progs.test1);
+ err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
+ NULL, NULL, &retval, &duration);
+ ASSERT_OK(err, "test_run");
+ ASSERT_EQ(retval, 0, "test_run");
+ timer__detach(timer_skel);
+
+ usleep(50); /* 10 usecs should be enough, but give it extra */
+ /* check that timer_cb1() was executed 10+10 times */
+ ASSERT_EQ(timer_skel->data->callback_check, 42, "callback_check2");
+ ASSERT_EQ(timer_skel->data->callback2_check, 42, "callback2_check2");
+
+ /* check that timer_cb2() was executed twice */
+ ASSERT_EQ(timer_skel->bss->bss_data, 10, "bss_data");
+
+ /* check that there were no errors in timer execution */
+ ASSERT_EQ(timer_skel->bss->err, 0, "err");
+
+ /* check that code paths completed */
+ ASSERT_EQ(timer_skel->bss->ok, 1 | 2 | 4, "ok");
+
+ return 0;
+}
+
+void test_timer(void)
+{
+ struct timer *timer_skel = NULL;
+ int err;
+
+ timer_skel = timer__open_and_load();
+ if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load"))
+ goto cleanup;
+
+ err = timer(timer_skel);
+ ASSERT_OK(err, "timer");
+cleanup:
+ timer__destroy(timer_skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/timer_mim.c b/tools/testing/selftests/bpf/prog_tests/timer_mim.c
new file mode 100644
index 000000000000..f5acbcbe33a4
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/timer_mim.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <test_progs.h>
+#include "timer_mim.skel.h"
+#include "timer_mim_reject.skel.h"
+
+static int timer_mim(struct timer_mim *timer_skel)
+{
+ __u32 duration = 0, retval;
+ __u64 cnt1, cnt2;
+ int err, prog_fd, key1 = 1;
+
+ err = timer_mim__attach(timer_skel);
+ if (!ASSERT_OK(err, "timer_attach"))
+ return err;
+
+ prog_fd = bpf_program__fd(timer_skel->progs.test1);
+ err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
+ NULL, NULL, &retval, &duration);
+ ASSERT_OK(err, "test_run");
+ ASSERT_EQ(retval, 0, "test_run");
+ timer_mim__detach(timer_skel);
+
+ /* check that timer_cb[12] are incrementing 'cnt' */
+ cnt1 = READ_ONCE(timer_skel->bss->cnt);
+ usleep(200); /* 100 times more than interval */
+ cnt2 = READ_ONCE(timer_skel->bss->cnt);
+ ASSERT_GT(cnt2, cnt1, "cnt");
+
+ ASSERT_EQ(timer_skel->bss->err, 0, "err");
+ /* check that code paths completed */
+ ASSERT_EQ(timer_skel->bss->ok, 1 | 2, "ok");
+
+ close(bpf_map__fd(timer_skel->maps.inner_htab));
+ err = bpf_map_delete_elem(bpf_map__fd(timer_skel->maps.outer_arr), &key1);
+ ASSERT_EQ(err, 0, "delete inner map");
+
+ /* check that timer_cb[12] are no longer running */
+ cnt1 = READ_ONCE(timer_skel->bss->cnt);
+ usleep(200);
+ cnt2 = READ_ONCE(timer_skel->bss->cnt);
+ ASSERT_EQ(cnt2, cnt1, "cnt");
+
+ return 0;
+}
+
+void test_timer_mim(void)
+{
+ struct timer_mim_reject *timer_reject_skel = NULL;
+ libbpf_print_fn_t old_print_fn = NULL;
+ struct timer_mim *timer_skel = NULL;
+ int err;
+
+ old_print_fn = libbpf_set_print(NULL);
+ timer_reject_skel = timer_mim_reject__open_and_load();
+ libbpf_set_print(old_print_fn);
+ if (!ASSERT_ERR_PTR(timer_reject_skel, "timer_reject_skel_load"))
+ goto cleanup;
+
+ timer_skel = timer_mim__open_and_load();
+ if (!ASSERT_OK_PTR(timer_skel, "timer_skel_load"))
+ goto cleanup;
+
+ err = timer_mim(timer_skel);
+ ASSERT_OK(err, "timer_mim");
+cleanup:
+ timer_mim__destroy(timer_skel);
+ timer_mim_reject__destroy(timer_reject_skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c
new file mode 100644
index 000000000000..5f5309791649
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/timer.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <linux/bpf.h>
+#include <time.h>
+#include <errno.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_tcp_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+struct hmap_elem {
+ int counter;
+ struct bpf_timer timer;
+ struct bpf_spin_lock lock; /* unused */
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1000);
+ __type(key, int);
+ __type(value, struct hmap_elem);
+} hmap SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __uint(max_entries, 1000);
+ __type(key, int);
+ __type(value, struct hmap_elem);
+} hmap_malloc SEC(".maps");
+
+struct elem {
+ struct bpf_timer t;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 2);
+ __type(key, int);
+ __type(value, struct elem);
+} array SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_LRU_HASH);
+ __uint(max_entries, 4);
+ __type(key, int);
+ __type(value, struct elem);
+} lru SEC(".maps");
+
+__u64 bss_data;
+__u64 err;
+__u64 ok;
+__u64 callback_check = 52;
+__u64 callback2_check = 52;
+
+#define ARRAY 1
+#define HTAB 2
+#define HTAB_MALLOC 3
+#define LRU 4
+
+/* callback for array and lru timers */
+static int timer_cb1(void *map, int *key, struct bpf_timer *timer)
+{
+ /* increment bss variable twice.
+ * Once via array timer callback and once via lru timer callback
+ */
+ bss_data += 5;
+
+ /* *key == 0 - the callback was called for array timer.
+ * *key == 4 - the callback was called from lru timer.
+ */
+ if (*key == ARRAY) {
+ struct bpf_timer *lru_timer;
+ int lru_key = LRU;
+
+ /* rearm array timer to be called again in ~35 seconds */
+ if (bpf_timer_start(timer, 1ull << 35, 0) != 0)
+ err |= 1;
+
+ lru_timer = bpf_map_lookup_elem(&lru, &lru_key);
+ if (!lru_timer)
+ return 0;
+ bpf_timer_set_callback(lru_timer, timer_cb1);
+ if (bpf_timer_start(lru_timer, 0, 0) != 0)
+ err |= 2;
+ } else if (*key == LRU) {
+ int lru_key, i;
+
+ for (i = LRU + 1;
+ i <= 100 /* for current LRU eviction algorithm this number
+ * should be larger than ~ lru->max_entries * 2
+ */;
+ i++) {
+ struct elem init = {};
+
+ /* lru_key cannot be used as loop induction variable
+ * otherwise the loop will be unbounded.
+ */
+ lru_key = i;
+
+ /* add more elements into lru map to push out current
+ * element and force deletion of this timer
+ */
+ bpf_map_update_elem(map, &lru_key, &init, 0);
+ /* look it up to bump it into active list */
+ bpf_map_lookup_elem(map, &lru_key);
+
+ /* keep adding until *key changes underneath,
+ * which means that key/timer memory was reused
+ */
+ if (*key != LRU)
+ break;
+ }
+
+ /* check that the timer was removed */
+ if (bpf_timer_cancel(timer) != -EINVAL)
+ err |= 4;
+ ok |= 1;
+ }
+ return 0;
+}
+
+SEC("fentry/bpf_fentry_test1")
+int BPF_PROG(test1, int a)
+{
+ struct bpf_timer *arr_timer, *lru_timer;
+ struct elem init = {};
+ int lru_key = LRU;
+ int array_key = ARRAY;
+
+ arr_timer = bpf_map_lookup_elem(&array, &array_key);
+ if (!arr_timer)
+ return 0;
+ bpf_timer_init(arr_timer, &array, CLOCK_MONOTONIC);
+
+ bpf_map_update_elem(&lru, &lru_key, &init, 0);
+ lru_timer = bpf_map_lookup_elem(&lru, &lru_key);
+ if (!lru_timer)
+ return 0;
+ bpf_timer_init(lru_timer, &lru, CLOCK_MONOTONIC);
+
+ bpf_timer_set_callback(arr_timer, timer_cb1);
+ bpf_timer_start(arr_timer, 0 /* call timer_cb1 asap */, 0);
+
+ /* init more timers to check that array destruction
+ * doesn't leak timer memory.
+ */
+ array_key = 0;
+ arr_timer = bpf_map_lookup_elem(&array, &array_key);
+ if (!arr_timer)
+ return 0;
+ bpf_timer_init(arr_timer, &array, CLOCK_MONOTONIC);
+ return 0;
+}
+
+/* callback for prealloc and non-prealloca hashtab timers */
+static int timer_cb2(void *map, int *key, struct hmap_elem *val)
+{
+ if (*key == HTAB)
+ callback_check--;
+ else
+ callback2_check--;
+ if (val->counter > 0 && --val->counter) {
+ /* re-arm the timer again to execute after 1 usec */
+ bpf_timer_start(&val->timer, 1000, 0);
+ } else if (*key == HTAB) {
+ struct bpf_timer *arr_timer;
+ int array_key = ARRAY;
+
+ /* cancel arr_timer otherwise bpf_fentry_test1 prog
+ * will stay alive forever.
+ */
+ arr_timer = bpf_map_lookup_elem(&array, &array_key);
+ if (!arr_timer)
+ return 0;
+ if (bpf_timer_cancel(arr_timer) != 1)
+ /* bpf_timer_cancel should return 1 to indicate
+ * that arr_timer was active at this time
+ */
+ err |= 8;
+
+ /* try to cancel ourself. It shouldn't deadlock. */
+ if (bpf_timer_cancel(&val->timer) != -EDEADLK)
+ err |= 16;
+
+ /* delete this key and this timer anyway.
+ * It shouldn't deadlock either.
+ */
+ bpf_map_delete_elem(map, key);
+
+ /* in preallocated hashmap both 'key' and 'val' could have been
+ * reused to store another map element (like in LRU above),
+ * but in controlled test environment the below test works.
+ * It's not a use-after-free. The memory is owned by the map.
+ */
+ if (bpf_timer_start(&val->timer, 1000, 0) != -EINVAL)
+ err |= 32;
+ ok |= 2;
+ } else {
+ if (*key != HTAB_MALLOC)
+ err |= 64;
+
+ /* try to cancel ourself. It shouldn't deadlock. */
+ if (bpf_timer_cancel(&val->timer) != -EDEADLK)
+ err |= 128;
+
+ /* delete this key and this timer anyway.
+ * It shouldn't deadlock either.
+ */
+ bpf_map_delete_elem(map, key);
+
+ /* in non-preallocated hashmap both 'key' and 'val' are RCU
+ * protected and still valid though this element was deleted
+ * from the map. Arm this timer for ~35 seconds. When callback
+ * finishes the call_rcu will invoke:
+ * htab_elem_free_rcu
+ * check_and_free_timer
+ * bpf_timer_cancel_and_free
+ * to cancel this 35 second sleep and delete the timer for real.
+ */
+ if (bpf_timer_start(&val->timer, 1ull << 35, 0) != 0)
+ err |= 256;
+ ok |= 4;
+ }
+ return 0;
+}
+
+int bpf_timer_test(void)
+{
+ struct hmap_elem *val;
+ int key = HTAB, key_malloc = HTAB_MALLOC;
+
+ val = bpf_map_lookup_elem(&hmap, &key);
+ if (val) {
+ if (bpf_timer_init(&val->timer, &hmap, CLOCK_BOOTTIME) != 0)
+ err |= 512;
+ bpf_timer_set_callback(&val->timer, timer_cb2);
+ bpf_timer_start(&val->timer, 1000, 0);
+ }
+ val = bpf_map_lookup_elem(&hmap_malloc, &key_malloc);
+ if (val) {
+ if (bpf_timer_init(&val->timer, &hmap_malloc, CLOCK_BOOTTIME) != 0)
+ err |= 1024;
+ bpf_timer_set_callback(&val->timer, timer_cb2);
+ bpf_timer_start(&val->timer, 1000, 0);
+ }
+ return 0;
+}
+
+SEC("fentry/bpf_fentry_test2")
+int BPF_PROG(test2, int a, int b)
+{
+ struct hmap_elem init = {}, *val;
+ int key = HTAB, key_malloc = HTAB_MALLOC;
+
+ init.counter = 10; /* number of times to trigger timer_cb2 */
+ bpf_map_update_elem(&hmap, &key, &init, 0);
+ val = bpf_map_lookup_elem(&hmap, &key);
+ if (val)
+ bpf_timer_init(&val->timer, &hmap, CLOCK_BOOTTIME);
+ /* update the same key to free the timer */
+ bpf_map_update_elem(&hmap, &key, &init, 0);
+
+ bpf_map_update_elem(&hmap_malloc, &key_malloc, &init, 0);
+ val = bpf_map_lookup_elem(&hmap_malloc, &key_malloc);
+ if (val)
+ bpf_timer_init(&val->timer, &hmap_malloc, CLOCK_BOOTTIME);
+ /* update the same key to free the timer */
+ bpf_map_update_elem(&hmap_malloc, &key_malloc, &init, 0);
+
+ /* init more timers to check that htab operations
+ * don't leak timer memory.
+ */
+ key = 0;
+ bpf_map_update_elem(&hmap, &key, &init, 0);
+ val = bpf_map_lookup_elem(&hmap, &key);
+ if (val)
+ bpf_timer_init(&val->timer, &hmap, CLOCK_BOOTTIME);
+ bpf_map_delete_elem(&hmap, &key);
+ bpf_map_update_elem(&hmap, &key, &init, 0);
+ val = bpf_map_lookup_elem(&hmap, &key);
+ if (val)
+ bpf_timer_init(&val->timer, &hmap, CLOCK_BOOTTIME);
+
+ /* and with non-prealloc htab */
+ key_malloc = 0;
+ bpf_map_update_elem(&hmap_malloc, &key_malloc, &init, 0);
+ val = bpf_map_lookup_elem(&hmap_malloc, &key_malloc);
+ if (val)
+ bpf_timer_init(&val->timer, &hmap_malloc, CLOCK_BOOTTIME);
+ bpf_map_delete_elem(&hmap_malloc, &key_malloc);
+ bpf_map_update_elem(&hmap_malloc, &key_malloc, &init, 0);
+ val = bpf_map_lookup_elem(&hmap_malloc, &key_malloc);
+ if (val)
+ bpf_timer_init(&val->timer, &hmap_malloc, CLOCK_BOOTTIME);
+
+ return bpf_timer_test();
+}
diff --git a/tools/testing/selftests/bpf/progs/timer_mim.c b/tools/testing/selftests/bpf/progs/timer_mim.c
new file mode 100644
index 000000000000..2fee7ab105ef
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/timer_mim.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <linux/bpf.h>
+#include <time.h>
+#include <errno.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_tcp_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+struct hmap_elem {
+ int pad; /* unused */
+ struct bpf_timer timer;
+};
+
+struct inner_map {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1024);
+ __type(key, int);
+ __type(value, struct hmap_elem);
+} inner_htab SEC(".maps");
+
+#define ARRAY_KEY 1
+#define HASH_KEY 1234
+
+struct outer_arr {
+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+ __uint(max_entries, 2);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+ __array(values, struct inner_map);
+} outer_arr SEC(".maps") = {
+ .values = { [ARRAY_KEY] = &inner_htab },
+};
+
+__u64 err;
+__u64 ok;
+__u64 cnt;
+
+static int timer_cb1(void *map, int *key, struct hmap_elem *val);
+
+static int timer_cb2(void *map, int *key, struct hmap_elem *val)
+{
+ cnt++;
+ bpf_timer_set_callback(&val->timer, timer_cb1);
+ if (bpf_timer_start(&val->timer, 1000, 0))
+ err |= 1;
+ ok |= 1;
+ return 0;
+}
+
+/* callback for inner hash map */
+static int timer_cb1(void *map, int *key, struct hmap_elem *val)
+{
+ cnt++;
+ bpf_timer_set_callback(&val->timer, timer_cb2);
+ if (bpf_timer_start(&val->timer, 1000, 0))
+ err |= 2;
+ /* Do a lookup to make sure 'map' and 'key' pointers are correct */
+ bpf_map_lookup_elem(map, key);
+ ok |= 2;
+ return 0;
+}
+
+SEC("fentry/bpf_fentry_test1")
+int BPF_PROG(test1, int a)
+{
+ struct hmap_elem init = {};
+ struct bpf_map *inner_map;
+ struct hmap_elem *val;
+ int array_key = ARRAY_KEY;
+ int hash_key = HASH_KEY;
+
+ inner_map = bpf_map_lookup_elem(&outer_arr, &array_key);
+ if (!inner_map)
+ return 0;
+
+ bpf_map_update_elem(inner_map, &hash_key, &init, 0);
+ val = bpf_map_lookup_elem(inner_map, &hash_key);
+ if (!val)
+ return 0;
+
+ bpf_timer_init(&val->timer, inner_map, CLOCK_MONOTONIC);
+ if (bpf_timer_set_callback(&val->timer, timer_cb1))
+ err |= 4;
+ if (bpf_timer_start(&val->timer, 0, 0))
+ err |= 8;
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/timer_mim_reject.c b/tools/testing/selftests/bpf/progs/timer_mim_reject.c
new file mode 100644
index 000000000000..5d648e3d8a41
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/timer_mim_reject.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <linux/bpf.h>
+#include <time.h>
+#include <errno.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_tcp_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+struct hmap_elem {
+ int pad; /* unused */
+ struct bpf_timer timer;
+};
+
+struct inner_map {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1024);
+ __type(key, int);
+ __type(value, struct hmap_elem);
+} inner_htab SEC(".maps");
+
+#define ARRAY_KEY 1
+#define ARRAY_KEY2 2
+#define HASH_KEY 1234
+
+struct outer_arr {
+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+ __uint(max_entries, 2);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+ __array(values, struct inner_map);
+} outer_arr SEC(".maps") = {
+ .values = { [ARRAY_KEY] = &inner_htab },
+};
+
+__u64 err;
+__u64 ok;
+__u64 cnt;
+
+/* callback for inner hash map */
+static int timer_cb(void *map, int *key, struct hmap_elem *val)
+{
+ return 0;
+}
+
+SEC("fentry/bpf_fentry_test1")
+int BPF_PROG(test1, int a)
+{
+ struct hmap_elem init = {};
+ struct bpf_map *inner_map, *inner_map2;
+ struct hmap_elem *val;
+ int array_key = ARRAY_KEY;
+ int array_key2 = ARRAY_KEY2;
+ int hash_key = HASH_KEY;
+
+ inner_map = bpf_map_lookup_elem(&outer_arr, &array_key);
+ if (!inner_map)
+ return 0;
+
+ inner_map2 = bpf_map_lookup_elem(&outer_arr, &array_key2);
+ if (!inner_map2)
+ return 0;
+ bpf_map_update_elem(inner_map, &hash_key, &init, 0);
+ val = bpf_map_lookup_elem(inner_map, &hash_key);
+ if (!val)
+ return 0;
+
+ bpf_timer_init(&val->timer, inner_map2, CLOCK_MONOTONIC);
+ if (bpf_timer_set_callback(&val->timer, timer_cb))
+ err |= 4;
+ if (bpf_timer_start(&val->timer, 0, 0))
+ err |= 8;
+ return 0;
+}