diff options
author | Jakub Kicinski | 2022-03-22 10:36:56 -0700 |
---|---|---|
committer | Jakub Kicinski | 2022-03-22 11:18:49 -0700 |
commit | 0db8640df59512dbd423c32077919f10cf35ebc6 (patch) | |
tree | a8e0d6806763ce5c3c84a5a2c29a1cbdb58f5129 /kernel | |
parent | 4a0cb83ba6e0cd73a50fa4f84736846bf0029f2b (diff) | |
parent | 7f0059b58f0257d895fafd2f2e3afe3bbdf21e64 (diff) |
Merge https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Alexei Starovoitov says:
====================
pull-request: bpf-next 2022-03-21 v2
We've added 137 non-merge commits during the last 17 day(s) which contain
a total of 143 files changed, 7123 insertions(+), 1092 deletions(-).
The main changes are:
1) Custom SEC() handling in libbpf, from Andrii.
2) subskeleton support, from Delyan.
3) Use btf_tag to recognize __percpu pointers in the verifier, from Hao.
4) Fix net.core.bpf_jit_harden race, from Hou.
5) Fix bpf_sk_lookup remote_port on big-endian, from Jakub.
6) Introduce fprobe (multi kprobe) _without_ arch bits, from Masami.
The arch specific bits will come later.
7) Introduce multi_kprobe bpf programs on top of fprobe, from Jiri.
8) Enable non-atomic allocations in local storage, from Joanne.
9) Various var_off ptr_to_btf_id fixed, from Kumar.
10) bpf_ima_file_hash helper, from Roberto.
11) Add "live packet" mode for XDP in BPF_PROG_RUN, from Toke.
* https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (137 commits)
selftests/bpf: Fix kprobe_multi test.
Revert "rethook: x86: Add rethook x86 implementation"
Revert "arm64: rethook: Add arm64 rethook implementation"
Revert "powerpc: Add rethook support"
Revert "ARM: rethook: Add rethook arm implementation"
bpftool: Fix a bug in subskeleton code generation
bpf: Fix bpf_prog_pack when PMU_SIZE is not defined
bpf: Fix bpf_prog_pack for multi-node setup
bpf: Fix warning for cast from restricted gfp_t in verifier
bpf, arm: Fix various typos in comments
libbpf: Close fd in bpf_object__reuse_map
bpftool: Fix print error when show bpf map
bpf: Fix kprobe_multi return probe backtrace
Revert "bpf: Add support to inline bpf_get_func_ip helper on x86"
bpf: Simplify check in btf_parse_hdr()
selftests/bpf/test_lirc_mode2.sh: Exit with proper code
bpf: Check for NULL return from bpf_get_btf_vmlinux
selftests/bpf: Test skipping stacktrace
bpf: Adjust BPF stack helper functions to accommodate skip > 0
bpf: Select proper size for bpf_prog_pack
...
====================
Link: https://lore.kernel.org/r/20220322050159.5507-1-alexei.starovoitov@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/Kconfig | 1 | ||||
-rw-r--r-- | kernel/bpf/bpf_inode_storage.c | 9 | ||||
-rw-r--r-- | kernel/bpf/bpf_local_storage.c | 58 | ||||
-rw-r--r-- | kernel/bpf/bpf_lsm.c | 21 | ||||
-rw-r--r-- | kernel/bpf/bpf_task_storage.c | 10 | ||||
-rw-r--r-- | kernel/bpf/btf.c | 166 | ||||
-rw-r--r-- | kernel/bpf/core.c | 89 | ||||
-rw-r--r-- | kernel/bpf/helpers.c | 9 | ||||
-rw-r--r-- | kernel/bpf/preload/Makefile | 5 | ||||
-rw-r--r-- | kernel/bpf/stackmap.c | 56 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 28 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 161 | ||||
-rw-r--r-- | kernel/exit.c | 2 | ||||
-rw-r--r-- | kernel/fork.c | 3 | ||||
-rw-r--r-- | kernel/kallsyms.c | 4 | ||||
-rw-r--r-- | kernel/trace/Kconfig | 26 | ||||
-rw-r--r-- | kernel/trace/Makefile | 2 | ||||
-rw-r--r-- | kernel/trace/bpf_trace.c | 348 | ||||
-rw-r--r-- | kernel/trace/fprobe.c | 332 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 58 | ||||
-rw-r--r-- | kernel/trace/rethook.c | 317 |
21 files changed, 1475 insertions, 230 deletions
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index c3cf0b86eeb2..d56ee177d5f8 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -30,6 +30,7 @@ config BPF_SYSCALL select TASKS_TRACE_RCU select BINARY_PRINTF select NET_SOCK_MSG if NET + select PAGE_POOL if NET default n help Enable the bpf() system call that allows to manipulate BPF programs diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index e29d9e3d853e..96be8d518885 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -136,7 +136,7 @@ static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, sdata = bpf_local_storage_update(f->f_inode, (struct bpf_local_storage_map *)map, - value, map_flags); + value, map_flags, GFP_ATOMIC); fput(f); return PTR_ERR_OR_ZERO(sdata); } @@ -169,8 +169,9 @@ static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) return err; } -BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, - void *, value, u64, flags) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, + void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; @@ -196,7 +197,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { sdata = bpf_local_storage_update( inode, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST); + BPF_NOEXIST, gfp_flags); return IS_ERR(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 092a1ac772d7..01aa2b51ec4d 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -63,7 +63,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, - void *value, bool charge_mem) + void *value, bool charge_mem, gfp_t gfp_flags) { struct bpf_local_storage_elem *selem; @@ -71,7 +71,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, return NULL; selem = bpf_map_kzalloc(&smap->map, smap->elem_size, - GFP_ATOMIC | __GFP_NOWARN); + gfp_flags | __GFP_NOWARN); if (selem) { if (value) memcpy(SDATA(selem)->data, value, smap->map.value_size); @@ -282,7 +282,8 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata, int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *first_selem) + struct bpf_local_storage_elem *first_selem, + gfp_t gfp_flags) { struct bpf_local_storage *prev_storage, *storage; struct bpf_local_storage **owner_storage_ptr; @@ -293,7 +294,7 @@ int bpf_local_storage_alloc(void *owner, return err; storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), - GFP_ATOMIC | __GFP_NOWARN); + gfp_flags | __GFP_NOWARN); if (!storage) { err = -ENOMEM; goto uncharge; @@ -350,10 +351,10 @@ uncharge: */ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, - void *value, u64 map_flags) + void *value, u64 map_flags, gfp_t gfp_flags) { struct bpf_local_storage_data *old_sdata = NULL; - struct bpf_local_storage_elem *selem; + struct bpf_local_storage_elem *selem = NULL; struct bpf_local_storage *local_storage; unsigned long flags; int err; @@ -365,6 +366,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, !map_value_has_spin_lock(&smap->map))) return ERR_PTR(-EINVAL); + if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST) + return ERR_PTR(-EINVAL); + local_storage = rcu_dereference_check(*owner_storage(smap, owner), bpf_rcu_lock_held()); if (!local_storage || hlist_empty(&local_storage->list)) { @@ -373,11 +377,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (err) return ERR_PTR(err); - selem = bpf_selem_alloc(smap, owner, value, true); + selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); if (!selem) return ERR_PTR(-ENOMEM); - err = bpf_local_storage_alloc(owner, smap, selem); + err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags); if (err) { kfree(selem); mem_uncharge(smap, owner, smap->elem_size); @@ -404,6 +408,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } } + if (gfp_flags == GFP_KERNEL) { + selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + if (!selem) + return ERR_PTR(-ENOMEM); + } + raw_spin_lock_irqsave(&local_storage->lock, flags); /* Recheck local_storage->list under local_storage->lock */ @@ -429,19 +439,21 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, goto unlock; } - /* local_storage->lock is held. Hence, we are sure - * we can unlink and uncharge the old_sdata successfully - * later. Hence, instead of charging the new selem now - * and then uncharge the old selem later (which may cause - * a potential but unnecessary charge failure), avoid taking - * a charge at all here (the "!old_sdata" check) and the - * old_sdata will not be uncharged later during - * bpf_selem_unlink_storage_nolock(). - */ - selem = bpf_selem_alloc(smap, owner, value, !old_sdata); - if (!selem) { - err = -ENOMEM; - goto unlock_err; + if (gfp_flags != GFP_KERNEL) { + /* local_storage->lock is held. Hence, we are sure + * we can unlink and uncharge the old_sdata successfully + * later. Hence, instead of charging the new selem now + * and then uncharge the old selem later (which may cause + * a potential but unnecessary charge failure), avoid taking + * a charge at all here (the "!old_sdata" check) and the + * old_sdata will not be uncharged later during + * bpf_selem_unlink_storage_nolock(). + */ + selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags); + if (!selem) { + err = -ENOMEM; + goto unlock_err; + } } /* First, link the new selem to the map */ @@ -463,6 +475,10 @@ unlock: unlock_err: raw_spin_unlock_irqrestore(&local_storage->lock, flags); + if (selem) { + mem_uncharge(smap, owner, smap->elem_size); + kfree(selem); + } return ERR_PTR(err); } diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 9e4ecc990647..064eccba641d 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -99,6 +99,24 @@ static const struct bpf_func_proto bpf_ima_inode_hash_proto = { .allowed = bpf_ima_inode_hash_allowed, }; +BPF_CALL_3(bpf_ima_file_hash, struct file *, file, void *, dst, u32, size) +{ + return ima_file_hash(file, dst, size); +} + +BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file) + +static const struct bpf_func_proto bpf_ima_file_hash_proto = { + .func = bpf_ima_file_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_ima_file_hash_btf_ids[0], + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, + .allowed = bpf_ima_inode_hash_allowed, +}; + static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -121,6 +139,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_bprm_opts_set_proto; case BPF_FUNC_ima_inode_hash: return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL; + case BPF_FUNC_ima_file_hash: + return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; default: return tracing_prog_func_proto(func_id, prog); } @@ -167,6 +187,7 @@ BTF_ID(func, bpf_lsm_inode_setxattr) BTF_ID(func, bpf_lsm_inode_symlink) BTF_ID(func, bpf_lsm_inode_unlink) BTF_ID(func, bpf_lsm_kernel_module_request) +BTF_ID(func, bpf_lsm_kernel_read_file) BTF_ID(func, bpf_lsm_kernfs_init_security) #ifdef CONFIG_KEYS diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 5da7bed0f5f6..6638a0ecc3d2 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -174,7 +174,8 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, bpf_task_storage_lock(); sdata = bpf_local_storage_update( - task, (struct bpf_local_storage_map *)map, value, map_flags); + task, (struct bpf_local_storage_map *)map, value, map_flags, + GFP_ATOMIC); bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); @@ -226,8 +227,9 @@ out: return err; } -BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, - task, void *, value, u64, flags) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; @@ -250,7 +252,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST); + BPF_NOEXIST, gfp_flags); unlock: bpf_task_storage_unlock(); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index b472cf0c8fdb..24788ce564a0 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -525,6 +525,50 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) return -ENOENT; } +static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p) +{ + struct btf *btf; + s32 ret; + int id; + + btf = bpf_get_btf_vmlinux(); + if (IS_ERR(btf)) + return PTR_ERR(btf); + if (!btf) + return -EINVAL; + + ret = btf_find_by_name_kind(btf, name, kind); + /* ret is never zero, since btf_find_by_name_kind returns + * positive btf_id or negative error. + */ + if (ret > 0) { + btf_get(btf); + *btf_p = btf; + return ret; + } + + /* If name is not found in vmlinux's BTF then search in module's BTFs */ + spin_lock_bh(&btf_idr_lock); + idr_for_each_entry(&btf_idr, btf, id) { + if (!btf_is_module(btf)) + continue; + /* linear search could be slow hence unlock/lock + * the IDR to avoiding holding it for too long + */ + btf_get(btf); + spin_unlock_bh(&btf_idr_lock); + ret = btf_find_by_name_kind(btf, name, kind); + if (ret > 0) { + *btf_p = btf; + return ret; + } + spin_lock_bh(&btf_idr_lock); + btf_put(btf); + } + spin_unlock_bh(&btf_idr_lock); + return ret; +} + const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, u32 id, u32 *res_id) { @@ -4438,8 +4482,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env) btf = env->btf; btf_data_size = btf->data_size; - if (btf_data_size < - offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) { + if (btf_data_size < offsetofend(struct btf_header, hdr_len)) { btf_verifier_log(env, "hdr_len not found"); return -EINVAL; } @@ -5057,6 +5100,8 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, tag_value = __btf_name_by_offset(btf, t->name_off); if (strcmp(tag_value, "user") == 0) info->reg_type |= MEM_USER; + if (strcmp(tag_value, "percpu") == 0) + info->reg_type |= MEM_PERCPU; } /* skip modifiers */ @@ -5285,12 +5330,16 @@ error: return -EACCES; } - /* check __user tag */ + /* check type tag */ t = btf_type_by_id(btf, mtype->type); if (btf_type_is_type_tag(t)) { tag_value = __btf_name_by_offset(btf, t->name_off); + /* check __user tag */ if (strcmp(tag_value, "user") == 0) tmp_flag = MEM_USER; + /* check __percpu tag */ + if (strcmp(tag_value, "percpu") == 0) + tmp_flag = MEM_PERCPU; } stype = btf_type_skip_modifiers(btf, mtype->type, &id); @@ -5726,7 +5775,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; const struct btf_param *args; - int ref_regno = 0; + int ref_regno = 0, ret; bool rel = false; t = btf_type_by_id(btf, func_id); @@ -5753,6 +5802,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } + /* Only kfunc can be release func */ + if (is_kfunc) + rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_RELEASE, func_id); /* check that BTF function arguments match actual types that the * verifier sees. */ @@ -5776,6 +5829,11 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); + + ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE, rel); + if (ret < 0) + return ret; + if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) { /* If function expects ctx type in BTF check that caller @@ -5787,8 +5845,6 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, i, btf_type_str(t)); return -EINVAL; } - if (check_ptr_off_reg(env, reg, regno)) - return -EINVAL; } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || (reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) { const struct btf_type *reg_ref_t; @@ -5806,7 +5862,11 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (reg->type == PTR_TO_BTF_ID) { reg_btf = reg->btf; reg_ref_id = reg->btf_id; - /* Ensure only one argument is referenced PTR_TO_BTF_ID */ + /* Ensure only one argument is referenced + * PTR_TO_BTF_ID, check_func_arg_reg_off relies + * on only one referenced register being allowed + * for kfuncs. + */ if (reg->ref_obj_id) { if (ref_obj_id) { bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", @@ -5888,18 +5948,15 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, /* Either both are set, or neither */ WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno)); - if (is_kfunc) { - rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), - BTF_KFUNC_TYPE_RELEASE, func_id); - /* We already made sure ref_obj_id is set only for one argument */ - if (rel && !ref_obj_id) { - bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", - func_name); - return -EINVAL; - } - /* Allow (!rel && ref_obj_id), so that passing such referenced PTR_TO_BTF_ID to - * other kfuncs works - */ + /* We already made sure ref_obj_id is set only for one argument. We do + * allow (!rel && ref_obj_id), so that passing such referenced + * PTR_TO_BTF_ID to other kfuncs works. Note that rel is only true when + * is_kfunc is true. + */ + if (rel && !ref_obj_id) { + bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", + func_name); + return -EINVAL; } /* returns argument register number > 0 in case of reference release kfunc */ return rel ? ref_regno : 0; @@ -6516,20 +6573,23 @@ struct module *btf_try_get_module(const struct btf *btf) return res; } -/* Returns struct btf corresponding to the struct module - * - * This function can return NULL or ERR_PTR. Note that caller must - * release reference for struct btf iff btf_is_module is true. +/* Returns struct btf corresponding to the struct module. + * This function can return NULL or ERR_PTR. */ static struct btf *btf_get_module_btf(const struct module *module) { - struct btf *btf = NULL; #ifdef CONFIG_DEBUG_INFO_BTF_MODULES struct btf_module *btf_mod, *tmp; #endif + struct btf *btf = NULL; + + if (!module) { + btf = bpf_get_btf_vmlinux(); + if (!IS_ERR_OR_NULL(btf)) + btf_get(btf); + return btf; + } - if (!module) - return bpf_get_btf_vmlinux(); #ifdef CONFIG_DEBUG_INFO_BTF_MODULES mutex_lock(&btf_module_mutex); list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { @@ -6548,7 +6608,8 @@ static struct btf *btf_get_module_btf(const struct module *module) BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags) { - struct btf *btf; + struct btf *btf = NULL; + int btf_obj_fd = 0; long ret; if (flags) @@ -6557,44 +6618,17 @@ BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int if (name_sz <= 1 || name[name_sz - 1]) return -EINVAL; - btf = bpf_get_btf_vmlinux(); - if (IS_ERR(btf)) - return PTR_ERR(btf); - - ret = btf_find_by_name_kind(btf, name, kind); - /* ret is never zero, since btf_find_by_name_kind returns - * positive btf_id or negative error. - */ - if (ret < 0) { - struct btf *mod_btf; - int id; - - /* If name is not found in vmlinux's BTF then search in module's BTFs */ - spin_lock_bh(&btf_idr_lock); - idr_for_each_entry(&btf_idr, mod_btf, id) { - if (!btf_is_module(mod_btf)) - continue; - /* linear search could be slow hence unlock/lock - * the IDR to avoiding holding it for too long - */ - btf_get(mod_btf); - spin_unlock_bh(&btf_idr_lock); - ret = btf_find_by_name_kind(mod_btf, name, kind); - if (ret > 0) { - int btf_obj_fd; - - btf_obj_fd = __btf_new_fd(mod_btf); - if (btf_obj_fd < 0) { - btf_put(mod_btf); - return btf_obj_fd; - } - return ret | (((u64)btf_obj_fd) << 32); - } - spin_lock_bh(&btf_idr_lock); - btf_put(mod_btf); + ret = bpf_find_btf_id(name, kind, &btf); + if (ret > 0 && btf_is_module(btf)) { + btf_obj_fd = __btf_new_fd(btf); + if (btf_obj_fd < 0) { + btf_put(btf); + return btf_obj_fd; } - spin_unlock_bh(&btf_idr_lock); + return ret | (((u64)btf_obj_fd) << 32); } + if (ret > 0) + btf_put(btf); return ret; } @@ -6793,9 +6827,7 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, hook = bpf_prog_type_to_kfunc_hook(prog_type); ret = btf_populate_kfunc_set(btf, hook, kset); - /* reference is only taken for module BTF */ - if (btf_is_module(btf)) - btf_put(btf); + btf_put(btf); return ret; } EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); @@ -7149,6 +7181,8 @@ bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id) main_btf = bpf_get_btf_vmlinux(); if (IS_ERR(main_btf)) return ERR_CAST(main_btf); + if (!main_btf) + return ERR_PTR(-EINVAL); local_type = btf_type_by_id(local_btf, local_type_id); if (!local_type) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ab630f773ec1..13e9dbeeedf3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -33,6 +33,7 @@ #include <linux/extable.h> #include <linux/log2.h> #include <linux/bpf_verifier.h> +#include <linux/nodemask.h> #include <asm/barrier.h> #include <asm/unaligned.h> @@ -105,6 +106,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->aux = aux; fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); + fp->blinding_requested = bpf_jit_blinding_enabled(fp); INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); mutex_init(&fp->aux->used_maps_mutex); @@ -814,15 +816,9 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86) * to host BPF programs. */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define BPF_PROG_PACK_SIZE HPAGE_PMD_SIZE -#else -#define BPF_PROG_PACK_SIZE PAGE_SIZE -#endif #define BPF_PROG_CHUNK_SHIFT 6 #define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT) #define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1)) -#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE) struct bpf_prog_pack { struct list_head list; @@ -830,30 +826,72 @@ struct bpf_prog_pack { unsigned long bitmap[]; }; -#define BPF_PROG_MAX_PACK_PROG_SIZE BPF_PROG_PACK_SIZE #define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) +static size_t bpf_prog_pack_size = -1; +static size_t bpf_prog_pack_mask = -1; + +static int bpf_prog_chunk_count(void) +{ + WARN_ON_ONCE(bpf_prog_pack_size == -1); + return bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE; +} + static DEFINE_MUTEX(pack_mutex); static LIST_HEAD(pack_list); +/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with + * CONFIG_MMU=n. Use PAGE_SIZE in these cases. + */ +#ifdef PMD_SIZE +#define BPF_HPAGE_SIZE PMD_SIZE +#define BPF_HPAGE_MASK PMD_MASK +#else +#define BPF_HPAGE_SIZE PAGE_SIZE +#define BPF_HPAGE_MASK PAGE_MASK +#endif + +static size_t select_bpf_prog_pack_size(void) +{ + size_t size; + void *ptr; + + size = BPF_HPAGE_SIZE * num_online_nodes(); + ptr = module_alloc(size); + + /* Test whether we can get huge pages. If not just use PAGE_SIZE + * packs. + */ + if (!ptr || !is_vm_area_hugepages(ptr)) { + size = PAGE_SIZE; + bpf_prog_pack_mask = PAGE_MASK; + } else { + bpf_prog_pack_mask = BPF_HPAGE_MASK; + } + + vfree(ptr); + return size; +} + static struct bpf_prog_pack *alloc_new_pack(void) { struct bpf_prog_pack *pack; - pack = kzalloc(sizeof(*pack) + BITS_TO_BYTES(BPF_PROG_CHUNK_COUNT), GFP_KERNEL); + pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(bpf_prog_chunk_count())), + GFP_KERNEL); if (!pack) return NULL; - pack->ptr = module_alloc(BPF_PROG_PACK_SIZE); + pack->ptr = module_alloc(bpf_prog_pack_size); if (!pack->ptr) { kfree(pack); return NULL; } - bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE); + bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE); list_add_tail(&pack->list, &pack_list); set_vm_flush_reset_perms(pack->ptr); - set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); - set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); + set_memory_ro((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE); + set_memory_x((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE); return pack; } @@ -864,7 +902,11 @@ static void *bpf_prog_pack_alloc(u32 size) unsigned long pos; void *ptr = NULL; - if (size > BPF_PROG_MAX_PACK_PROG_SIZE) { + mutex_lock(&pack_mutex); + if (bpf_prog_pack_size == -1) + bpf_prog_pack_size = select_bpf_prog_pack_size(); + + if (size > bpf_prog_pack_size) { size = round_up(size, PAGE_SIZE); ptr = module_alloc(size); if (ptr) { @@ -872,13 +914,12 @@ static void *bpf_prog_pack_alloc(u32 size) set_memory_ro((unsigned long)ptr, size / PAGE_SIZE); set_memory_x((unsigned long)ptr, size / PAGE_SIZE); } - return ptr; + goto out; } - mutex_lock(&pack_mutex); list_for_each_entry(pack, &pack_list, list) { - pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, + pos = bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0, nbits, 0); - if (pos < BPF_PROG_CHUNK_COUNT) + if (pos < bpf_prog_chunk_count()) goto found_free_area; } @@ -904,13 +945,13 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr) unsigned long pos; void *pack_ptr; - if (hdr->size > BPF_PROG_MAX_PACK_PROG_SIZE) { + mutex_lock(&pack_mutex); + if (hdr->size > bpf_prog_pack_size) { module_memfree(hdr); - return; + goto out; } - pack_ptr = (void *)((unsigned long)hdr & ~(BPF_PROG_PACK_SIZE - 1)); - mutex_lock(&pack_mutex); + pack_ptr = (void *)((unsigned long)hdr & bpf_prog_pack_mask); list_for_each_entry(tmp, &pack_list, list) { if (tmp->ptr == pack_ptr) { @@ -926,8 +967,8 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr) pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT; bitmap_clear(pack->bitmap, pos, nbits); - if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, - BPF_PROG_CHUNK_COUNT, 0) == 0) { + if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0, + bpf_prog_chunk_count(), 0) == 0) { list_del(&pack->list); module_memfree(pack->ptr); kfree(pack); @@ -1382,7 +1423,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) struct bpf_insn *insn; int i, rewritten; - if (!bpf_jit_blinding_enabled(prog) || prog->blinded) + if (!prog->blinding_requested || prog->blinded) return prog; clone = bpf_prog_clone_create(prog, GFP_USER); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ae64110a98b5..315053ef6a75 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -225,13 +225,8 @@ BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) if (unlikely(!task)) goto err_clear; - strncpy(buf, task->comm, size); - - /* Verifier guarantees that size > 0. For task->comm exceeding - * size, guarantee that buf is %NUL-terminated. Unconditionally - * done here to save the size test. - */ - buf[size - 1] = 0; + /* Verifier guarantees that size > 0 */ + strscpy(buf, task->comm, size); return 0; err_clear: memset(buf, 0, size); diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile index 167534e3b0b4..20f89cc0a0a6 100644 --- a/kernel/bpf/preload/Makefile +++ b/kernel/bpf/preload/Makefile @@ -1,8 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 -LIBBPF_SRCS = $(srctree)/tools/lib/bpf/ -LIBBPF_INCLUDE = $(LIBBPF_SRCS)/.. +LIBBPF_INCLUDE = $(srctree)/tools/lib obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o -CFLAGS_bpf_preload_kern.o += -I $(LIBBPF_INCLUDE) +CFLAGS_bpf_preload_kern.o += -I$(LIBBPF_INCLUDE) bpf_preload-objs += bpf_preload_kern.o diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 38bdfcd06f55..34725bfa1e97 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -176,7 +176,7 @@ build_id_valid: } static struct perf_callchain_entry * -get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) +get_callchain_entry_for_task(struct task_struct *task, u32 max_depth) { #ifdef CONFIG_STACKTRACE struct perf_callchain_entry *entry; @@ -187,9 +187,8 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) if (!entry) return NULL; - entry->nr = init_nr + - stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr), - sysctl_perf_event_max_stack - init_nr, 0); + entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip, + max_depth, 0); /* stack_trace_save_tsk() works on unsigned long array, while * perf_callchain_entry uses u64 array. For 32-bit systems, it is @@ -201,7 +200,7 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) int i; /* copy data from the end to avoid using extra buffer */ - for (i = entry->nr - 1; i >= (int)init_nr; i--) + for (i = entry->nr - 1; i >= 0; i--) to[i] = (u64)(from[i]); } @@ -218,27 +217,19 @@ static long __bpf_get_stackid(struct bpf_map *map, { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *new_bucket, *old_bucket; - u32 max_depth = map->value_size / stack_map_data_size(map); - /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ - u32 init_nr = sysctl_perf_event_max_stack - max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len; bool user = flags & BPF_F_USER_STACK; u64 *ips; bool hash_matches; - /* get_perf_callchain() guarantees that trace->nr >= init_nr - * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth - */ - trace_nr = trace->nr - init_nr; - - if (trace_nr <= skip) + if (trace->nr <= skip) /* skipping more than usable stack trace */ return -EFAULT; - trace_nr -= skip; + trace_nr = trace->nr - skip; trace_len = trace_nr * sizeof(u64); - ips = trace->ip + skip + init_nr; + ips = trace->ip + skip; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); id = hash & (smap->n_buckets - 1); bucket = READ_ONCE(smap->buckets[id]); @@ -295,8 +286,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { u32 max_depth = map->value_size / stack_map_data_size(map); - /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ - u32 init_nr = sysctl_perf_event_max_stack - max_depth; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; bool kernel = !user; @@ -305,8 +295,12 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); + max_depth += skip; + if (max_depth > sysctl_perf_event_max_stack) + max_depth = sysctl_perf_event_max_stack; + + trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + false, false); if (unlikely(!trace)) /* couldn't fetch the stack trace */ @@ -397,7 +391,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, struct perf_callchain_entry *trace_in, void *buf, u32 size, u64 flags) { - u32 init_nr, trace_nr, copy_len, elem_size, num_elem; + u32 trace_nr, copy_len, elem_size, num_elem, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; @@ -422,30 +416,28 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, goto err_fault; num_elem = size / elem_size; - if (sysctl_perf_event_max_stack < num_elem) - init_nr = 0; - else - init_nr = sysctl_perf_event_max_stack - num_elem; + max_depth = num_elem + skip; + if (sysctl_perf_event_max_stack < max_depth) + max_depth = sysctl_perf_event_max_stack; if (trace_in) trace = trace_in; else if (kernel && task) - trace = get_callchain_entry_for_task(task, init_nr); + trace = get_callchain_entry_for_task(task, max_depth); else - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, + trace = get_perf_callchain(regs, 0, kernel, user, max_depth, false, false); if (unlikely(!trace)) goto err_fault; - trace_nr = trace->nr - init_nr; - if (trace_nr < skip) + if (trace->nr < skip) goto err_fault; - trace_nr -= skip; + trace_nr = trace->nr - skip; trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; copy_len = trace_nr * elem_size; - ips = trace->ip + skip + init_nr; + + ips = trace->ip + skip; if (user && user_build_id) stack_map_get_build_id_offset(buf, ips, trace_nr, user); else diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index db402ebc5570..cdaa1152436a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -32,6 +32,7 @@ #include <linux/bpf-netns.h> #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> +#include <linux/trace_events.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -3022,6 +3023,11 @@ out_put_file: fput(perf_file); return err; } +#else +static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_PERF_EVENTS */ #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd @@ -3336,7 +3342,7 @@ static int bpf_prog_query(const union bpf_attr *attr, } } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu +#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -4255,7 +4261,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr, return -EINVAL; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len +#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies static int link_create(union bpf_attr *attr, bpfptr_t uattr) { enum bpf_prog_type ptype; @@ -4279,7 +4285,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = tracing_bpf_link_attach(attr, uattr, prog); goto out; case BPF_PROG_TYPE_PERF_EVENT: - case BPF_PROG_TYPE_KPROBE: case BPF_PROG_TYPE_TRACEPOINT: if (attr->link_create.attach_type != BPF_PERF_EVENT) { ret = -EINVAL; @@ -4287,6 +4292,14 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) } ptype = prog->type; break; + case BPF_PROG_TYPE_KPROBE: + if (attr->link_create.attach_type != BPF_PERF_EVENT && + attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) { + ret = -EINVAL; + goto out; + } + ptype = prog->type; + break; default: ptype = attach_type_to_prog_type(attr->link_create.attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { @@ -4318,13 +4331,16 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_xdp_link_attach(attr, prog); break; #endif -#ifdef CONFIG_PERF_EVENTS case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: - case BPF_PROG_TYPE_KPROBE: ret = bpf_perf_link_attach(attr, prog); break; -#endif + case BPF_PROG_TYPE_KPROBE: + if (attr->link_create.attach_type == BPF_PERF_EVENT) + ret = bpf_perf_link_attach(attr, prog); + else + ret = bpf_kprobe_multi_link_attach(attr, prog); + break; default: ret = -EINVAL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a57db4b2803c..d175b70067b3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -554,7 +554,6 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_TP_BUFFER] = "tp_buffer", [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", - [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", [PTR_TO_MEM] = "mem", [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", @@ -562,8 +561,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, }; if (type & PTR_MAYBE_NULL) { - if (base_type(type) == PTR_TO_BTF_ID || - base_type(type) == PTR_TO_PERCPU_BTF_ID) + if (base_type(type) == PTR_TO_BTF_ID) strncpy(postfix, "or_null_", 16); else strncpy(postfix, "_or_null", 16); @@ -575,6 +573,8 @@ static const char *reg_type_str(struct bpf_verifier_env *env, strncpy(prefix, "alloc_", 32); if (type & MEM_USER) strncpy(prefix, "user_", 32); + if (type & MEM_PERCPU) + strncpy(prefix, "percpu_", 32); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); @@ -697,8 +697,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, const char *sep = ""; verbose(env, "%s", reg_type_str(env, t)); - if (base_type(t) == PTR_TO_BTF_ID || - base_type(t) == PTR_TO_PERCPU_BTF_ID) + if (base_type(t) == PTR_TO_BTF_ID) verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id)); verbose(env, "("); /* @@ -2783,7 +2782,6 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: case PTR_TO_BUF: - case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: case PTR_TO_FUNC: case PTR_TO_MAP_KEY: @@ -3990,6 +3988,12 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env, * is only allowed in its original, unmodified form. */ + if (reg->off < 0) { + verbose(env, "negative offset %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); + return -EACCES; + } + if (!fixed_off_ok && reg->off) { verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", reg_type_str(env, reg->type), regno, reg->off); @@ -4058,9 +4062,9 @@ static int check_buffer_access(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, int off, int size, bool zero_size_allowed, - const char *buf_info, u32 *max_access) { + const char *buf_info = type_is_rdonly_mem(reg->type) ? "rdonly" : "rdwr"; int err; err = __check_buffer_access(env, buf_info, reg, regno, off, size); @@ -4197,6 +4201,13 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } + if (reg->type & MEM_PERCPU) { + verbose(env, + "R%d is ptr_%s access percpu memory: off=%d\n", + regno, tname, off); + return -EACCES; + } + if (env->ops->btf_struct_access) { ret = env->ops->btf_struct_access(&env->log, reg->btf, t, off, size, atype, &btf_id, &flag); @@ -4556,7 +4567,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_tp_buffer_access(env, reg, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_BTF_ID) { + } else if (base_type(reg->type) == PTR_TO_BTF_ID && + !type_may_be_null(reg->type)) { err = check_ptr_to_btf_access(env, regs, regno, off, size, t, value_regno); } else if (reg->type == CONST_PTR_TO_MAP) { @@ -4564,7 +4576,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn value_regno); } else if (base_type(reg->type) == PTR_TO_BUF) { bool rdonly_mem = type_is_rdonly_mem(reg->type); - const char *buf_info; u32 *max_access; if (rdonly_mem) { @@ -4573,15 +4584,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn regno, reg_type_str(env, reg->type)); return -EACCES; } - buf_info = "rdonly"; max_access = &env->prog->aux->max_rdonly_access; } else { - buf_info = "rdwr"; max_access = &env->prog->aux->max_rdwr_access; } err = check_buffer_access(env, reg, regno, off, size, false, - buf_info, max_access); + max_access); if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) mark_reg_unknown(env, regs, value_regno); @@ -4802,7 +4811,7 @@ static int check_stack_range_initialized( } if (is_spilled_reg(&state->stack[spi]) && - state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID) + base_type(state->stack[spi].spilled_ptr.type) == PTR_TO_BTF_ID) goto mark; if (is_spilled_reg(&state->stack[spi]) && @@ -4844,7 +4853,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - const char *buf_info; u32 *max_access; switch (base_type(reg->type)) { @@ -4871,15 +4879,13 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, if (meta && meta->raw_mode) return -EACCES; - buf_info = "rdonly"; max_access = &env->prog->aux->max_rdonly_access; } else { - buf_info = "rdwr"; max_access = &env->prog->aux->max_rdwr_access; } return check_buffer_access(env, reg, regno, reg->off, access_size, zero_size_allowed, - buf_info, max_access); + max_access); case PTR_TO_STACK: return check_stack_range_initialized( env, @@ -5258,7 +5264,7 @@ static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | ME static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; -static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; +static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU } }; static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; @@ -5359,6 +5365,60 @@ found: return 0; } +int check_func_arg_reg_off(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + enum bpf_arg_type arg_type, + bool is_release_func) +{ + bool fixed_off_ok = false, release_reg; + enum bpf_reg_type type = reg->type; + + switch ((u32)type) { + case SCALAR_VALUE: + /* Pointer types where reg offset is explicitly allowed: */ + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_MAP_KEY: + case PTR_TO_MAP_VALUE: + case PTR_TO_MEM: + case PTR_TO_MEM | MEM_RDONLY: + case PTR_TO_MEM | MEM_ALLOC: + case PTR_TO_BUF: + case PTR_TO_BUF | MEM_RDONLY: + case PTR_TO_STACK: + /* Some of the argument types nevertheless require a + * zero register offset. + */ + if (arg_type != ARG_PTR_TO_ALLOC_MEM) + return 0; + break; + /* All the rest must be rejected, except PTR_TO_BTF_ID which allows + * fixed offset. + */ + case PTR_TO_BTF_ID: + /* When referenced PTR_TO_BTF_ID is passed to release function, + * it's fixed offset must be 0. We rely on the property that + * only one referenced register can be passed to BPF helpers and + * kfuncs. In the other cases, fixed offset can be non-zero. + */ + release_reg = is_release_func && reg->ref_obj_id; + if (release_reg && reg->off) { + verbose(env, "R%d must have zero offset when passed to release func\n", + regno); + return -EINVAL; + } + /* For release_reg == true, fixed_off_ok must be false, but we + * already checked and rejected reg->off != 0 above, so set to + * true to allow fixed offset for all other cases. + */ + fixed_off_ok = true; + break; + default: + break; + } + return __check_ptr_off_reg(env, reg, regno, fixed_off_ok); +} + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn) @@ -5408,36 +5468,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, if (err) return err; - switch ((u32)type) { - case SCALAR_VALUE: - /* Pointer types where reg offset is explicitly allowed: */ - case PTR_TO_PACKET: - case PTR_TO_PACKET_META: - case PTR_TO_MAP_KEY: - case PTR_TO_MAP_VALUE: - case PTR_TO_MEM: - case PTR_TO_MEM | MEM_RDONLY: - case PTR_TO_MEM | MEM_ALLOC: - case PTR_TO_BUF: - case PTR_TO_BUF | MEM_RDONLY: - case PTR_TO_STACK: - /* Some of the argument types nevertheless require a - * zero register offset. - */ - if (arg_type == ARG_PTR_TO_ALLOC_MEM) - goto force_off_check; - break; - /* All the rest must be rejected: */ - default: -force_off_check: - err = __check_ptr_off_reg(env, reg, regno, - type == PTR_TO_BTF_ID); - if (err < 0) - return err; - break; - } + err = check_func_arg_reg_off(env, reg, regno, arg_type, is_release_function(meta->func_id)); + if (err) + return err; skip_type_check: + /* check_func_arg_reg_off relies on only one referenced register being + * allowed for BPF helpers. + */ if (reg->ref_obj_id) { if (meta->ref_obj_id) { verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", @@ -9638,7 +9676,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg->mem_size = aux->btf_var.mem_size; break; case PTR_TO_BTF_ID: - case PTR_TO_PERCPU_BTF_ID: dst_reg->btf = aux->btf_var.btf; dst_reg->btf_id = aux->btf_var.btf_id; break; @@ -10363,8 +10400,7 @@ static void adjust_btf_func(struct bpf_verifier_env *env) aux->func_info[i].insn_off = env->subprog_info[i].start; } -#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \ - sizeof(((struct bpf_line_info *)(0))->line_col)) +#define MIN_BPF_LINEINFO_SIZE offsetofend(struct bpf_line_info, line_col) #define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE static int check_btf_line(struct bpf_verifier_env *env, @@ -11838,7 +11874,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, type = t->type; t = btf_type_skip_modifiers(btf, type, NULL); if (percpu) { - aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; + aux->btf_var.reg_type = PTR_TO_BTF_ID | MEM_PERCPU; aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } else if (!btf_type_is_struct(t)) { @@ -12987,6 +13023,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; + func[i]->blinding_requested = prog->blinding_requested; func[i]->aux->kfunc_tab = prog->aux->kfunc_tab; func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab; func[i]->aux->linfo = prog->aux->linfo; @@ -13110,6 +13147,7 @@ out_free: out_undo_insn: /* cleanup main prog to be interpreted */ prog->jit_requested = 0; + prog->blinding_requested = 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { if (!bpf_pseudo_call(insn)) continue; @@ -13203,7 +13241,6 @@ static int do_misc_fixups(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; enum bpf_attach_type eatype = prog->expected_attach_type; - bool expect_blinding = bpf_jit_blinding_enabled(prog); enum bpf_prog_type prog_type = resolve_prog_type(prog); struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; @@ -13367,7 +13404,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (env->bpf_capable && !expect_blinding && + if (env->bpf_capable && !prog->blinding_requested && prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && @@ -13455,6 +13492,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto patch_call_imm; } + if (insn->imm == BPF_FUNC_task_storage_get || + insn->imm == BPF_FUNC_sk_storage_get || + insn->imm == BPF_FUNC_inode_storage_get) { + if (env->prog->aux->sleepable) + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); + else + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC); + insn_buf[1] = *insn; + cnt = 2; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto patch_call_imm; + } + /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup * and other inlining handlers are currently limited to 64 bit * only. diff --git a/kernel/exit.c b/kernel/exit.c index b00a25bb4ab9..2d1803fa8fe6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -64,6 +64,7 @@ #include <linux/compat.h> #include <linux/io_uring.h> #include <linux/kprobes.h> +#include <linux/rethook.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -169,6 +170,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); kprobe_flush_task(tsk); + rethook_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index f1e89007f228..d94f002d00cc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2255,6 +2255,9 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_KRETPROBES p->kretprobe_instances.first = NULL; #endif +#ifdef CONFIG_RETHOOK + p->rethooks.first = NULL; +#endif /* * Ensure that the cgroup subsystem policies allow the new process to be diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 951c93216fc4..79f2eb617a62 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -212,6 +212,10 @@ unsigned long kallsyms_lookup_name(const char *name) unsigned long i; unsigned int off; + /* Skip the search for empty string. */ + if (!*name) + return 0; + for (i = 0, off = 0; i < kallsyms_num_syms; i++) { off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a5eb5e7fd624..99dd4ca63d68 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -10,6 +10,17 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool +config HAVE_RETHOOK + bool + +config RETHOOK + bool + depends on HAVE_RETHOOK + help + Enable generic return hooking feature. This is an internal + API, which will be used by other function-entry hooking + features like fprobe and kprobes. + config HAVE_FUNCTION_TRACER bool help @@ -236,6 +247,21 @@ config DYNAMIC_FTRACE_WITH_ARGS depends on DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS +config FPROBE + bool "Kernel Function Probe (fprobe)" + depends on FUNCTION_TRACER + depends on DYNAMIC_FTRACE_WITH_REGS + depends on HAVE_RETHOOK + select RETHOOK + default n + help + This option enables kernel function probe (fprobe) based on ftrace. + The fprobe is similar to kprobes, but probes only for kernel function + entries and exits. This also can probe multiple functions by one + fprobe. + + If unsure, say N. + config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index bedc5caceec7..c6f11a139eac 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -97,6 +97,8 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o +obj-$(CONFIG_FPROBE) += fprobe.o +obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a2024ba32a20..172ef545730d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -17,6 +17,9 @@ #include <linux/error-injection.h> #include <linux/btf_ids.h> #include <linux/bpf_lsm.h> +#include <linux/fprobe.h> +#include <linux/bsearch.h> +#include <linux/sort.h> #include <net/bpf_sk_storage.h> @@ -77,6 +80,8 @@ u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags, const struct btf **btf, s32 *btf_id); +static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx); +static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx); /** * trace_call_bpf - invoke BPF program @@ -1036,6 +1041,30 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_func_ip_kprobe_multi, struct pt_regs *, regs) +{ + return bpf_kprobe_multi_entry_ip(current->bpf_ctx); +} + +static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = { + .func = bpf_get_func_ip_kprobe_multi, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_attach_cookie_kprobe_multi, struct pt_regs *, regs) +{ + return bpf_kprobe_multi_cookie(current->bpf_ctx); +} + +static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = { + .func = bpf_get_attach_cookie_kprobe_multi, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx) { struct bpf_trace_run_ctx *run_ctx; @@ -1279,9 +1308,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_override_return_proto; #endif case BPF_FUNC_get_func_ip: - return &bpf_get_func_ip_proto_kprobe; + return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ? + &bpf_get_func_ip_proto_kprobe_multi : + &bpf_get_func_ip_proto_kprobe; case BPF_FUNC_get_attach_cookie: - return &bpf_get_attach_cookie_proto_trace; + return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ? + &bpf_get_attach_cookie_proto_kmulti : + &bpf_get_attach_cookie_proto_trace; default: return bpf_tracing_func_proto(func_id, prog); } @@ -2181,3 +2214,314 @@ static int __init bpf_event_init(void) fs_initcall(bpf_event_init); #endif /* CONFIG_MODULES */ + +#ifdef CONFIG_FPROBE +struct bpf_kprobe_multi_link { + struct bpf_link link; + struct fprobe fp; + unsigned long *addrs; + u64 *cookies; + u32 cnt; +}; + +struct bpf_kprobe_multi_run_ctx { + struct bpf_run_ctx run_ctx; + struct bpf_kprobe_multi_link *link; + unsigned long entry_ip; +}; + +static void bpf_kprobe_multi_link_release(struct bpf_link *link) +{ + struct bpf_kprobe_multi_link *kmulti_link; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + unregister_fprobe(&kmulti_link->fp); +} + +static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link) +{ + struct bpf_kprobe_multi_link *kmulti_link; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + kvfree(kmulti_link->addrs); + kvfree(kmulti_link->cookies); + kfree(kmulti_link); +} + +static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { + .release = bpf_kprobe_multi_link_release, + .dealloc = bpf_kprobe_multi_link_dealloc, +}; + +static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv) +{ + const struct bpf_kprobe_multi_link *link = priv; + unsigned long *addr_a = a, *addr_b = b; + u64 *cookie_a, *cookie_b; + unsigned long tmp1; + u64 tmp2; + + cookie_a = link->cookies + (addr_a - link->addrs); + cookie_b = link->cookies + (addr_b - link->addrs); + + /* swap addr_a/addr_b and cookie_a/cookie_b values */ + tmp1 = *addr_a; *addr_a = *addr_b; *addr_b = tmp1; + tmp2 = *cookie_a; *cookie_a = *cookie_b; *cookie_b = tmp2; +} + +static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b) +{ + const unsigned long *addr_a = a, *addr_b = b; + + if (*addr_a == *addr_b) + return 0; + return *addr_a < *addr_b ? -1 : 1; +} + +static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv) +{ + return __bpf_kprobe_multi_cookie_cmp(a, b); +} + +static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) +{ + struct bpf_kprobe_multi_run_ctx *run_ctx; + struct bpf_kprobe_multi_link *link; + u64 *cookie, entry_ip; + unsigned long *addr; + + if (WARN_ON_ONCE(!ctx)) + return 0; + run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx); + link = run_ctx->link; + if (!link->cookies) + return 0; + entry_ip = run_ctx->entry_ip; + addr = bsearch(&entry_ip, link->addrs, link->cnt, sizeof(entry_ip), + __bpf_kprobe_multi_cookie_cmp); + if (!addr) + return 0; + cookie = link->cookies + (addr - link->addrs); + return *cookie; +} + +static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) +{ + struct bpf_kprobe_multi_run_ctx *run_ctx; + + run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx); + return run_ctx->entry_ip; +} + +static int +kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, + unsigned long entry_ip, struct pt_regs *regs) +{ + struct bpf_kprobe_multi_run_ctx run_ctx = { + .link = link, + .entry_ip = entry_ip, + }; + struct bpf_run_ctx *old_run_ctx; + int err; + + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { + err = 0; + goto out; + } + + migrate_disable(); + rcu_read_lock(); + old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); + err = bpf_prog_run(link->link.prog, regs); + bpf_reset_run_ctx(old_run_ctx); + rcu_read_unlock(); + migrate_enable(); + + out: + __this_cpu_dec(bpf_prog_active); + return err; +} + +static void +kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip, + struct pt_regs *regs) +{ + struct bpf_kprobe_multi_link *link; + + link = container_of(fp, struct bpf_kprobe_multi_link, fp); + kprobe_multi_link_prog_run(link, entry_ip, regs); +} + +static int +kprobe_multi_resolve_syms(const void *usyms, u32 cnt, + unsigned long *addrs) +{ + unsigned long addr, size; + const char **syms; + int err = -ENOMEM; + unsigned int i; + char *func; + + size = cnt * sizeof(*syms); + syms = kvzalloc(size, GFP_KERNEL); + if (!syms) + return -ENOMEM; + + func = kmalloc(KSYM_NAME_LEN, GFP_KERNEL); + if (!func) + goto error; + + if (copy_from_user(syms, usyms, size)) { + err = -EFAULT; + goto error; + } + + for (i = 0; i < cnt; i++) { + err = strncpy_from_user(func, syms[i], KSYM_NAME_LEN); + if (err == KSYM_NAME_LEN) + err = -E2BIG; + if (err < 0) + goto error; + err = -EINVAL; + addr = kallsyms_lookup_name(func); + if (!addr) + goto error; + if (!kallsyms_lookup_size_offset(addr, &size, NULL)) + goto error; + addr = ftrace_location_range(addr, addr + size - 1); + if (!addr) + goto error; + addrs[i] = addr; + } + + err = 0; +error: + kvfree(syms); + kfree(func); + return err; +} + +int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_kprobe_multi_link *link = NULL; + struct bpf_link_primer link_primer; + void __user *ucookies; + unsigned long *addrs; + u32 flags, cnt, size; + void __user *uaddrs; + u64 *cookies = NULL; + void __user *usyms; + int err; + + /* no support for 32bit archs yet */ + if (sizeof(u64) != sizeof(void *)) + return -EOPNOTSUPP; + + if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI) + return -EINVAL; + + flags = attr->link_create.kprobe_multi.flags; + if (flags & ~BPF_F_KPROBE_MULTI_RETURN) + return -EINVAL; + + uaddrs = u64_to_user_ptr(attr->link_create.kprobe_multi.addrs); + usyms = u64_to_user_ptr(attr->link_create.kprobe_multi.syms); + if (!!uaddrs == !!usyms) + return -EINVAL; + + cnt = attr->link_create.kprobe_multi.cnt; + if (!cnt) + return -EINVAL; + + size = cnt * sizeof(*addrs); + addrs = kvmalloc(size, GFP_KERNEL); + if (!addrs) + return -ENOMEM; + + if (uaddrs) { + if (copy_from_user(addrs, uaddrs, size)) { + err = -EFAULT; + goto error; + } + } else { + err = kprobe_multi_resolve_syms(usyms, cnt, addrs); + if (err) + goto error; + } + + ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies); + if (ucookies) { + cookies = kvmalloc(size, GFP_KERNEL); + if (!cookies) { + err = -ENOMEM; + goto error; + } + if (copy_from_user(cookies, ucookies, size)) { + err = -EFAULT; + goto error; + } + } + + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) { + err = -ENOMEM; + goto error; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI, + &bpf_kprobe_multi_link_lops, prog); + + err = bpf_link_prime(&link->link, &link_primer); + if (err) + goto error; + + if (flags & BPF_F_KPROBE_MULTI_RETURN) + link->fp.exit_handler = kprobe_multi_link_handler; + else + link->fp.entry_handler = kprobe_multi_link_handler; + + link->addrs = addrs; + link->cookies = cookies; + link->cnt = cnt; + + if (cookies) { + /* + * Sorting addresses will trigger sorting cookies as well + * (check bpf_kprobe_multi_cookie_swap). This way we can + * find cookie based on the address in bpf_get_attach_cookie + * helper. + */ + sort_r(addrs, cnt, sizeof(*addrs), + bpf_kprobe_multi_cookie_cmp, + bpf_kprobe_multi_cookie_swap, + link); + } + + err = register_fprobe_ips(&link->fp, addrs, cnt); + if (err) { + bpf_link_cleanup(&link_primer); + return err; + } + + return bpf_link_settle(&link_primer); + +error: + kfree(link); + kvfree(addrs); + kvfree(cookies); + return err; +} +#else /* !CONFIG_FPROBE */ +int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} +static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx) +{ + return 0; +} +static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx) +{ + return 0; +} +#endif diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c new file mode 100644 index 000000000000..8b2dd5b9dcd1 --- /dev/null +++ b/kernel/trace/fprobe.c @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fprobe - Simple ftrace probe wrapper for function entry. + */ +#define pr_fmt(fmt) "fprobe: " fmt + +#include <linux/err.h> +#include <linux/fprobe.h> +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/rethook.h> +#include <linux/slab.h> +#include <linux/sort.h> + +#include "trace.h" + +struct fprobe_rethook_node { + struct rethook_node node; + unsigned long entry_ip; +}; + +static void fprobe_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct fprobe_rethook_node *fpr; + struct rethook_node *rh; + struct fprobe *fp; + int bit; + + fp = container_of(ops, struct fprobe, ops); + if (fprobe_disabled(fp)) + return; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) { + fp->nmissed++; + return; + } + + if (fp->entry_handler) + fp->entry_handler(fp, ip, ftrace_get_regs(fregs)); + + if (fp->exit_handler) { + rh = rethook_try_get(fp->rethook); + if (!rh) { + fp->nmissed++; + goto out; + } + fpr = container_of(rh, struct fprobe_rethook_node, node); + fpr->entry_ip = ip; + rethook_hook(rh, ftrace_get_regs(fregs), true); + } + +out: + ftrace_test_recursion_unlock(bit); +} +NOKPROBE_SYMBOL(fprobe_handler); + +static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct fprobe *fp = container_of(ops, struct fprobe, ops); + + if (unlikely(kprobe_running())) { + fp->nmissed++; + return; + } + kprobe_busy_begin(); + fprobe_handler(ip, parent_ip, ops, fregs); + kprobe_busy_end(); +} + +static void fprobe_exit_handler(struct rethook_node *rh, void *data, + struct pt_regs *regs) +{ + struct fprobe *fp = (struct fprobe *)data; + struct fprobe_rethook_node *fpr; + + if (!fp || fprobe_disabled(fp)) + return; + + fpr = container_of(rh, struct fprobe_rethook_node, node); + + fp->exit_handler(fp, fpr->entry_ip, regs); +} +NOKPROBE_SYMBOL(fprobe_exit_handler); + +/* Convert ftrace location address from symbols */ +static unsigned long *get_ftrace_locations(const char **syms, int num) +{ + unsigned long addr, size; + unsigned long *addrs; + int i; + + /* Convert symbols to symbol address */ + addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL); + if (!addrs) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < num; i++) { + addr = kallsyms_lookup_name(syms[i]); + if (!addr) /* Maybe wrong symbol */ + goto error; + + /* Convert symbol address to ftrace location. */ + if (!kallsyms_lookup_size_offset(addr, &size, NULL) || !size) + goto error; + + addr = ftrace_location_range(addr, addr + size - 1); + if (!addr) /* No dynamic ftrace there. */ + goto error; + + addrs[i] = addr; + } + + return addrs; + +error: + kfree(addrs); + + return ERR_PTR(-ENOENT); +} + +static void fprobe_init(struct fprobe *fp) +{ + fp->nmissed = 0; + if (fprobe_shared_with_kprobes(fp)) + fp->ops.func = fprobe_kprobe_handler; + else + fp->ops.func = fprobe_handler; + fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS; +} + +static int fprobe_init_rethook(struct fprobe *fp, int num) +{ + int i, size; + + if (num < 0) + return -EINVAL; + + if (!fp->exit_handler) { + fp->rethook = NULL; + return 0; + } + + /* Initialize rethook if needed */ + size = num * num_possible_cpus() * 2; + if (size < 0) + return -E2BIG; + + fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler); + for (i = 0; i < size; i++) { + struct rethook_node *node; + + node = kzalloc(sizeof(struct fprobe_rethook_node), GFP_KERNEL); + if (!node) { + rethook_free(fp->rethook); + fp->rethook = NULL; + return -ENOMEM; + } + rethook_add_node(fp->rethook, node); + } + return 0; +} + +static void fprobe_fail_cleanup(struct fprobe *fp) +{ + if (fp->rethook) { + /* Don't need to cleanup rethook->handler because this is not used. */ + rethook_free(fp->rethook); + fp->rethook = NULL; + } + ftrace_free_filter(&fp->ops); +} + +/** + * register_fprobe() - Register fprobe to ftrace by pattern. + * @fp: A fprobe data structure to be registered. + * @filter: A wildcard pattern of probed symbols. + * @notfilter: A wildcard pattern of NOT probed symbols. + * + * Register @fp to ftrace for enabling the probe on the symbols matched to @filter. + * If @notfilter is not NULL, the symbols matched the @notfilter are not probed. + * + * Return 0 if @fp is registered successfully, -errno if not. + */ +int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter) +{ + struct ftrace_hash *hash; + unsigned char *str; + int ret, len; + + if (!fp || !filter) + return -EINVAL; + + fprobe_init(fp); + + len = strlen(filter); + str = kstrdup(filter, GFP_KERNEL); + ret = ftrace_set_filter(&fp->ops, str, len, 0); + kfree(str); + if (ret) + return ret; + + if (notfilter) { + len = strlen(notfilter); + str = kstrdup(notfilter, GFP_KERNEL); + ret = ftrace_set_notrace(&fp->ops, str, len, 0); + kfree(str); + if (ret) + goto out; + } + + /* TODO: + * correctly calculate the total number of filtered symbols + * from both filter and notfilter. + */ + hash = fp->ops.local_hash.filter_hash; + if (WARN_ON_ONCE(!hash)) + goto out; + + ret = fprobe_init_rethook(fp, (int)hash->count); + if (!ret) + ret = register_ftrace_function(&fp->ops); + +out: + if (ret) + fprobe_fail_cleanup(fp); + return ret; +} +EXPORT_SYMBOL_GPL(register_fprobe); + +/** + * register_fprobe_ips() - Register fprobe to ftrace by address. + * @fp: A fprobe data structure to be registered. + * @addrs: An array of target ftrace location addresses. + * @num: The number of entries of @addrs. + * + * Register @fp to ftrace for enabling the probe on the address given by @addrs. + * The @addrs must be the addresses of ftrace location address, which may be + * the symbol address + arch-dependent offset. + * If you unsure what this mean, please use other registration functions. + * + * Return 0 if @fp is registered successfully, -errno if not. + */ +int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) +{ + int ret; + + if (!fp || !addrs || num <= 0) + return -EINVAL; + + fprobe_init(fp); + + ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0); + if (ret) + return ret; + + ret = fprobe_init_rethook(fp, num); + if (!ret) + ret = register_ftrace_function(&fp->ops); + + if (ret) + fprobe_fail_cleanup(fp); + return ret; +} +EXPORT_SYMBOL_GPL(register_fprobe_ips); + +/** + * register_fprobe_syms() - Register fprobe to ftrace by symbols. + * @fp: A fprobe data structure to be registered. + * @syms: An array of target symbols. + * @num: The number of entries of @syms. + * + * Register @fp to the symbols given by @syms array. This will be useful if + * you are sure the symbols exist in the kernel. + * + * Return 0 if @fp is registered successfully, -errno if not. + */ +int register_fprobe_syms(struct fprobe *fp, const char **syms, int num) +{ + unsigned long *addrs; + int ret; + + if (!fp || !syms || num <= 0) + return -EINVAL; + + addrs = get_ftrace_locations(syms, num); + if (IS_ERR(addrs)) + return PTR_ERR(addrs); + + ret = register_fprobe_ips(fp, addrs, num); + + kfree(addrs); + + return ret; +} +EXPORT_SYMBOL_GPL(register_fprobe_syms); + +/** + * unregister_fprobe() - Unregister fprobe from ftrace + * @fp: A fprobe data structure to be unregistered. + * + * Unregister fprobe (and remove ftrace hooks from the function entries). + * + * Return 0 if @fp is unregistered successfully, -errno if not. + */ +int unregister_fprobe(struct fprobe *fp) +{ + int ret; + + if (!fp || fp->ops.func != fprobe_handler) + return -EINVAL; + + /* + * rethook_free() starts disabling the rethook, but the rethook handlers + * may be running on other processors at this point. To make sure that all + * current running handlers are finished, call unregister_ftrace_function() + * after this. + */ + if (fp->rethook) + rethook_free(fp->rethook); + + ret = unregister_ftrace_function(&fp->ops); + if (ret < 0) + return ret; + + ftrace_free_filter(&fp->ops); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_fprobe); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6105b7036482..4214d6a69e60 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4958,7 +4958,7 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, } static int -ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) +__ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) { struct ftrace_func_entry *entry; @@ -4977,8 +4977,29 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) } static int +ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips, + unsigned int cnt, int remove) +{ + unsigned int i; + int err; + + for (i = 0; i < cnt; i++) { + err = __ftrace_match_addr(hash, ips[i], remove); + if (err) { + /* + * This expects the @hash is a temporary hash and if this + * fails the caller must free the @hash. + */ + return err; + } + } + return 0; +} + +static int ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, - unsigned long ip, int remove, int reset, int enable) + unsigned long *ips, unsigned int cnt, + int remove, int reset, int enable) { struct ftrace_hash **orig_hash; struct ftrace_hash *hash; @@ -5008,8 +5029,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, ret = -EINVAL; goto out_regex_unlock; } - if (ip) { - ret = ftrace_match_addr(hash, ip, remove); + if (ips) { + ret = ftrace_match_addr(hash, ips, cnt, remove); if (ret < 0) goto out_regex_unlock; } @@ -5026,10 +5047,10 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, } static int -ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, - int reset, int enable) +ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt, + int remove, int reset, int enable) { - return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable); + return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable); } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS @@ -5634,11 +5655,30 @@ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, int remove, int reset) { ftrace_ops_init(ops); - return ftrace_set_addr(ops, ip, remove, reset, 1); + return ftrace_set_addr(ops, &ip, 1, remove, reset, 1); } EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); /** + * ftrace_set_filter_ips - set functions to filter on in ftrace by addresses + * @ops - the ops to set the filter with + * @ips - the array of addresses to add to or remove from the filter. + * @cnt - the number of addresses in @ips + * @remove - non zero to remove ips from the filter + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled + * If @ips array or any ip specified within is NULL , it fails to update filter. + */ +int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips, + unsigned int cnt, int remove, int reset) +{ + ftrace_ops_init(ops); + return ftrace_set_addr(ops, ips, cnt, remove, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_filter_ips); + +/** * ftrace_ops_set_global_filter - setup ops to use global filters * @ops - the ops which will use the global filters * @@ -5659,7 +5699,7 @@ static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable) { - return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable); + return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable); } /** diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c new file mode 100644 index 000000000000..ab463a4d2b23 --- /dev/null +++ b/kernel/trace/rethook.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) "rethook: " fmt + +#include <linux/bug.h> +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/preempt.h> +#include <linux/rethook.h> +#include <linux/slab.h> +#include <linux/sort.h> + +/* Return hook list (shadow stack by list) */ + +/* + * This function is called from delayed_put_task_struct() when a task is + * dead and cleaned up to recycle any kretprobe instances associated with + * this task. These left over instances represent probed functions that + * have been called but will never return. + */ +void rethook_flush_task(struct task_struct *tk) +{ + struct rethook_node *rhn; + struct llist_node *node; + + node = __llist_del_all(&tk->rethooks); + while (node) { + rhn = container_of(node, struct rethook_node, llist); + node = node->next; + preempt_disable(); + rethook_recycle(rhn); + preempt_enable(); + } +} + +static void rethook_free_rcu(struct rcu_head *head) +{ + struct rethook *rh = container_of(head, struct rethook, rcu); + struct rethook_node *rhn; + struct freelist_node *node; + int count = 1; + + node = rh->pool.head; + while (node) { + rhn = container_of(node, struct rethook_node, freelist); + node = node->next; + kfree(rhn); + count++; + } + + /* The rh->ref is the number of pooled node + 1 */ + if (refcount_sub_and_test(count, &rh->ref)) + kfree(rh); +} + +/** + * rethook_free() - Free struct rethook. + * @rh: the struct rethook to be freed. + * + * Free the rethook. Before calling this function, user must ensure the + * @rh::data is cleaned if needed (or, the handler can access it after + * calling this function.) This function will set the @rh to be freed + * after all rethook_node are freed (not soon). And the caller must + * not touch @rh after calling this. + */ +void rethook_free(struct rethook *rh) +{ + rcu_assign_pointer(rh->handler, NULL); + + call_rcu(&rh->rcu, rethook_free_rcu); +} + +/** + * rethook_alloc() - Allocate struct rethook. + * @data: a data to pass the @handler when hooking the return. + * @handler: the return hook callback function. + * + * Allocate and initialize a new rethook with @data and @handler. + * Return NULL if memory allocation fails or @handler is NULL. + * Note that @handler == NULL means this rethook is going to be freed. + */ +struct rethook *rethook_alloc(void *data, rethook_handler_t handler) +{ + struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL); + + if (!rh || !handler) + return NULL; + + rh->data = data; + rh->handler = handler; + rh->pool.head = NULL; + refcount_set(&rh->ref, 1); + + return rh; +} + +/** + * rethook_add_node() - Add a new node to the rethook. + * @rh: the struct rethook. + * @node: the struct rethook_node to be added. + * + * Add @node to @rh. User must allocate @node (as a part of user's + * data structure.) The @node fields are initialized in this function. + */ +void rethook_add_node(struct rethook *rh, struct rethook_node *node) +{ + node->rethook = rh; + freelist_add(&node->freelist, &rh->pool); + refcount_inc(&rh->ref); +} + +static void free_rethook_node_rcu(struct rcu_head *head) +{ + struct rethook_node *node = container_of(head, struct rethook_node, rcu); + + if (refcount_dec_and_test(&node->rethook->ref)) + kfree(node->rethook); + kfree(node); +} + +/** + * rethook_recycle() - return the node to rethook. + * @node: The struct rethook_node to be returned. + * + * Return back the @node to @node::rethook. If the @node::rethook is already + * marked as freed, this will free the @node. + */ +void rethook_recycle(struct rethook_node *node) +{ + lockdep_assert_preemption_disabled(); + + if (likely(READ_ONCE(node->rethook->handler))) + freelist_add(&node->freelist, &node->rethook->pool); + else + call_rcu(&node->rcu, free_rethook_node_rcu); +} +NOKPROBE_SYMBOL(rethook_recycle); + +/** + * rethook_try_get() - get an unused rethook node. + * @rh: The struct rethook which pools the nodes. + * + * Get an unused rethook node from @rh. If the node pool is empty, this + * will return NULL. Caller must disable preemption. + */ +struct rethook_node *rethook_try_get(struct rethook *rh) +{ + rethook_handler_t handler = READ_ONCE(rh->handler); + struct freelist_node *fn; + + lockdep_assert_preemption_disabled(); + + /* Check whether @rh is going to be freed. */ + if (unlikely(!handler)) + return NULL; + + fn = freelist_try_get(&rh->pool); + if (!fn) + return NULL; + + return container_of(fn, struct rethook_node, freelist); +} +NOKPROBE_SYMBOL(rethook_try_get); + +/** + * rethook_hook() - Hook the current function return. + * @node: The struct rethook node to hook the function return. + * @regs: The struct pt_regs for the function entry. + * @mcount: True if this is called from mcount(ftrace) context. + * + * Hook the current running function return. This must be called when the + * function entry (or at least @regs must be the registers of the function + * entry.) @mcount is used for identifying the context. If this is called + * from ftrace (mcount) callback, @mcount must be set true. If this is called + * from the real function entry (e.g. kprobes) @mcount must be set false. + * This is because the way to hook the function return depends on the context. + */ +void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount) +{ + arch_rethook_prepare(node, regs, mcount); + __llist_add(&node->llist, ¤t->rethooks); +} +NOKPROBE_SYMBOL(rethook_hook); + +/* This assumes the 'tsk' is the current task or is not running. */ +static unsigned long __rethook_find_ret_addr(struct task_struct *tsk, + struct llist_node **cur) +{ + struct rethook_node *rh = NULL; + struct llist_node *node = *cur; + + if (!node) + node = tsk->rethooks.first; + else + node = node->next; + + while (node) { + rh = container_of(node, struct rethook_node, llist); + if (rh->ret_addr != (unsigned long)arch_rethook_trampoline) { + *cur = node; + return rh->ret_addr; + } + node = node->next; + } + return 0; +} +NOKPROBE_SYMBOL(__rethook_find_ret_addr); + +/** + * rethook_find_ret_addr -- Find correct return address modified by rethook + * @tsk: Target task + * @frame: A frame pointer + * @cur: a storage of the loop cursor llist_node pointer for next call + * + * Find the correct return address modified by a rethook on @tsk in unsigned + * long type. + * The @tsk must be 'current' or a task which is not running. @frame is a hint + * to get the currect return address - which is compared with the + * rethook::frame field. The @cur is a loop cursor for searching the + * kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the + * first call, but '@cur' itself must NOT NULL. + * + * Returns found address value or zero if not found. + */ +unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame, + struct llist_node **cur) +{ + struct rethook_node *rhn = NULL; + unsigned long ret; + + if (WARN_ON_ONCE(!cur)) + return 0; + + if (WARN_ON_ONCE(tsk != current && task_is_running(tsk))) + return 0; + + do { + ret = __rethook_find_ret_addr(tsk, cur); + if (!ret) + break; + rhn = container_of(*cur, struct rethook_node, llist); + } while (rhn->frame != frame); + + return ret; +} +NOKPROBE_SYMBOL(rethook_find_ret_addr); + +void __weak arch_rethook_fixup_return(struct pt_regs *regs, + unsigned long correct_ret_addr) +{ + /* + * Do nothing by default. If the architecture which uses a + * frame pointer to record real return address on the stack, + * it should fill this function to fixup the return address + * so that stacktrace works from the rethook handler. + */ +} + +/* This function will be called from each arch-defined trampoline. */ +unsigned long rethook_trampoline_handler(struct pt_regs *regs, + unsigned long frame) +{ + struct llist_node *first, *node = NULL; + unsigned long correct_ret_addr; + rethook_handler_t handler; + struct rethook_node *rhn; + + correct_ret_addr = __rethook_find_ret_addr(current, &node); + if (!correct_ret_addr) { + pr_err("rethook: Return address not found! Maybe there is a bug in the kernel\n"); + BUG_ON(1); + } + + instruction_pointer_set(regs, correct_ret_addr); + + /* + * These loops must be protected from rethook_free_rcu() because those + * are accessing 'rhn->rethook'. + */ + preempt_disable(); + + /* + * Run the handler on the shadow stack. Do not unlink the list here because + * stackdump inside the handlers needs to decode it. + */ + first = current->rethooks.first; + while (first) { + rhn = container_of(first, struct rethook_node, llist); + if (WARN_ON_ONCE(rhn->frame != frame)) + break; + handler = READ_ONCE(rhn->rethook->handler); + if (handler) + handler(rhn, rhn->rethook->data, regs); + + if (first == node) + break; + first = first->next; + } + + /* Fixup registers for returning to correct address. */ + arch_rethook_fixup_return(regs, correct_ret_addr); + + /* Unlink used shadow stack */ + first = current->rethooks.first; + current->rethooks.first = node->next; + node->next = NULL; + + while (first) { + rhn = container_of(first, struct rethook_node, llist); + first = first->next; + rethook_recycle(rhn); + } + preempt_enable(); + + return correct_ret_addr; +} +NOKPROBE_SYMBOL(rethook_trampoline_handler); |